1/* 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <assert.h> 12 13#include "./vpx_config.h" 14#include "./vp9_rtcd.h" 15#include "vpx_ports/mem.h" 16 17typedef void filter8_1dfunction ( 18 const unsigned char *src_ptr, 19 const ptrdiff_t src_pitch, 20 unsigned char *output_ptr, 21 ptrdiff_t out_pitch, 22 unsigned int output_height, 23 const short *filter 24); 25 26#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ 27 void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ 28 uint8_t *dst, ptrdiff_t dst_stride, \ 29 const int16_t *filter_x, int x_step_q4, \ 30 const int16_t *filter_y, int y_step_q4, \ 31 int w, int h) { \ 32 if (step_q4 == 16 && filter[3] != 128) { \ 33 if (filter[0] || filter[1] || filter[2]) { \ 34 while (w >= 16) { \ 35 vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \ 36 src_stride, \ 37 dst, \ 38 dst_stride, \ 39 h, \ 40 filter); \ 41 src += 16; \ 42 dst += 16; \ 43 w -= 16; \ 44 } \ 45 while (w >= 8) { \ 46 vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \ 47 src_stride, \ 48 dst, \ 49 dst_stride, \ 50 h, \ 51 filter); \ 52 src += 8; \ 53 dst += 8; \ 54 w -= 8; \ 55 } \ 56 while (w >= 4) { \ 57 vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \ 58 src_stride, \ 59 dst, \ 60 dst_stride, \ 61 h, \ 62 filter); \ 63 src += 4; \ 64 dst += 4; \ 65 w -= 4; \ 66 } \ 67 } else { \ 68 while (w >= 16) { \ 69 vp9_filter_block1d16_##dir##2_##avg##opt(src, \ 70 src_stride, \ 71 dst, \ 72 dst_stride, \ 73 h, \ 74 filter); \ 75 src += 16; \ 76 dst += 16; \ 77 w -= 16; \ 78 } \ 79 while (w >= 8) { \ 80 vp9_filter_block1d8_##dir##2_##avg##opt(src, \ 81 src_stride, \ 82 dst, \ 83 dst_stride, \ 84 h, \ 85 filter); \ 86 src += 8; \ 87 dst += 8; \ 88 w -= 8; \ 89 } \ 90 while (w >= 4) { \ 91 vp9_filter_block1d4_##dir##2_##avg##opt(src, \ 92 src_stride, \ 93 dst, \ 94 dst_stride, \ 95 h, \ 96 filter); \ 97 src += 4; \ 98 dst += 4; \ 99 w -= 4; \ 100 } \ 101 } \ 102 } \ 103 if (w) { \ 104 vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ 105 filter_x, x_step_q4, filter_y, y_step_q4, \ 106 w, h); \ 107 } \ 108} 109 110#define FUN_CONV_2D(avg, opt) \ 111void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ 112 uint8_t *dst, ptrdiff_t dst_stride, \ 113 const int16_t *filter_x, int x_step_q4, \ 114 const int16_t *filter_y, int y_step_q4, \ 115 int w, int h) { \ 116 assert(w <= 64); \ 117 assert(h <= 64); \ 118 if (x_step_q4 == 16 && y_step_q4 == 16) { \ 119 if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ 120 filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ 121 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \ 122 vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ 123 filter_x, x_step_q4, filter_y, y_step_q4, \ 124 w, h + 7); \ 125 vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ 126 filter_x, x_step_q4, filter_y, \ 127 y_step_q4, w, h); \ 128 } else { \ 129 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 65); \ 130 vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ 131 filter_x, x_step_q4, filter_y, y_step_q4, \ 132 w, h + 1); \ 133 vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ 134 filter_x, x_step_q4, filter_y, \ 135 y_step_q4, w, h); \ 136 } \ 137 } else { \ 138 vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ 139 filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ 140 } \ 141} 142#if HAVE_AVX2 143filter8_1dfunction vp9_filter_block1d16_v8_avx2; 144filter8_1dfunction vp9_filter_block1d16_h8_avx2; 145filter8_1dfunction vp9_filter_block1d4_v8_ssse3; 146#if (ARCH_X86_64) 147filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; 148filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; 149filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; 150#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3 151#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3 152#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3 153#else 154filter8_1dfunction vp9_filter_block1d8_v8_ssse3; 155filter8_1dfunction vp9_filter_block1d8_h8_ssse3; 156filter8_1dfunction vp9_filter_block1d4_h8_ssse3; 157#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 158#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 159#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 160#endif 161filter8_1dfunction vp9_filter_block1d16_v2_ssse3; 162filter8_1dfunction vp9_filter_block1d16_h2_ssse3; 163filter8_1dfunction vp9_filter_block1d8_v2_ssse3; 164filter8_1dfunction vp9_filter_block1d8_h2_ssse3; 165filter8_1dfunction vp9_filter_block1d4_v2_ssse3; 166filter8_1dfunction vp9_filter_block1d4_h2_ssse3; 167#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 168#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 169#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 170#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3 171#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3 172#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3 173#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3 174// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, 175// uint8_t *dst, ptrdiff_t dst_stride, 176// const int16_t *filter_x, int x_step_q4, 177// const int16_t *filter_y, int y_step_q4, 178// int w, int h); 179// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, 180// uint8_t *dst, ptrdiff_t dst_stride, 181// const int16_t *filter_x, int x_step_q4, 182// const int16_t *filter_y, int y_step_q4, 183// int w, int h); 184FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); 185FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); 186 187// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, 188// uint8_t *dst, ptrdiff_t dst_stride, 189// const int16_t *filter_x, int x_step_q4, 190// const int16_t *filter_y, int y_step_q4, 191// int w, int h); 192FUN_CONV_2D(, avx2); 193#endif 194#if HAVE_SSSE3 195#if (ARCH_X86_64) 196filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; 197filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; 198filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; 199filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; 200filter8_1dfunction vp9_filter_block1d4_v8_ssse3; 201filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; 202#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3 203#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3 204#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3 205#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3 206#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3 207#else 208filter8_1dfunction vp9_filter_block1d16_v8_ssse3; 209filter8_1dfunction vp9_filter_block1d16_h8_ssse3; 210filter8_1dfunction vp9_filter_block1d8_v8_ssse3; 211filter8_1dfunction vp9_filter_block1d8_h8_ssse3; 212filter8_1dfunction vp9_filter_block1d4_v8_ssse3; 213filter8_1dfunction vp9_filter_block1d4_h8_ssse3; 214#endif 215filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; 216filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; 217filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; 218filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; 219filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; 220filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; 221 222filter8_1dfunction vp9_filter_block1d16_v2_ssse3; 223filter8_1dfunction vp9_filter_block1d16_h2_ssse3; 224filter8_1dfunction vp9_filter_block1d8_v2_ssse3; 225filter8_1dfunction vp9_filter_block1d8_h2_ssse3; 226filter8_1dfunction vp9_filter_block1d4_v2_ssse3; 227filter8_1dfunction vp9_filter_block1d4_h2_ssse3; 228filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; 229filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; 230filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; 231filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; 232filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; 233filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; 234 235// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, 236// uint8_t *dst, ptrdiff_t dst_stride, 237// const int16_t *filter_x, int x_step_q4, 238// const int16_t *filter_y, int y_step_q4, 239// int w, int h); 240// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, 241// uint8_t *dst, ptrdiff_t dst_stride, 242// const int16_t *filter_x, int x_step_q4, 243// const int16_t *filter_y, int y_step_q4, 244// int w, int h); 245// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, 246// uint8_t *dst, ptrdiff_t dst_stride, 247// const int16_t *filter_x, int x_step_q4, 248// const int16_t *filter_y, int y_step_q4, 249// int w, int h); 250// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, 251// uint8_t *dst, ptrdiff_t dst_stride, 252// const int16_t *filter_x, int x_step_q4, 253// const int16_t *filter_y, int y_step_q4, 254// int w, int h); 255FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); 256FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); 257FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); 258FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, 259 ssse3); 260 261// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, 262// uint8_t *dst, ptrdiff_t dst_stride, 263// const int16_t *filter_x, int x_step_q4, 264// const int16_t *filter_y, int y_step_q4, 265// int w, int h); 266// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, 267// uint8_t *dst, ptrdiff_t dst_stride, 268// const int16_t *filter_x, int x_step_q4, 269// const int16_t *filter_y, int y_step_q4, 270// int w, int h); 271FUN_CONV_2D(, ssse3); 272FUN_CONV_2D(avg_ , ssse3); 273#endif 274 275#if HAVE_SSE2 276filter8_1dfunction vp9_filter_block1d16_v8_sse2; 277filter8_1dfunction vp9_filter_block1d16_h8_sse2; 278filter8_1dfunction vp9_filter_block1d8_v8_sse2; 279filter8_1dfunction vp9_filter_block1d8_h8_sse2; 280filter8_1dfunction vp9_filter_block1d4_v8_sse2; 281filter8_1dfunction vp9_filter_block1d4_h8_sse2; 282filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2; 283filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2; 284filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2; 285filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2; 286filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2; 287filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2; 288 289filter8_1dfunction vp9_filter_block1d16_v2_sse2; 290filter8_1dfunction vp9_filter_block1d16_h2_sse2; 291filter8_1dfunction vp9_filter_block1d8_v2_sse2; 292filter8_1dfunction vp9_filter_block1d8_h2_sse2; 293filter8_1dfunction vp9_filter_block1d4_v2_sse2; 294filter8_1dfunction vp9_filter_block1d4_h2_sse2; 295filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2; 296filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2; 297filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2; 298filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2; 299filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2; 300filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; 301 302// void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, 303// uint8_t *dst, ptrdiff_t dst_stride, 304// const int16_t *filter_x, int x_step_q4, 305// const int16_t *filter_y, int y_step_q4, 306// int w, int h); 307// void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, 308// uint8_t *dst, ptrdiff_t dst_stride, 309// const int16_t *filter_x, int x_step_q4, 310// const int16_t *filter_y, int y_step_q4, 311// int w, int h); 312// void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, 313// uint8_t *dst, ptrdiff_t dst_stride, 314// const int16_t *filter_x, int x_step_q4, 315// const int16_t *filter_y, int y_step_q4, 316// int w, int h); 317// void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, 318// uint8_t *dst, ptrdiff_t dst_stride, 319// const int16_t *filter_x, int x_step_q4, 320// const int16_t *filter_y, int y_step_q4, 321// int w, int h); 322FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); 323FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); 324FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); 325FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); 326 327// void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, 328// uint8_t *dst, ptrdiff_t dst_stride, 329// const int16_t *filter_x, int x_step_q4, 330// const int16_t *filter_y, int y_step_q4, 331// int w, int h); 332// void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, 333// uint8_t *dst, ptrdiff_t dst_stride, 334// const int16_t *filter_x, int x_step_q4, 335// const int16_t *filter_y, int y_step_q4, 336// int w, int h); 337FUN_CONV_2D(, sse2); 338FUN_CONV_2D(avg_ , sse2); 339#endif 340