1/* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "vpx_config.h" 12#include "vp8_rtcd.h" 13#include "vpx_ports/mem.h" 14#include "filter_x86.h" 15 16extern const short vp8_six_tap_x86[8][6 * 8]; 17 18extern void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr, 19 unsigned short *output_ptr, 20 unsigned int src_pixels_per_line, 21 unsigned int pixel_step, 22 unsigned int output_height, 23 unsigned int output_width, 24 const short *vp8_filter); 25extern void vp8_filter_block1dc_v6_mmx( 26 unsigned short *src_ptr, unsigned char *output_ptr, int output_pitch, 27 unsigned int pixels_per_line, unsigned int pixel_step, 28 unsigned int output_height, unsigned int output_width, 29 const short *vp8_filter); 30extern void vp8_filter_block1d8_h6_sse2(unsigned char *src_ptr, 31 unsigned short *output_ptr, 32 unsigned int src_pixels_per_line, 33 unsigned int pixel_step, 34 unsigned int output_height, 35 unsigned int output_width, 36 const short *vp8_filter); 37extern void vp8_filter_block1d16_h6_sse2(unsigned char *src_ptr, 38 unsigned short *output_ptr, 39 unsigned int src_pixels_per_line, 40 unsigned int pixel_step, 41 unsigned int output_height, 42 unsigned int output_width, 43 const short *vp8_filter); 44extern void vp8_filter_block1d8_v6_sse2( 45 unsigned short *src_ptr, unsigned char *output_ptr, int dst_ptich, 46 unsigned int pixels_per_line, unsigned int pixel_step, 47 unsigned int output_height, unsigned int output_width, 48 const short *vp8_filter); 49extern void vp8_filter_block1d16_v6_sse2( 50 unsigned short *src_ptr, unsigned char *output_ptr, int dst_ptich, 51 unsigned int pixels_per_line, unsigned int pixel_step, 52 unsigned int output_height, unsigned int output_width, 53 const short *vp8_filter); 54extern void vp8_unpack_block1d16_h6_sse2(unsigned char *src_ptr, 55 unsigned short *output_ptr, 56 unsigned int src_pixels_per_line, 57 unsigned int output_height, 58 unsigned int output_width); 59extern void vp8_filter_block1d8_h6_only_sse2(unsigned char *src_ptr, 60 unsigned int src_pixels_per_line, 61 unsigned char *output_ptr, 62 int dst_ptich, 63 unsigned int output_height, 64 const short *vp8_filter); 65extern void vp8_filter_block1d16_h6_only_sse2(unsigned char *src_ptr, 66 unsigned int src_pixels_per_line, 67 unsigned char *output_ptr, 68 int dst_ptich, 69 unsigned int output_height, 70 const short *vp8_filter); 71extern void vp8_filter_block1d8_v6_only_sse2(unsigned char *src_ptr, 72 unsigned int src_pixels_per_line, 73 unsigned char *output_ptr, 74 int dst_ptich, 75 unsigned int output_height, 76 const short *vp8_filter); 77 78#if HAVE_MMX 79void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, 80 int xoffset, int yoffset, unsigned char *dst_ptr, 81 int dst_pitch) { 82 DECLARE_ALIGNED(16, unsigned short, 83 FData2[16 * 16]); /* Temp data bufffer used in filtering */ 84 const short *HFilter, *VFilter; 85 HFilter = vp8_six_tap_x86[xoffset]; 86 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, 87 src_pixels_per_line, 1, 9, 8, HFilter); 88 VFilter = vp8_six_tap_x86[yoffset]; 89 vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4, 90 VFilter); 91} 92#endif 93 94#if HAVE_SSE2 95void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, 96 int src_pixels_per_line, int xoffset, 97 int yoffset, unsigned char *dst_ptr, 98 int dst_pitch 99 100 ) { 101 DECLARE_ALIGNED(16, unsigned short, 102 FData2[24 * 24]); /* Temp data bufffer used in filtering */ 103 104 const short *HFilter, *VFilter; 105 106 if (xoffset) { 107 if (yoffset) { 108 HFilter = vp8_six_tap_x86[xoffset]; 109 vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, 110 src_pixels_per_line, 1, 21, 32, HFilter); 111 VFilter = vp8_six_tap_x86[yoffset]; 112 vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, 113 dst_pitch, VFilter); 114 } else { 115 /* First-pass only */ 116 HFilter = vp8_six_tap_x86[xoffset]; 117 vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, 118 dst_pitch, 16, HFilter); 119 } 120 } else { 121 /* Second-pass only */ 122 VFilter = vp8_six_tap_x86[yoffset]; 123 vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, 124 src_pixels_per_line, 21, 32); 125 vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, 126 dst_pitch, VFilter); 127 } 128} 129 130void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, 131 int xoffset, int yoffset, 132 unsigned char *dst_ptr, int dst_pitch) { 133 DECLARE_ALIGNED(16, unsigned short, 134 FData2[256]); /* Temp data bufffer used in filtering */ 135 const short *HFilter, *VFilter; 136 137 if (xoffset) { 138 if (yoffset) { 139 HFilter = vp8_six_tap_x86[xoffset]; 140 vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, 141 src_pixels_per_line, 1, 13, 16, HFilter); 142 VFilter = vp8_six_tap_x86[yoffset]; 143 vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8, 144 dst_pitch, VFilter); 145 } else { 146 /* First-pass only */ 147 HFilter = vp8_six_tap_x86[xoffset]; 148 vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, 149 dst_pitch, 8, HFilter); 150 } 151 } else { 152 /* Second-pass only */ 153 VFilter = vp8_six_tap_x86[yoffset]; 154 vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), 155 src_pixels_per_line, dst_ptr, dst_pitch, 8, 156 VFilter); 157 } 158} 159 160void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, 161 int xoffset, int yoffset, 162 unsigned char *dst_ptr, int dst_pitch) { 163 DECLARE_ALIGNED(16, unsigned short, 164 FData2[256]); /* Temp data bufffer used in filtering */ 165 const short *HFilter, *VFilter; 166 167 if (xoffset) { 168 if (yoffset) { 169 HFilter = vp8_six_tap_x86[xoffset]; 170 vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, 171 src_pixels_per_line, 1, 9, 16, HFilter); 172 VFilter = vp8_six_tap_x86[yoffset]; 173 vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4, 174 dst_pitch, VFilter); 175 } else { 176 /* First-pass only */ 177 HFilter = vp8_six_tap_x86[xoffset]; 178 vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, 179 dst_pitch, 4, HFilter); 180 } 181 } else { 182 /* Second-pass only */ 183 VFilter = vp8_six_tap_x86[yoffset]; 184 vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), 185 src_pixels_per_line, dst_ptr, dst_pitch, 4, 186 VFilter); 187 } 188} 189 190#endif 191 192#if HAVE_SSSE3 193 194extern void vp8_filter_block1d8_h6_ssse3(unsigned char *src_ptr, 195 unsigned int src_pixels_per_line, 196 unsigned char *output_ptr, 197 unsigned int output_pitch, 198 unsigned int output_height, 199 unsigned int vp8_filter_index); 200 201extern void vp8_filter_block1d16_h6_ssse3(unsigned char *src_ptr, 202 unsigned int src_pixels_per_line, 203 unsigned char *output_ptr, 204 unsigned int output_pitch, 205 unsigned int output_height, 206 unsigned int vp8_filter_index); 207 208extern void vp8_filter_block1d16_v6_ssse3(unsigned char *src_ptr, 209 unsigned int src_pitch, 210 unsigned char *output_ptr, 211 unsigned int out_pitch, 212 unsigned int output_height, 213 unsigned int vp8_filter_index); 214 215extern void vp8_filter_block1d8_v6_ssse3(unsigned char *src_ptr, 216 unsigned int src_pitch, 217 unsigned char *output_ptr, 218 unsigned int out_pitch, 219 unsigned int output_height, 220 unsigned int vp8_filter_index); 221 222extern void vp8_filter_block1d4_h6_ssse3(unsigned char *src_ptr, 223 unsigned int src_pixels_per_line, 224 unsigned char *output_ptr, 225 unsigned int output_pitch, 226 unsigned int output_height, 227 unsigned int vp8_filter_index); 228 229extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr, 230 unsigned int src_pitch, 231 unsigned char *output_ptr, 232 unsigned int out_pitch, 233 unsigned int output_height, 234 unsigned int vp8_filter_index); 235 236void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, 237 int src_pixels_per_line, int xoffset, 238 int yoffset, unsigned char *dst_ptr, 239 int dst_pitch 240 241 ) { 242 DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]); 243 244 if (xoffset) { 245 if (yoffset) { 246 vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), 247 src_pixels_per_line, FData2, 16, 21, 248 xoffset); 249 vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch, 16, 250 yoffset); 251 } else { 252 /* First-pass only */ 253 vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, 254 dst_pitch, 16, xoffset); 255 } 256 } else { 257 if (yoffset) { 258 /* Second-pass only */ 259 vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line), 260 src_pixels_per_line, dst_ptr, dst_pitch, 16, 261 yoffset); 262 } else { 263 /* ssse3 second-pass only function couldn't handle (xoffset==0 && 264 * yoffset==0) case correctly. Add copy function here to guarantee 265 * six-tap function handles all possible offsets. */ 266 vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); 267 } 268 } 269} 270 271void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, 272 int src_pixels_per_line, int xoffset, 273 int yoffset, unsigned char *dst_ptr, 274 int dst_pitch) { 275 DECLARE_ALIGNED(16, unsigned char, FData2[256]); 276 277 if (xoffset) { 278 if (yoffset) { 279 vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), 280 src_pixels_per_line, FData2, 8, 13, xoffset); 281 vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset); 282 } else { 283 vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, 284 dst_pitch, 8, xoffset); 285 } 286 } else { 287 if (yoffset) { 288 /* Second-pass only */ 289 vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), 290 src_pixels_per_line, dst_ptr, dst_pitch, 8, 291 yoffset); 292 } else { 293 /* ssse3 second-pass only function couldn't handle (xoffset==0 && 294 * yoffset==0) case correctly. Add copy function here to guarantee 295 * six-tap function handles all possible offsets. */ 296 vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); 297 } 298 } 299} 300 301void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, 302 int src_pixels_per_line, int xoffset, 303 int yoffset, unsigned char *dst_ptr, 304 int dst_pitch) { 305 DECLARE_ALIGNED(16, unsigned char, FData2[256]); 306 307 if (xoffset) { 308 if (yoffset) { 309 vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), 310 src_pixels_per_line, FData2, 8, 9, xoffset); 311 vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset); 312 } else { 313 /* First-pass only */ 314 vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, 315 dst_pitch, 4, xoffset); 316 } 317 } else { 318 if (yoffset) { 319 /* Second-pass only */ 320 vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), 321 src_pixels_per_line, dst_ptr, dst_pitch, 4, 322 yoffset); 323 } else { 324 /* ssse3 second-pass only function couldn't handle (xoffset==0 && 325 * yoffset==0) case correctly. Add copy function here to guarantee 326 * six-tap function handles all possible offsets. */ 327 vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); 328 } 329 } 330} 331 332void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, 333 int src_pixels_per_line, int xoffset, 334 int yoffset, unsigned char *dst_ptr, 335 int dst_pitch) { 336 DECLARE_ALIGNED(16, unsigned char, FData2[4 * 9]); 337 338 if (xoffset) { 339 if (yoffset) { 340 vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), 341 src_pixels_per_line, FData2, 4, 9, xoffset); 342 vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset); 343 } else { 344 vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, 345 dst_pitch, 4, xoffset); 346 } 347 } else { 348 if (yoffset) { 349 vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), 350 src_pixels_per_line, dst_ptr, dst_pitch, 4, 351 yoffset); 352 } else { 353 /* ssse3 second-pass only function couldn't handle (xoffset==0 && 354 * yoffset==0) case correctly. Add copy function here to guarantee 355 * six-tap function handles all possible offsets. */ 356 int r; 357 358 for (r = 0; r < 4; ++r) { 359 dst_ptr[0] = src_ptr[0]; 360 dst_ptr[1] = src_ptr[1]; 361 dst_ptr[2] = src_ptr[2]; 362 dst_ptr[3] = src_ptr[3]; 363 dst_ptr += dst_pitch; 364 src_ptr += src_pixels_per_line; 365 } 366 } 367 } 368} 369 370#endif 371