1/* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <tmmintrin.h> // SSSE3 12 13#include <string.h> 14 15#include "./vpx_dsp_rtcd.h" 16#include "vpx_dsp/vpx_filter.h" 17#include "vpx_dsp/x86/convolve.h" 18#include "vpx_dsp/x86/convolve_ssse3.h" 19#include "vpx_dsp/x86/mem_sse2.h" 20#include "vpx_dsp/x86/transpose_sse2.h" 21#include "vpx_mem/vpx_mem.h" 22#include "vpx_ports/mem.h" 23 24// These are reused by the avx2 intrinsics. 25// vpx_filter_block1d8_v8_intrin_ssse3() 26// vpx_filter_block1d8_h8_intrin_ssse3() 27// vpx_filter_block1d4_h8_intrin_ssse3() 28 29static INLINE __m128i shuffle_filter_convolve8_8_ssse3( 30 const __m128i *const s, const int16_t *const filter) { 31 __m128i f[4]; 32 shuffle_filter_ssse3(filter, f); 33 return convolve8_8_ssse3(s, f); 34} 35 36void vpx_filter_block1d4_h8_intrin_ssse3( 37 const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, 38 ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { 39 __m128i firstFilters, secondFilters, shuffle1, shuffle2; 40 __m128i srcRegFilt1, srcRegFilt2; 41 __m128i addFilterReg64, filtersReg, srcReg; 42 unsigned int i; 43 44 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 45 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); 46 filtersReg = _mm_loadu_si128((const __m128i *)filter); 47 // converting the 16 bit (short) to 8 bit (byte) and have the same data 48 // in both lanes of 128 bit register. 49 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); 50 51 // duplicate only the first 16 bits in the filter into the first lane 52 firstFilters = _mm_shufflelo_epi16(filtersReg, 0); 53 // duplicate only the third 16 bit in the filter into the first lane 54 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); 55 // duplicate only the seconds 16 bits in the filter into the second lane 56 // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 57 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); 58 // duplicate only the forth 16 bits in the filter into the second lane 59 // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 60 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); 61 62 // loading the local filters 63 shuffle1 = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6); 64 shuffle2 = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10); 65 66 for (i = 0; i < output_height; i++) { 67 srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); 68 69 // filter the source buffer 70 srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1); 71 srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2); 72 73 // multiply 2 adjacent elements with the filter and add the result 74 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); 75 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); 76 77 // sum the results together, saturating only on the final step 78 // the specific order of the additions prevents outranges 79 srcRegFilt1 = _mm_add_epi16(srcRegFilt1, srcRegFilt2); 80 81 // extract the higher half of the register 82 srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8); 83 84 // add the rounding offset early to avoid another saturated add 85 srcRegFilt1 = _mm_add_epi16(srcRegFilt1, addFilterReg64); 86 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); 87 88 // shift by 7 bit each 16 bits 89 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); 90 91 // shrink to 8 bit each 16 bits 92 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); 93 src_ptr += src_pitch; 94 95 // save only 4 bytes 96 *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1); 97 98 output_ptr += output_pitch; 99 } 100} 101 102void vpx_filter_block1d8_h8_intrin_ssse3( 103 const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, 104 ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { 105 unsigned int i; 106 __m128i f[4], filt[4], s[4]; 107 108 shuffle_filter_ssse3(filter, f); 109 filt[0] = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); 110 filt[1] = _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); 111 filt[2] = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12); 112 filt[3] = 113 _mm_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14); 114 115 for (i = 0; i < output_height; i++) { 116 const __m128i srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); 117 118 // filter the source buffer 119 s[0] = _mm_shuffle_epi8(srcReg, filt[0]); 120 s[1] = _mm_shuffle_epi8(srcReg, filt[1]); 121 s[2] = _mm_shuffle_epi8(srcReg, filt[2]); 122 s[3] = _mm_shuffle_epi8(srcReg, filt[3]); 123 s[0] = convolve8_8_ssse3(s, f); 124 125 // shrink to 8 bit each 16 bits 126 s[0] = _mm_packus_epi16(s[0], s[0]); 127 128 src_ptr += src_pitch; 129 130 // save only 8 bytes 131 _mm_storel_epi64((__m128i *)&output_ptr[0], s[0]); 132 133 output_ptr += output_pitch; 134 } 135} 136 137void vpx_filter_block1d8_v8_intrin_ssse3( 138 const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, 139 ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { 140 unsigned int i; 141 __m128i f[4], s[8], ss[4]; 142 143 shuffle_filter_ssse3(filter, f); 144 145 // load the first 7 rows of 8 bytes 146 s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); 147 s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); 148 s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); 149 s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); 150 s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); 151 s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); 152 s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); 153 154 for (i = 0; i < output_height; i++) { 155 // load the last 8 bytes 156 s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); 157 158 // merge the result together 159 ss[0] = _mm_unpacklo_epi8(s[0], s[1]); 160 ss[1] = _mm_unpacklo_epi8(s[2], s[3]); 161 162 // merge the result together 163 ss[2] = _mm_unpacklo_epi8(s[4], s[5]); 164 ss[3] = _mm_unpacklo_epi8(s[6], s[7]); 165 166 ss[0] = convolve8_8_ssse3(ss, f); 167 // shrink to 8 bit each 16 bits 168 ss[0] = _mm_packus_epi16(ss[0], ss[0]); 169 170 src_ptr += src_pitch; 171 172 // shift down a row 173 s[0] = s[1]; 174 s[1] = s[2]; 175 s[2] = s[3]; 176 s[3] = s[4]; 177 s[4] = s[5]; 178 s[5] = s[6]; 179 s[6] = s[7]; 180 181 // save only 8 bytes convolve result 182 _mm_storel_epi64((__m128i *)&output_ptr[0], ss[0]); 183 184 output_ptr += out_pitch; 185 } 186} 187 188filter8_1dfunction vpx_filter_block1d16_v8_ssse3; 189filter8_1dfunction vpx_filter_block1d16_h8_ssse3; 190filter8_1dfunction vpx_filter_block1d8_v8_ssse3; 191filter8_1dfunction vpx_filter_block1d8_h8_ssse3; 192filter8_1dfunction vpx_filter_block1d4_v8_ssse3; 193filter8_1dfunction vpx_filter_block1d4_h8_ssse3; 194filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3; 195filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3; 196filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; 197filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; 198filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; 199filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; 200 201filter8_1dfunction vpx_filter_block1d16_v2_ssse3; 202filter8_1dfunction vpx_filter_block1d16_h2_ssse3; 203filter8_1dfunction vpx_filter_block1d8_v2_ssse3; 204filter8_1dfunction vpx_filter_block1d8_h2_ssse3; 205filter8_1dfunction vpx_filter_block1d4_v2_ssse3; 206filter8_1dfunction vpx_filter_block1d4_h2_ssse3; 207filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3; 208filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3; 209filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3; 210filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3; 211filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3; 212filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; 213 214// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, 215// uint8_t *dst, ptrdiff_t dst_stride, 216// const InterpKernel *filter, int x0_q4, 217// int32_t x_step_q4, int y0_q4, int y_step_q4, 218// int w, int h); 219// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, 220// uint8_t *dst, ptrdiff_t dst_stride, 221// const InterpKernel *filter, int x0_q4, 222// int32_t x_step_q4, int y0_q4, int y_step_q4, 223// int w, int h); 224// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, 225// uint8_t *dst, ptrdiff_t dst_stride, 226// const InterpKernel *filter, int x0_q4, 227// int32_t x_step_q4, int y0_q4, 228// int y_step_q4, int w, int h); 229// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, 230// uint8_t *dst, ptrdiff_t dst_stride, 231// const InterpKernel *filter, int x0_q4, 232// int32_t x_step_q4, int y0_q4, 233// int y_step_q4, int w, int h); 234FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3); 235FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , ssse3); 236FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3); 237FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, ssse3); 238 239static void filter_horiz_w8_ssse3(const uint8_t *const src, 240 const ptrdiff_t src_stride, 241 uint8_t *const dst, 242 const int16_t *const x_filter) { 243 __m128i s[8], ss[4], temp; 244 245 load_8bit_8x8(src, src_stride, s); 246 // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 247 // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 248 // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 249 // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 250 transpose_16bit_4x8(s, ss); 251 temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter); 252 // shrink to 8 bit each 16 bits 253 temp = _mm_packus_epi16(temp, temp); 254 // save only 8 bytes convolve result 255 _mm_storel_epi64((__m128i *)dst, temp); 256} 257 258static void transpose8x8_to_dst(const uint8_t *const src, 259 const ptrdiff_t src_stride, uint8_t *const dst, 260 const ptrdiff_t dst_stride) { 261 __m128i s[8]; 262 263 load_8bit_8x8(src, src_stride, s); 264 transpose_8bit_8x8(s, s); 265 store_8bit_8x8(s, dst, dst_stride); 266} 267 268static void scaledconvolve_horiz_w8(const uint8_t *src, 269 const ptrdiff_t src_stride, uint8_t *dst, 270 const ptrdiff_t dst_stride, 271 const InterpKernel *const x_filters, 272 const int x0_q4, const int x_step_q4, 273 const int w, const int h) { 274 DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); 275 int x, y, z; 276 src -= SUBPEL_TAPS / 2 - 1; 277 278 // This function processes 8x8 areas. The intermediate height is not always 279 // a multiple of 8, so force it to be a multiple of 8 here. 280 y = h + (8 - (h & 0x7)); 281 282 do { 283 int x_q4 = x0_q4; 284 for (x = 0; x < w; x += 8) { 285 // process 8 src_x steps 286 for (z = 0; z < 8; ++z) { 287 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; 288 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; 289 if (x_q4 & SUBPEL_MASK) { 290 filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter); 291 } else { 292 int i; 293 for (i = 0; i < 8; ++i) { 294 temp[z * 8 + i] = src_x[i * src_stride + 3]; 295 } 296 } 297 x_q4 += x_step_q4; 298 } 299 300 // transpose the 8x8 filters values back to dst 301 transpose8x8_to_dst(temp, 8, dst + x, dst_stride); 302 } 303 304 src += src_stride * 8; 305 dst += dst_stride * 8; 306 } while (y -= 8); 307} 308 309static void filter_horiz_w4_ssse3(const uint8_t *const src, 310 const ptrdiff_t src_stride, 311 uint8_t *const dst, 312 const int16_t *const filter) { 313 __m128i s[4], ss[2]; 314 __m128i temp; 315 316 load_8bit_8x4(src, src_stride, s); 317 transpose_16bit_4x4(s, ss); 318 // 00 01 10 11 20 21 30 31 319 s[0] = ss[0]; 320 // 02 03 12 13 22 23 32 33 321 s[1] = _mm_srli_si128(ss[0], 8); 322 // 04 05 14 15 24 25 34 35 323 s[2] = ss[1]; 324 // 06 07 16 17 26 27 36 37 325 s[3] = _mm_srli_si128(ss[1], 8); 326 327 temp = shuffle_filter_convolve8_8_ssse3(s, filter); 328 // shrink to 8 bit each 16 bits 329 temp = _mm_packus_epi16(temp, temp); 330 // save only 4 bytes 331 *(int *)dst = _mm_cvtsi128_si32(temp); 332} 333 334static void transpose4x4_to_dst(const uint8_t *const src, 335 const ptrdiff_t src_stride, uint8_t *const dst, 336 const ptrdiff_t dst_stride) { 337 __m128i s[4]; 338 339 load_8bit_4x4(src, src_stride, s); 340 s[0] = transpose_8bit_4x4(s); 341 s[1] = _mm_srli_si128(s[0], 4); 342 s[2] = _mm_srli_si128(s[0], 8); 343 s[3] = _mm_srli_si128(s[0], 12); 344 store_8bit_4x4(s, dst, dst_stride); 345} 346 347static void scaledconvolve_horiz_w4(const uint8_t *src, 348 const ptrdiff_t src_stride, uint8_t *dst, 349 const ptrdiff_t dst_stride, 350 const InterpKernel *const x_filters, 351 const int x0_q4, const int x_step_q4, 352 const int w, const int h) { 353 DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); 354 int x, y, z; 355 src -= SUBPEL_TAPS / 2 - 1; 356 357 for (y = 0; y < h; y += 4) { 358 int x_q4 = x0_q4; 359 for (x = 0; x < w; x += 4) { 360 // process 4 src_x steps 361 for (z = 0; z < 4; ++z) { 362 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; 363 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; 364 if (x_q4 & SUBPEL_MASK) { 365 filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter); 366 } else { 367 int i; 368 for (i = 0; i < 4; ++i) { 369 temp[z * 4 + i] = src_x[i * src_stride + 3]; 370 } 371 } 372 x_q4 += x_step_q4; 373 } 374 375 // transpose the 4x4 filters values back to dst 376 transpose4x4_to_dst(temp, 4, dst + x, dst_stride); 377 } 378 379 src += src_stride * 4; 380 dst += dst_stride * 4; 381 } 382} 383 384static __m128i filter_vert_kernel(const __m128i *const s, 385 const int16_t *const filter) { 386 __m128i ss[4]; 387 __m128i temp; 388 389 // 00 10 01 11 02 12 03 13 390 ss[0] = _mm_unpacklo_epi8(s[0], s[1]); 391 // 20 30 21 31 22 32 23 33 392 ss[1] = _mm_unpacklo_epi8(s[2], s[3]); 393 // 40 50 41 51 42 52 43 53 394 ss[2] = _mm_unpacklo_epi8(s[4], s[5]); 395 // 60 70 61 71 62 72 63 73 396 ss[3] = _mm_unpacklo_epi8(s[6], s[7]); 397 398 temp = shuffle_filter_convolve8_8_ssse3(ss, filter); 399 // shrink to 8 bit each 16 bits 400 return _mm_packus_epi16(temp, temp); 401} 402 403static void filter_vert_w4_ssse3(const uint8_t *const src, 404 const ptrdiff_t src_stride, uint8_t *const dst, 405 const int16_t *const filter) { 406 __m128i s[8]; 407 __m128i temp; 408 409 load_8bit_4x8(src, src_stride, s); 410 temp = filter_vert_kernel(s, filter); 411 // save only 4 bytes 412 *(int *)dst = _mm_cvtsi128_si32(temp); 413} 414 415static void scaledconvolve_vert_w4( 416 const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, 417 const ptrdiff_t dst_stride, const InterpKernel *const y_filters, 418 const int y0_q4, const int y_step_q4, const int w, const int h) { 419 int y; 420 int y_q4 = y0_q4; 421 422 src -= src_stride * (SUBPEL_TAPS / 2 - 1); 423 for (y = 0; y < h; ++y) { 424 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; 425 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; 426 427 if (y_q4 & SUBPEL_MASK) { 428 filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); 429 } else { 430 memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); 431 } 432 433 y_q4 += y_step_q4; 434 } 435} 436 437static void filter_vert_w8_ssse3(const uint8_t *const src, 438 const ptrdiff_t src_stride, uint8_t *const dst, 439 const int16_t *const filter) { 440 __m128i s[8], temp; 441 442 load_8bit_8x8(src, src_stride, s); 443 temp = filter_vert_kernel(s, filter); 444 // save only 8 bytes convolve result 445 _mm_storel_epi64((__m128i *)dst, temp); 446} 447 448static void scaledconvolve_vert_w8( 449 const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, 450 const ptrdiff_t dst_stride, const InterpKernel *const y_filters, 451 const int y0_q4, const int y_step_q4, const int w, const int h) { 452 int y; 453 int y_q4 = y0_q4; 454 455 src -= src_stride * (SUBPEL_TAPS / 2 - 1); 456 for (y = 0; y < h; ++y) { 457 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; 458 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; 459 if (y_q4 & SUBPEL_MASK) { 460 filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); 461 } else { 462 memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); 463 } 464 y_q4 += y_step_q4; 465 } 466} 467 468static void filter_vert_w16_ssse3(const uint8_t *src, 469 const ptrdiff_t src_stride, 470 uint8_t *const dst, 471 const int16_t *const filter, const int w) { 472 int i; 473 __m128i f[4]; 474 shuffle_filter_ssse3(filter, f); 475 476 for (i = 0; i < w; i += 16) { 477 __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi; 478 479 loadu_8bit_16x8(src, src_stride, s); 480 481 // merge the result together 482 s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]); 483 s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]); 484 s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]); 485 s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]); 486 s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]); 487 s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]); 488 s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]); 489 s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]); 490 temp_lo = convolve8_8_ssse3(s_lo, f); 491 temp_hi = convolve8_8_ssse3(s_hi, f); 492 493 // shrink to 8 bit each 16 bits, the first lane contain the first convolve 494 // result and the second lane contain the second convolve result 495 temp_hi = _mm_packus_epi16(temp_lo, temp_hi); 496 src += 16; 497 // save 16 bytes convolve result 498 _mm_store_si128((__m128i *)&dst[i], temp_hi); 499 } 500} 501 502static void scaledconvolve_vert_w16( 503 const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, 504 const ptrdiff_t dst_stride, const InterpKernel *const y_filters, 505 const int y0_q4, const int y_step_q4, const int w, const int h) { 506 int y; 507 int y_q4 = y0_q4; 508 509 src -= src_stride * (SUBPEL_TAPS / 2 - 1); 510 for (y = 0; y < h; ++y) { 511 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; 512 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; 513 if (y_q4 & SUBPEL_MASK) { 514 filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter, 515 w); 516 } else { 517 memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); 518 } 519 y_q4 += y_step_q4; 520 } 521} 522 523void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, 524 ptrdiff_t dst_stride, const InterpKernel *filter, 525 int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, 526 int w, int h) { 527 // Note: Fixed size intermediate buffer, temp, places limits on parameters. 528 // 2d filtering proceeds in 2 steps: 529 // (1) Interpolate horizontally into an intermediate buffer, temp. 530 // (2) Interpolate temp vertically to derive the sub-pixel result. 531 // Deriving the maximum number of rows in the temp buffer (135): 532 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). 533 // --Largest block size is 64x64 pixels. 534 // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the 535 // original frame (in 1/16th pixel units). 536 // --Must round-up because block may be located at sub-pixel position. 537 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. 538 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. 539 // --Require an additional 8 rows for the horiz_w8 transpose tail. 540 // When calling in frame scaling function, the smallest scaling factor is x1/4 541 // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still 542 // big enough. 543 DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); 544 const int intermediate_height = 545 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; 546 547 assert(w <= 64); 548 assert(h <= 64); 549 assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); 550 assert(x_step_q4 <= 64); 551 552 if (w >= 8) { 553 scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), 554 src_stride, temp, 64, filter, x0_q4, x_step_q4, w, 555 intermediate_height); 556 } else { 557 scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), 558 src_stride, temp, 64, filter, x0_q4, x_step_q4, w, 559 intermediate_height); 560 } 561 562 if (w >= 16) { 563 scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, 564 dst_stride, filter, y0_q4, y_step_q4, w, h); 565 } else if (w == 8) { 566 scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, 567 dst_stride, filter, y0_q4, y_step_q4, w, h); 568 } else { 569 scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, 570 dst_stride, filter, y0_q4, y_step_q4, w, h); 571 } 572} 573 574// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, 575// uint8_t *dst, ptrdiff_t dst_stride, 576// const InterpKernel *filter, int x0_q4, 577// int32_t x_step_q4, int y0_q4, int y_step_q4, 578// int w, int h); 579// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, 580// uint8_t *dst, ptrdiff_t dst_stride, 581// const InterpKernel *filter, int x0_q4, 582// int32_t x_step_q4, int y0_q4, int y_step_q4, 583// int w, int h); 584FUN_CONV_2D(, ssse3); 585FUN_CONV_2D(avg_, ssse3); 586