vpx_subpixel_8t_intrin_avx2.c revision 7ce0a1d1337c01056ba24006efab21f00e179e04
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

// Due to a header conflict between math.h and intrinsics includes with ceil()
// in certain configurations under vs9 this include needs to precede
// immintrin.h.

#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/convolve.h"
#include "vpx_ports/mem.h"

// filters for 16_h8 and 16_v8
DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};

DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};

DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};

DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};

#if defined(__clang__)
# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \
     (defined(__APPLE__) && __clang_major__ == 5 && __clang_minor__ == 0)
#  define MM256_BROADCASTSI128_SI256(x) \
     _mm_broadcastsi128_si256((__m128i const *)&(x))
# else  // clang > 3.3, and not 5.0 on macosx.
#  define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
# endif  // clang <= 3.3
#elif defined(__GNUC__)
# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
#  define MM256_BROADCASTSI128_SI256(x) \
     _mm_broadcastsi128_si256((__m128i const *)&(x))
# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
#  define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
# else  // gcc > 4.7
#  define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
# endif  // gcc <= 4.6
#else  // !(gcc || clang)
# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // __clang__
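// How the tables above are used: for every output pixel x, the 8-tap filter
// reads input bytes [x - 3, x + 4]. filt1..filt4 are pshufb masks that gather
// overlapping byte pairs from a 16-byte load so that _mm256_maddubs_epi16 can
// multiply each unsigned-byte pair by a pair of signed 8-bit taps and sum the
// two products into one 16-bit lane:
//   filt1 pairs bytes (k, k + 1)     with taps (f0, f1)
//   filt2 pairs bytes (k + 2, k + 3) with taps (f2, f3)
//   filt3 pairs bytes (k + 4, k + 5) with taps (f4, f5)
//   filt4 pairs bytes (k + 6, k + 7) with taps (f6, f7)
// Adding the four partial sums yields the full 8-tap convolution for 16
// output pixels per row, computed from two 16-byte loads (at src_ptr - 3 and
// src_ptr + 5) that together cover the 23 input bytes required.
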
static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
                                         ptrdiff_t src_pixels_per_line,
                                         uint8_t *output_ptr,
                                         ptrdiff_t output_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  // (sixteen 16-bit values of 64, the rounding offset)
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) coefficients to 8 bit (byte) and keep
  // the same data in both halves of the 128-bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across the 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across the 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across the 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load the 2 rows of source
    srcReg32b1 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr - 3)));
    srcReg32b1 = _mm256_inserti128_si256(
        srcReg32b1,
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
        1);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
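    // Note on the min/max pattern below: _mm256_adds_epi16 saturates, so the
    // order of accumulation matters. The two inner tap pairs produce the
    // largest partial sums, typically with opposite signs for sharp kernels;
    // adding the smaller (min) of the two first and the larger (max) last is
    // presumably intended to keep the running sum away from the 16-bit
    // saturation limits until the end.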
    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(
        srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // read the 2 rows of the next 16 bytes
    // (part of it was being read by the earlier read)
    srcReg32b2 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + 5)));
    srcReg32b2 = _mm256_inserti128_si256(
        srcReg32b2,
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
        1);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(
        srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // filter the source buffer
    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(
        srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
    srcRegFilt32b2_1 = _mm256_adds_epi16(
        srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // add the rounding offset (64) before the shift
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);

    // pack each 16-bit result back to 8 bits; the first lane contains the
    // first row's convolve result and the second lane contains the second
    // row's result
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcRegFilt32b1_1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + output_pitch),
                    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
    output_ptr += dst_stride;
  }
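  // Note on the rounding used in the loop above: the 8 taps sum to 128, i.e.
  // 7 fractional bits, so each output is (sum + 64) >> 7, rounded to nearest.
  // E.g. with the half-pel bilinear kernel {0, 0, 0, 64, 64, 0, 0, 0} and
  // equal neighbors of value 100: (100 * 64 + 100 * 64 + 64) >> 7 = 100.
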
  // if the number of rows is odd,
  // process the single remaining row of 16 bytes
  if (i > 0) {
    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1_1 =
        _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
                                      _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // read the next 16 bytes
    // (part of it was being read by the earlier read)
    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 =
        _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
                                      _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // add the rounding offset (64) before the shift
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                                   _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                                   _mm256_castsi256_si128(addFilterReg64));

    // shift by 7 bit each 16 bit
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // pack each 16-bit result back to 8 bits; the low half contains the first
    // 8 pixels of the result and the high half contains the second 8 pixels
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
  }
}
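// Vertical 8-tap filter. The caller (see FUN_CONV_1D(vert, ...) at the bottom
// of this file) passes src - src_stride * 3, so output row y is a weighted
// sum of input rows y .. y + 7. The implementation keeps eight input rows
// live: vertically adjacent rows are byte-interleaved with unpacklo/unpackhi
// so that _mm256_maddubs_epi16 can multiply each (row k, row k + 1) byte pair
// by a pair of taps, and the interleaved registers are rotated down one row
// pair per iteration so only two new rows are loaded per two output rows.
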
static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
                                         ptrdiff_t src_pitch,
                                         uint8_t *output_ptr,
                                         ptrdiff_t out_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  // (sixteen 16-bit values of 64, the rounding offset)
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) coefficients to 8 bit (byte) and keep
  // the same data in both halves of the 128-bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across the 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across the 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across the 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  // multiply the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load 16 bytes from each of the first 7 rows, one src_pitch apart
  srcReg32b1 =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr)));
  srcReg32b2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
  srcReg32b3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
  srcReg32b4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
  srcReg32b5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
  srcReg32b6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
  srcReg32b7 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));

  // place each pair of consecutive rows in the same 256 bit register
  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
                                       _mm256_castsi256_si128(srcReg32b2), 1);
  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
                                       _mm256_castsi256_si128(srcReg32b3), 1);
  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
                                       _mm256_castsi256_si128(srcReg32b4), 1);
  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
                                       _mm256_castsi256_si128(srcReg32b5), 1);
  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
                                       _mm256_castsi256_si128(srcReg32b6), 1);
  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
                                       _mm256_castsi256_si128(srcReg32b7), 1);

  // merge every two consecutive registers except the last one
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
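  // At this point each interleaved register holds byte pairs from rows
  // (k, k + 1) in its low lane and rows (k + 1, k + 2) in its high lane, so
  // the low lane feeds output row y and the high lane output row y + 1. The
  // loop below loads two new rows per iteration and shifts these registers
  // down so the window advances without reloading the other six rows.
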
  for (i = output_height; i > 1; i -= 2) {
    // load the next 2 rows of 16 bytes and place every two
    // consecutive rows in the same 256 bit register
    srcReg32b8 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
    srcReg32b7 = _mm256_inserti128_si256(
        srcReg32b7, _mm256_castsi256_si128(srcReg32b8), 1);
    srcReg32b9 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
    srcReg32b8 = _mm256_inserti128_si256(
        srcReg32b8, _mm256_castsi256_si128(srcReg32b9), 1);

    // merge every two consecutive registers
    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_max_epi16(srcReg32b8, srcReg32b12));

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);

    // add and saturate the results together
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

    // add and saturate the results together
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                                   _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                                   _mm256_max_epi16(srcReg32b8, srcReg32b12));

    // add the rounding offset (64) before the shift
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);

    // pack each 16-bit result back to 8 bits; the first lane contains the
    // first row's convolve result and the second lane contains the second
    // row's result
    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + out_pitch),
                    _mm256_extractf128_si256(srcReg32b1, 1));

    output_ptr += dst_stride;

    // save part of the registers for the next iteration
    srcReg32b10 = srcReg32b11;
    srcReg32b1 = srcReg32b3;
    srcReg32b11 = srcReg32b2;
    srcReg32b3 = srcReg32b5;
    srcReg32b2 = srcReg32b4;
    srcReg32b5 = srcReg32b7;
    srcReg32b7 = srcReg32b9;
  }
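  // For reference, a scalar sketch of what the loop above vectorizes (the
  // canonical version lives in vpx_dsp/vpx_convolve.c):
  //
  //   for (y = 0; y < output_height; ++y)
  //     for (x = 0; x < 16; ++x) {
  //       int k, sum = 0;
  //       for (k = 0; k < 8; ++k)
  //         sum += src_ptr[(y + k) * src_pitch + x] * filter[k];
  //       output_ptr[y * out_pitch + x] = clip_pixel((sum + 64) >> 7);
  //     }
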
  // if the number of rows is odd,
  // process the single remaining row of 16 bytes
  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
    // load the last 16 bytes
    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the last 2 rows together
    srcRegFilt4 =
        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    srcRegFilt7 =
        _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 =
        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt7 =
        _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                                    _mm256_castsi256_si128(secondFilters));
    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
                                    _mm256_castsi256_si128(secondFilters));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                                    _mm256_castsi256_si128(thirdFilters));
    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
                                    _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 =
        _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7));

    // add and saturate the results together
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 =
        _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7));

    // add the rounding offset (64) before the shift
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt3 =
        _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64));

    // shift by 7 bit each 16 bit
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);

    // pack each 16-bit result back to 8 bits; the low half contains the first
    // 8 pixels of the result and the high half contains the second 8 pixels
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
  }
}
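// Only the 16-wide 8-tap kernels have AVX2 implementations in this file; the
// narrower block sizes and the 2-tap (bilinear) kernels are aliased to their
// SSSE3 counterparts below so that the FUN_CONV_* macros can emit complete
// vpx_convolve8_*_avx2 entry points.
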
#if HAVE_AVX2 && HAVE_SSSE3
filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
#if ARCH_X86_64
filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3
#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3
#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3
#else  // ARCH_X86
filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3
#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3
#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
#endif  // ARCH_X86_64
filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3
#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3
#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3
#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3
#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3
#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3
#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3
// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
// void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const int16_t *filter_x, int x_step_q4,
//                              const int16_t *filter_y, int y_step_q4,
//                              int w, int h);
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);

// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                         uint8_t *dst, ptrdiff_t dst_stride,
//                         const int16_t *filter_x, int x_step_q4,
//                         const int16_t *filter_y, int y_step_q4,
//                         int w, int h);
FUN_CONV_2D(, avx2);
#endif  // HAVE_AVX2 && HAVE_SSSE3
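// Illustrative use (hypothetical `kernel` table, not part of this file):
// assuming kernel[q] holds the 8 taps for sub-pixel phase q/16, with taps
// summing to 128, a fixed-phase 16x16 horizontal filter could be invoked as
//
//   vpx_convolve8_horiz_avx2(src, src_stride, dst, dst_stride,
//                            kernel[5], 16, kernel[0], 16, 16, 16);
//
// where x_step_q4 == 16 selects the single-phase (non-scaling) path.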