/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>
#include "vpx_ports/mem.h"

// filters for 16_h8 and 16_v8
DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};

DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};

DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};

DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
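
// The tables above are _mm256_shuffle_epi8 control masks: filt1 gathers the
// overlapping source byte pairs (i, i+1) that feed taps 0..1 of the 8 tap
// filter, filt2 gathers (i+2, i+3) for taps 2..3, and so on.  The 16 byte
// pattern is repeated because the 256 bit shuffle operates on each 128 bit
// lane independently.  For reference, each horizontal output pixel is
//   out[x] = clip_pixel((sum(src[x - 3 + k] * filter[k], k = 0..7) + 64) >> 7)
// which the code below evaluates 16 pixels (and up to 2 rows) at a time.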

// Work around older compilers that spell the 128 to 256 bit broadcast
// intrinsic _mm_broadcastsi128_si256(), or that take its argument by
// pointer, instead of the current _mm256_broadcastsi128_si256(x).
#if defined(__clang__)
# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \
      (defined(__APPLE__) && __clang_major__ == 5 && __clang_minor__ == 0)
#  define MM256_BROADCASTSI128_SI256(x) \
       _mm_broadcastsi128_si256((__m128i const *)&(x))
# else  // clang > 3.3, and not 5.0 on macosx.
#  define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
# endif  // clang <= 3.3
#elif defined(__GNUC__)
# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
#  define MM256_BROADCASTSI128_SI256(x) \
       _mm_broadcastsi128_si256((__m128i const *)&(x))
# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
#  define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
# else  // gcc > 4.7
#  define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
# endif  // gcc <= 4.6
#else  // !(gcc || clang)
# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // __clang__

void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned char *output_ptr,
                                  unsigned int output_pitch,
                                  unsigned int output_height,
                                  int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  unsigned int src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16 bit (short) values to 8 bit (byte) values and place the
  // same data in both lanes of the 128 bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across the 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across the 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across the 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);

  // multiply the source and destination strides by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load 16 bytes from each of the 2 rows
    srcReg32b1 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)(src_ptr - 3)));
    srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
        _mm_loadu_si128((__m128i *)(src_ptr + src_pixels_per_line - 3)), 1);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
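
    // Note: the remaining two partial sums (taps 4..5 and taps 6..7) are
    // folded in as min() followed by max().  Adding the smaller value first
    // appears to be an ordering trick to keep the saturating 16 bit adds
    // from clipping an intermediate total that the full-precision sum would
    // not; the SSSE3 version of this filter uses the same pattern.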
    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
        _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // load the next 16 bytes of each of the 2 rows
    // (partially overlapping the earlier loads)
    srcReg32b2 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)(src_ptr + 5)));
    srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
        _mm_loadu_si128((__m128i *)(src_ptr + src_pixels_per_line + 5)), 1);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
        _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // filter the source buffer
    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
        _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
        _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
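
    // Round and scale: the 8 tap filters are Q7 (the taps sum to 128), so
    // adding 64 and shifting right by 7 converts the accumulated sums back
    // to pixel range before packing.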
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);

    // shift each 16 bit value right by 7 bits
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);

    // pack each 16 bit value to 8 bits; the first lane contains the first
    // row's convolve result and the second lane contains the second row's
    // convolve result
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1,
                                           srcRegFilt32b2_1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcRegFilt32b1_1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + output_pitch),
                    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
    output_ptr += dst_stride;
  }

  // if the number of output rows is odd, process the last row
  // (16 bytes) on its own
  if (i > 0) {
    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
                                     _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
                                   _mm256_castsi256_si128(filt2Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
                                      _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                                    _mm256_castsi256_si128(secondFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1,
                                   _mm256_castsi256_si128(filt4Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
                                   _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
                                    _mm256_castsi256_si128(forthFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                                    _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // load the next 16 bytes
    // (partially overlapping the earlier load)
    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + 5));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
                                     _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
                                   _mm256_castsi256_si128(filt2Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
                                      _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                                    _mm256_castsi256_si128(secondFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
                                   _mm256_castsi256_si128(filt4Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
                                   _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
                                    _mm256_castsi256_si128(forthFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                                    _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // round (add 64) and shift each 16 bit value right by 7 bits
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                                   _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                                   _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // pack the 16 bit values to 8 bits, giving all 16 output pixels of
    // the row (first 8 from srcRegFilt1_1, last 8 from srcRegFilt2_1)
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
  }
}

void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
                                  unsigned int src_pitch,
                                  unsigned char *output_ptr,
                                  unsigned int out_pitch,
                                  unsigned int output_height,
                                  int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  unsigned int src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16 bit (short) values to 8 bit (byte) values and place the
  // same data in both lanes of the 128 bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across the 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across the 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across the 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  // multiply the source and destination strides by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;
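
  // Strategy for the vertical filter: each output row needs 8 consecutive
  // source rows, so the window of rows is kept live in registers across
  // iterations.  Each 256 bit register holds two adjacent rows (one per
  // 128 bit lane); interleaving a register with its one-row-down neighbour
  // puts vertically adjacent pixels side by side, which is the byte layout
  // _mm256_maddubs_epi16 needs to apply a pair of filter taps down a
  // column.  Every loop iteration produces two output rows.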

  // load 16 bytes from each of the first 7 rows
  srcReg32b1 = _mm256_castsi128_si256(
      _mm_loadu_si128((__m128i *)(src_ptr)));
  srcReg32b2 = _mm256_castsi128_si256(
      _mm_loadu_si128((__m128i *)(src_ptr + src_pitch)));
  srcReg32b3 = _mm256_castsi128_si256(
      _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2)));
  srcReg32b4 = _mm256_castsi128_si256(
      _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3)));
  srcReg32b5 = _mm256_castsi128_si256(
      _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4)));
  srcReg32b6 = _mm256_castsi128_si256(
      _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5)));
  srcReg32b7 = _mm256_castsi128_si256(
      _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6)));

  // place each pair of consecutive rows in the same 256 bit register
  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
                                       _mm256_castsi256_si128(srcReg32b2), 1);
  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
                                       _mm256_castsi256_si128(srcReg32b3), 1);
  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
                                       _mm256_castsi256_si128(srcReg32b4), 1);
  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
                                       _mm256_castsi256_si128(srcReg32b5), 1);
  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
                                       _mm256_castsi256_si128(srcReg32b6), 1);
  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
                                       _mm256_castsi256_si128(srcReg32b7), 1);
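
  // After the unpacks below, every byte pair in the merged registers is
  // (pixel of row k, pixel of row k + 1), ready for maddubs with the
  // matching pair of filter taps.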
  // merge every two consecutive registers except the last one
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);

  for (i = output_height; i > 1; i -= 2) {
    // load the next 2 rows of 16 bytes and place every two consecutive
    // rows in the same 256 bit register
    srcReg32b8 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7)));
    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
                                         _mm256_castsi256_si128(srcReg32b8),
                                         1);
    srcReg32b9 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 8)));
    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
                                         _mm256_castsi256_si128(srcReg32b9),
                                         1);

    // merge every two consecutive registers
    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
    srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

    // add and saturate the results together, pairing min() and max() as in
    // the horizontal filter
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                                   _mm256_min_epi16(srcReg32b6, srcReg32b13));
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_max_epi16(srcReg32b8, srcReg32b12));
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                                   _mm256_max_epi16(srcReg32b6, srcReg32b13));

    // round (add 64) and shift each 16 bit value right by 7 bits
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);

    // pack each 16 bit value to 8 bits; the first lane contains the first
    // row's convolve result and the second lane contains the second row's
    // convolve result
    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcReg32b1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + out_pitch),
                    _mm256_extractf128_si256(srcReg32b1, 1));

    output_ptr += dst_stride;
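
    // Slide the pipeline of merged row pairs down by two rows: six of the
    // eight interleaved registers are reused, so the next iteration only
    // has to load and merge rows 7 and 8 of its new window.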
    srcReg32b10 = srcReg32b11;
    srcReg32b1 = srcReg32b3;
    srcReg32b11 = srcReg32b2;
    srcReg32b3 = srcReg32b5;
    srcReg32b2 = srcReg32b4;
    srcReg32b5 = srcReg32b7;
    srcReg32b7 = srcReg32b9;
  }

  // if the number of output rows is odd, process the last row on its own
  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
    // load the last 16 bytes
    srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7));

    // merge the last 2 rows together
    srcRegFilt4 = _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7),
                                    srcRegFilt8);
    srcRegFilt7 = _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7),
                                    srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
                                    _mm256_castsi256_si128(forthFilters));
    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
                                    _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                                    _mm256_castsi256_si128(secondFilters));
    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
                                    _mm256_castsi256_si128(secondFilters));
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                                    _mm256_castsi256_si128(thirdFilters));
    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
                                    _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together, pairing min() and max()
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm_min_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                                 _mm_min_epi16(srcRegFilt5, srcRegFilt7));
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm_max_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                                 _mm_max_epi16(srcRegFilt5, srcRegFilt7));

    // round (add 64) and shift each 16 bit value right by 7 bits
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                                 _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);

    // pack the 16 bit values to 8 bits, giving all 16 output pixels of
    // the last row (first 8 from srcRegFilt1, last 8 from srcRegFilt3)
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
  }
}