/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
 *******************************************************************************
 * @file
 *  ih264e_half_pel_ssse3.c
 *
 * @brief
 *  Contains the x86 intrinsic function definitions for the 6-tap horizontal
 *  filter and the cascaded 2D filter used in motion estimation in H264 encoder.
28 * 29 * @author 30 * Ittiam 31 * 32 * @par List of Functions: 33 * ih264e_sixtapfilter_horz_ssse3 34 * ih264e_sixtap_filter_2dvh_vert_ssse3 35 * 36 * @remarks 37 * None 38 * 39 ******************************************************************************* 40 */ 41 42/*****************************************************************************/ 43/* File Includes */ 44/*****************************************************************************/ 45 46/* System include files */ 47#include <stdio.h> 48#include <assert.h> 49#include <limits.h> 50 51/* User include files */ 52#include "ih264_typedefs.h" 53#include "ithread.h" 54#include "ih264_platform_macros.h" 55#include "ih264_defs.h" 56#include "ih264e_half_pel.h" 57#include "ih264_macros.h" 58#include "ih264e_debug.h" 59#include "ih264_inter_pred_filters.h" 60#include "ih264_mem_fns.h" 61#include "ih264_padding.h" 62#include "ih264_intra_pred_filters.h" 63#include "ih264_deblk_edge_filters.h" 64 65 66/*****************************************************************************/ 67/* Function Definitions */ 68/*****************************************************************************/ 69/* 70******************************************************************************* 71* 72* @brief 73* Interprediction luma filter for horizontal input(Filter run for width = 17 74* and height =16) 75* 76* @par Description: 77* Applies a 6 tap horizontal filter .The output is clipped to 8 bits sec. 
78* 8.4.2.2.1 titled "Luma sample interpolation process" 79* 80* @param[in] pu1_src 81* UWORD8 pointer to the source 82* 83* @param[out] pu1_dst 84* UWORD8 pointer to the destination 85* 86* @param[in] src_strd 87* integer source stride 88* 89* @param[in] dst_strd 90* integer destination stride 91* 92* @returns 93* None 94* 95* @remarks 96* None 97* 98******************************************************************************* 99*/ 100void ih264e_sixtapfilter_horz_ssse3(UWORD8 *pu1_src, 101 UWORD8 *pu1_dst, 102 WORD32 src_strd, 103 WORD32 dst_strd) 104{ 105 WORD32 ht; 106 WORD32 tmp; 107 108 __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b; 109 __m128i src_r0_t1_16x8b, src_r1_t1_16x8b; 110 111 __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b; 112 __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b; 113 114 __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; 115 __m128i const_val16_8x16b; 116 117 ht = 16; 118 pu1_src -= 2; // the filter input starts from x[-2] (till x[3]) 119 120 coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 121 coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 122 coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 123 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 124 const_val16_8x16b = _mm_set1_epi16(16); 125 126 //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... 127 //Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... 128 //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels. 
129 130 do 131 { 132 src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 133 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 134 135 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 136 src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 137 138 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 139 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 140 141 res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 142 //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 143 res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 144 //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1 145 146 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 147 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0 148 149 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 150 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0 151 152 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 153 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10 154 155 res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 156 //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 157 res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3 158 //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 
b9*c2+b10*c3 159 160 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0 161 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0 162 163 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 164 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0 165 166 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 167 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12 168 169 res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 170 //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 171 res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5 172 //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5 173 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b); 174 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b); 175 res_r0_t3_8x16b = _mm_add_epi16(res_r0_t3_8x16b, const_val16_8x16b); 176 res_r1_t3_8x16b = _mm_add_epi16(res_r1_t3_8x16b, const_val16_8x16b); 177 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b); 178 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b); 179 180 tmp = ((pu1_src[18] + pu1_src[19]) << 2) - pu1_src[17] - pu1_src[20]; 181 tmp = pu1_src[16] + pu1_src[21] + (tmp << 2) + tmp; 182 183 res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5); //shifting right by 5 bits. 
184 res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5); 185 tmp = (tmp + 16) >> 5; 186 187 src_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b); 188 pu1_dst[16] = CLIP_U8(tmp); 189 190 _mm_storeu_si128((__m128i *)pu1_dst, src_r0_16x8b); 191 192 ht--; 193 pu1_src += src_strd; 194 pu1_dst += dst_strd; 195 } 196 while(ht > 0); 197} 198 199/* 200******************************************************************************* 201* 202* @brief 203* This function implements a two stage cascaded six tap filter. It 204* applies the six tap filter in the vertical direction on the 205* predictor values, followed by applying the same filter in the 206* horizontal direction on the output of the first stage. The six tap 207* filtering operation is described in sec 8.4.2.2.1 titled "Luma sample 208* interpolation process" (Filter run for width = 17 and height =17) 209* 210* @par Description: 211* The function interpolates the predictors first in the vertical direction 212* and then in the horizontal direction to output the (1/2,1/2). The output 213* of the first stage of the filter is stored in the buffer pointed to by 214* pi16_pred1(only in C) in 16 bit precision. 
215* 216* @param[in] pu1_src 217* UWORD8 pointer to the source 218* 219* @param[out] pu1_dst1 220* UWORD8 pointer to the destination(Vertical filtered output) 221* 222* @param[out] pu1_dst2 223* UWORD8 pointer to the destination(out put after applying horizontal filter 224* to the intermediate vertical output) 225* 226* @param[in] src_strd 227* integer source stride 228 229* @param[in] dst_strd 230* integer destination stride of pu1_dst 231* 232* @param[in]pi16_pred1 233* Pointer to 16bit intermediate buffer(used only in c) 234* 235* @param[in] pi16_pred1_strd 236* integer destination stride of pi16_pred1 237* 238* @returns 239* None 240* 241* @remarks 242* None 243* 244******************************************************************************* 245*/ 246void ih264e_sixtap_filter_2dvh_vert_ssse3(UWORD8 *pu1_src, 247 UWORD8 *pu1_dst1, 248 UWORD8 *pu1_dst2, 249 WORD32 src_strd, 250 WORD32 dst_strd, 251 WORD32 *pi4_pred1, 252 WORD32 pred1_strd) 253{ 254 WORD32 ht; 255 WORD16 *pi2_pred1; 256 257 ht = 17; 258 pi2_pred1 = (WORD16 *)pi4_pred1; 259 pred1_strd = pred1_strd << 1; 260 261 // Vertical 6-tap filter 262 { 263 __m128i src1_r0_16x8b, src1_r1_16x8b, src1_r2_16x8b; 264 __m128i src1_r3_16x8b, src1_r4_16x8b, src1_r5_16x8b; 265 __m128i src2_r0_16x8b, src2_r1_16x8b, src2_r2_16x8b; 266 __m128i src2_r3_16x8b, src2_r4_16x8b, src2_r5_16x8b; 267 268 __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b; 269 270 __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b; 271 __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; 272 273 coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 274 coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 275 coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 276 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 277 278 pu1_src -= 2; 279 pu1_src -= src_strd << 1; // the filter input starts from x[-2] (till x[3]) 280 281 // 
Loading first five rows to start first row processing. 282 // 22 values loaded in each row. 283 src1_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); 284 src2_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14)); 285 pu1_src += src_strd; 286 287 src1_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src); 288 src2_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14)); 289 pu1_src += src_strd; 290 291 src1_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src); 292 src2_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14)); 293 pu1_src += src_strd; 294 295 src1_r3_16x8b = _mm_loadu_si128((__m128i *)pu1_src); 296 src2_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14)); 297 pu1_src += src_strd; 298 299 src1_r4_16x8b = _mm_loadu_si128((__m128i *)pu1_src); 300 src2_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14)); 301 pu1_src += src_strd; 302 303 do 304 { 305 src1_r5_16x8b = _mm_loadu_si128((__m128i *)pu1_src); 306 src2_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14)); 307 308 src_r0r1_16x8b = _mm_unpacklo_epi8(src1_r0_16x8b, src1_r1_16x8b); 309 src_r2r3_16x8b = _mm_unpacklo_epi8(src1_r2_16x8b, src1_r3_16x8b); 310 src_r4r5_16x8b = _mm_unpacklo_epi8(src1_r4_16x8b, src1_r5_16x8b); 311 312 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); 313 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); 314 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); 315 316 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); 317 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); 318 319 _mm_storeu_si128((__m128i *)pi2_pred1, res_t1_8x16b); 320 321 src_r0r1_16x8b = _mm_unpackhi_epi8(src1_r0_16x8b, src1_r1_16x8b); 322 src_r2r3_16x8b = _mm_unpackhi_epi8(src1_r2_16x8b, src1_r3_16x8b); 323 src_r4r5_16x8b = _mm_unpackhi_epi8(src1_r4_16x8b, src1_r5_16x8b); 324 325 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); 326 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); 327 res_t3_8x16b = 
_mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); 328 329 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); 330 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); 331 332 _mm_storeu_si128((__m128i *)(pi2_pred1 + 8), res_t1_8x16b); 333 334 src_r0r1_16x8b = _mm_unpacklo_epi8(src2_r0_16x8b, src2_r1_16x8b); 335 src_r2r3_16x8b = _mm_unpacklo_epi8(src2_r2_16x8b, src2_r3_16x8b); 336 src_r4r5_16x8b = _mm_unpacklo_epi8(src2_r4_16x8b, src2_r5_16x8b); 337 338 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); 339 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); 340 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); 341 342 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); 343 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); 344 345 _mm_storeu_si128((__m128i *)(pi2_pred1 + 14), res_t1_8x16b); 346 347 src1_r0_16x8b = src1_r1_16x8b; 348 src1_r1_16x8b = src1_r2_16x8b; 349 src1_r2_16x8b = src1_r3_16x8b; 350 src1_r3_16x8b = src1_r4_16x8b; 351 src1_r4_16x8b = src1_r5_16x8b; 352 353 src2_r0_16x8b = src2_r1_16x8b; 354 src2_r1_16x8b = src2_r2_16x8b; 355 src2_r2_16x8b = src2_r3_16x8b; 356 src2_r3_16x8b = src2_r4_16x8b; 357 src2_r4_16x8b = src2_r5_16x8b; 358 359 ht--; 360 pu1_src += src_strd; 361 pi2_pred1 += pred1_strd; 362 } 363 while(ht > 0); 364 } 365 366 ht = 17; 367 pi2_pred1 = (WORD16 *)pi4_pred1; 368 369 // Horizontal 6-tap filter 370 { 371 WORD32 temp; 372 373 __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b; 374 __m128i src_r4_8x16b, src_r5_8x16b; 375 __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b; 376 __m128i res_vert1_8x16b, res_vert2_8x16b, res_16x8b; 377 378 __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b; 379 __m128i res_c0_8x16b, res_c1_8x16b; 380 381 __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b; 382 __m128i const_val512_4x32b, const_val16_8x16b; 383 384 coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); //c0 c1 c0 c1 c0 c1 c0 c1 385 coeff2_3_8x16b = 
_mm_set1_epi32(0x00140014); //c2 c3 c2 c3 c2 c3 c2 c3 386 coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); //c4 c5 c4 c5 c4 c5 c4 c5 387 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 388 const_val512_4x32b = _mm_set1_epi32(512); 389 const_val16_8x16b = _mm_set1_epi16(16); 390 391 do 392 { 393 src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1)); 394 src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 1)); 395 src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 2)); 396 src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 3)); 397 src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 4)); 398 src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 5)); 399 400 res_vert1_8x16b = _mm_add_epi16(src_r2_8x16b, const_val16_8x16b); 401 res_vert1_8x16b = _mm_srai_epi16(res_vert1_8x16b, 5); //shifting right by 5 bits. 402 403 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); 404 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); 405 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); 406 407 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 408 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 409 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 410 411 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 412 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 413 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 414 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 415 416 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b); 417 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b); 418 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b); 419 420 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 421 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 422 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 423 424 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 425 res_t3_4x32b = 
_mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 426 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 427 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 428 429 res_c0_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); 430 431 src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8)); 432 src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 1)); 433 src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 2)); 434 src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 3)); 435 src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 4)); 436 src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 5)); 437 438 res_vert2_8x16b = _mm_add_epi16(src_r2_8x16b, const_val16_8x16b); 439 res_vert2_8x16b = _mm_srai_epi16(res_vert2_8x16b, 5); //shifting right by 5 bits. 440 441 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); 442 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); 443 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); 444 445 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 446 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 447 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 448 449 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 450 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 451 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 452 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b ,10); 453 454 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b); 455 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b); 456 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b); 457 458 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 459 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 460 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 461 462 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 463 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, 
const_val512_4x32b); 464 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 465 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 466 467 res_c1_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); 468 469 res_16x8b = _mm_packus_epi16(res_vert1_8x16b, res_vert2_8x16b); 470 _mm_storeu_si128((__m128i *)pu1_dst1, res_16x8b); 471 pu1_dst1[16] = CLIP_U8((pi2_pred1[18] + 16) >> 5); 472 473 res_16x8b = _mm_packus_epi16(res_c0_8x16b, res_c1_8x16b); 474 _mm_storeu_si128((__m128i *)pu1_dst2, res_16x8b); 475 temp = ((pi2_pred1[18] + pi2_pred1[19]) << 2) - pi2_pred1[17] - pi2_pred1[20]; 476 temp = pi2_pred1[16] + pi2_pred1[21] + (temp << 2) + temp; 477 pu1_dst2[16] = CLIP_U8((temp + 512) >> 10); 478 479 ht--; 480 pi2_pred1 += pred1_strd; 481 pu1_dst1 += dst_strd; 482 pu1_dst2 += dst_strd; 483 } 484 while(ht > 0); 485 } 486} 487