1/****************************************************************************** 2 * 3 * Copyright (C) 2015 The Android Open Source Project 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ***************************************************************************** 18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19*/ 20/*****************************************************************************/ 21/* */ 22/* File Name : ih264_weighted_pred_intr_sse42.c */ 23/* */ 24/* Description : Contains function definitions for weighted */ 25/* prediction functions in x86 sse4 intrinsics */ 26/* */ 27/* List of Functions : ih264_default_weighted_pred_luma_sse42() */ 28/* ih264_default_weighted_pred_chroma_sse42() */ 29/* ih264_weighted_pred_luma_sse42() */ 30/* ih264_weighted_pred_chroma_sse42() */ 31/* ih264_weighted_bipred_luma_sse42() */ 32/* ih264_weighted_bipred_chroma_sse42() */ 33/* */ 34/* Issues / Problems : None */ 35/* */ 36/* Revision History : */ 37/* */ 38/* DD MM YYYY Author(s) Changes */ 39/* 30 01 2015 Kaushik Initial version */ 40/* Senthoor */ 41/* */ 42/*****************************************************************************/ 43/*****************************************************************************/ 44/* File Includes */ 45/*****************************************************************************/ 46 47#include <immintrin.h> 48#include "ih264_typedefs.h" 49#include "ih264_macros.h" 50#include "ih264_platform_macros.h" 51#include "ih264_weighted_pred.h" 52 53/*****************************************************************************/ 54/* Function definitions . */ 55/*****************************************************************************/ 56/*****************************************************************************/ 57/* */ 58/* Function Name : ih264_default_weighted_pred_luma_sse42 */ 59/* */ 60/* Description : This function performs the default weighted prediction */ 61/* as described in sec 8.4.2.3.1 titled "Default weighted */ 62/* sample prediction process" for luma. The function gets */ 63/* two ht x wd blocks, calculates their rounded-average and */ 64/* stores it in the destination block. (ht,wd) can be */ 65/* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */ 66/* */ 67/* Inputs : pu1_src1 - Pointer to source 1 */ 68/* pu1_src2 - Pointer to source 2 */ 69/* pu1_dst - Pointer to destination */ 70/* src_strd1 - stride for source 1 */ 71/* src_strd1 - stride for source 2 */ 72/* dst_strd - stride for destination */ 73/* ht - height of the block */ 74/* wd - width of the block */ 75/* */ 76/* Issues : None */ 77/* */ 78/* Revision History: */ 79/* */ 80/* DD MM YYYY Author(s) Changes */ 81/* 04 02 2015 Kaushik Initial Version */ 82/* Senthoor */ 83/* */ 84/*****************************************************************************/ 85void ih264_default_weighted_pred_luma_sse42(UWORD8 *pu1_src1, 86 UWORD8 *pu1_src2, 87 UWORD8 *pu1_dst, 88 WORD32 src_strd1, 89 WORD32 src_strd2, 90 WORD32 dst_strd, 91 WORD32 ht, 92 WORD32 wd) 93{ 94 __m128i y0_0_16x8b, y0_1_16x8b, y0_2_16x8b, y0_3_16x8b; 95 __m128i y1_0_16x8b, y1_1_16x8b, y1_2_16x8b, y1_3_16x8b; 96 97 if(wd == 4) 98 { 99 do 100 { 101 y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); 102 y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1)); 103 y0_2_16x8b = _mm_loadl_epi64( 104 (__m128i *)(pu1_src1 + (src_strd1 << 1))); 105 y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3)); 106 107 y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2); 108 y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2)); 109 y1_2_16x8b = _mm_loadl_epi64( 110 (__m128i *)(pu1_src2 + (src_strd2 << 1))); 111 y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3)); 112 113 y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b); 114 y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b); 115 y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b); 116 y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b); 117 118 *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y0_0_16x8b); 119 *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y0_1_16x8b); 120 *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y0_2_16x8b); 121 *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y0_3_16x8b); 122 123 ht -= 4; 124 pu1_src1 += src_strd1 << 2; 125 pu1_src2 += src_strd2 << 2; 126 pu1_dst += dst_strd << 2; 127 } 128 while(ht > 0); 129 } 130 else if(wd == 8) 131 { 132 do 133 { 134 y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); 135 y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1)); 136 y0_2_16x8b = _mm_loadl_epi64( 137 (__m128i *)(pu1_src1 + (src_strd1 << 1))); 138 y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3)); 139 140 y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2); 141 y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2)); 142 y1_2_16x8b = _mm_loadl_epi64( 143 (__m128i *)(pu1_src2 + (src_strd2 << 1))); 144 y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3)); 145 146 y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b); 147 y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b); 148 y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b); 149 y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b); 150 151 _mm_storel_epi64((__m128i *)pu1_dst, y0_0_16x8b); 152 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b); 153 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b); 154 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b); 155 156 ht -= 4; 157 pu1_src1 += src_strd1 << 2; 158 pu1_src2 += src_strd2 << 2; 159 pu1_dst += dst_strd << 2; 160 } 161 while(ht > 0); 162 } 163 else // wd == 16 164 { 165 __m128i y0_4_16x8b, y0_5_16x8b, y0_6_16x8b, y0_7_16x8b; 166 __m128i y1_4_16x8b, y1_5_16x8b, y1_6_16x8b, y1_7_16x8b; 167 168 do 169 { 170 y0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1); 171 y0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1)); 172 y0_2_16x8b = _mm_loadu_si128( 173 (__m128i *)(pu1_src1 + (src_strd1 << 1))); 174 y0_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 3)); 175 y0_4_16x8b = _mm_loadu_si128( 176 (__m128i *)(pu1_src1 + (src_strd1 << 2))); 177 y0_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 5)); 178 y0_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 6)); 179 y0_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 7)); 180 181 y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2); 182 y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2)); 183 y1_2_16x8b = _mm_loadu_si128( 184 (__m128i *)(pu1_src2 + (src_strd2 << 1))); 185 y1_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 3)); 186 y1_4_16x8b = _mm_loadu_si128( 187 (__m128i *)(pu1_src2 + (src_strd2 << 2))); 188 y1_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 5)); 189 y1_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 6)); 190 y1_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 7)); 191 192 y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b); 193 y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b); 194 y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b); 195 y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b); 196 y0_4_16x8b = _mm_avg_epu8(y0_4_16x8b, y1_4_16x8b); 197 y0_5_16x8b = _mm_avg_epu8(y0_5_16x8b, y1_5_16x8b); 198 y0_6_16x8b = _mm_avg_epu8(y0_6_16x8b, y1_6_16x8b); 199 y0_7_16x8b = _mm_avg_epu8(y0_7_16x8b, y1_7_16x8b); 200 201 _mm_storeu_si128((__m128i *)pu1_dst, y0_0_16x8b); 202 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b); 203 _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b); 204 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b); 205 _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 2)), y0_4_16x8b); 206 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 5), y0_5_16x8b); 207 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 6), y0_6_16x8b); 208 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 7), y0_7_16x8b); 209 210 ht -= 8; 211 pu1_src1 += src_strd1 << 3; 212 pu1_src2 += src_strd2 << 3; 213 pu1_dst += dst_strd << 3; 214 } 215 while(ht > 0); 216 } 217} 218 219/*****************************************************************************/ 220/* */ 221/* Function Name : ih264_default_weighted_pred_chroma_sse42 */ 222/* */ 223/* Description : This function performs the default weighted prediction */ 224/* as described in sec 8.4.2.3.1 titled "Default weighted */ 225/* sample prediction process" for chroma. The function gets */ 226/* two ht x wd blocks, calculates their rounded-average and */ 227/* stores it in the destination block. (ht,wd) can be */ 228/* (2,2), (4,2) , (2,4), (4,4), (8,4), (4,8) or (8,8). */ 229/* */ 230/* Inputs : pu1_src1 - Pointer to source 1 */ 231/* pu1_src2 - Pointer to source 2 */ 232/* pu1_dst - Pointer to destination */ 233/* src_strd1 - stride for source 1 */ 234/* src_strd1 - stride for source 2 */ 235/* dst_strd - stride for destination */ 236/* ht - height of the block */ 237/* wd - width of the block */ 238/* */ 239/* Issues : None */ 240/* */ 241/* Revision History: */ 242/* */ 243/* DD MM YYYY Author(s) Changes */ 244/* 04 02 2015 Kaushik Initial Version */ 245/* Senthoor */ 246/* */ 247/*****************************************************************************/ 248void ih264_default_weighted_pred_chroma_sse42(UWORD8 *pu1_src1, 249 UWORD8 *pu1_src2, 250 UWORD8 *pu1_dst, 251 WORD32 src_strd1, 252 WORD32 src_strd2, 253 WORD32 dst_strd, 254 WORD32 ht, 255 WORD32 wd) 256{ 257 __m128i uv0_0_16x8b, uv0_1_16x8b; 258 __m128i uv1_0_16x8b, uv1_1_16x8b; 259 260 if(wd == 2) 261 { 262 do 263 { 264 uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); 265 uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1)); 266 267 uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2); 268 uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2)); 269 270 uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b); 271 uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b); 272 273 *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(uv0_0_16x8b); 274 *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(uv0_1_16x8b); 275 276 ht -= 2; 277 pu1_src1 += src_strd1 << 1; 278 pu1_src2 += src_strd2 << 1; 279 pu1_dst += dst_strd << 1; 280 } 281 while(ht > 0); 282 } 283 else if(wd == 4) 284 { 285 do 286 { 287 uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); 288 uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1)); 289 290 uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2); 291 uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2)); 292 293 uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b); 294 uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b); 295 296 _mm_storel_epi64((__m128i *)pu1_dst, uv0_0_16x8b); 297 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b); 298 299 ht -= 2; 300 pu1_src1 += src_strd1 << 1; 301 pu1_src2 += src_strd2 << 1; 302 pu1_dst += dst_strd << 1; 303 } 304 while(ht > 0); 305 } 306 else // wd == 8 307 { 308 __m128i uv0_2_16x8b, uv0_3_16x8b; 309 __m128i uv1_2_16x8b, uv1_3_16x8b; 310 311 do 312 { 313 uv0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1); 314 uv0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1)); 315 uv0_2_16x8b = _mm_loadu_si128( 316 (__m128i *)(pu1_src1 + (src_strd1 << 1))); 317 uv0_3_16x8b = _mm_loadu_si128( 318 (__m128i *)(pu1_src1 + src_strd1 * 3)); 319 320 uv1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2); 321 uv1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2)); 322 uv1_2_16x8b = _mm_loadu_si128( 323 (__m128i *)(pu1_src2 + (src_strd2 << 1))); 324 uv1_3_16x8b = _mm_loadu_si128( 325 (__m128i *)(pu1_src2 + src_strd2 * 3)); 326 327 uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b); 328 uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b); 329 uv0_2_16x8b = _mm_avg_epu8(uv0_2_16x8b, uv1_2_16x8b); 330 uv0_3_16x8b = _mm_avg_epu8(uv0_3_16x8b, uv1_3_16x8b); 331 332 _mm_storeu_si128((__m128i *)pu1_dst, uv0_0_16x8b); 333 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b); 334 _mm_storeu_si128( 335 (__m128i *)(pu1_dst + (dst_strd << 1)), uv0_2_16x8b); 336 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), uv0_3_16x8b); 337 338 ht -= 4; 339 pu1_src1 += src_strd1 << 2; 340 pu1_src2 += src_strd2 << 2; 341 pu1_dst += dst_strd << 2; 342 } 343 while(ht > 0); 344 } 345} 346 347/*****************************************************************************/ 348/* */ 349/* Function Name : ih264_weighted_pred_luma_sse42 */ 350/* */ 351/* Description : This function performs the weighted prediction as */ 352/* described in sec 8.4.2.3.2 titled "Weighted sample */ 353/* prediction process" for luma. The function gets one */ 354/* ht x wd block, weights it, rounds it off, offsets it, */ 355/* saturates it to unsigned 8-bit and stores it in the */ 356/* destination block. (ht,wd) can be (4,4), (8,4), (4,8), */ 357/* (8,8), (16,8), (8,16) or (16,16). */ 358/* */ 359/* Inputs : pu1_src - Pointer to source */ 360/* pu1_dst - Pointer to destination */ 361/* src_strd - stride for source */ 362/* dst_strd - stride for destination */ 363/* log_wd - number of bits to be rounded off */ 364/* wt - weight value */ 365/* ofst - offset value */ 366/* ht - height of the block */ 367/* wd - width of the block */ 368/* */ 369/* Issues : None */ 370/* */ 371/* Revision History: */ 372/* */ 373/* DD MM YYYY Author(s) Changes */ 374/* 04 02 2015 Kaushik Initial Version */ 375/* Senthoor */ 376/* */ 377/*****************************************************************************/ 378void ih264_weighted_pred_luma_sse42(UWORD8 *pu1_src, 379 UWORD8 *pu1_dst, 380 WORD32 src_strd, 381 WORD32 dst_strd, 382 WORD32 log_wd, 383 WORD32 wt, 384 WORD32 ofst, 385 WORD32 ht, 386 WORD32 wd) 387{ 388 __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b; 389 390 __m128i wt_8x16b, round_8x16b, ofst_8x16b; 391 392 WORD32 round_val; 393 394 wt = (WORD16)(wt & 0xffff); 395 round_val = 1 << (log_wd - 1); 396 ofst = (WORD8)(ofst & 0xff); 397 398 wt_8x16b = _mm_set1_epi16(wt); 399 round_8x16b = _mm_set1_epi16(round_val); 400 ofst_8x16b = _mm_set1_epi16(ofst); 401 402 if(wd == 4) 403 { 404 __m128i y_0_8x16b, y_2_8x16b; 405 406 do 407 { 408 y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); 409 y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); 410 y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1))); 411 y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3)); 412 413 y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b); 414 y_2_16x8b = _mm_unpacklo_epi32(y_2_16x8b, y_3_16x8b); 415 416 y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b); 417 y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b); 418 419 y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b); 420 y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b); 421 422 y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b); 423 y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b); 424 425 y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd); 426 y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd); 427 428 y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b); 429 y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b); 430 431 y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_2_8x16b); 432 y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4); 433 y_2_16x8b = _mm_srli_si128(y_0_16x8b, 8); 434 y_3_16x8b = _mm_srli_si128(y_0_16x8b, 12); 435 436 *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b); 437 *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b); 438 *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y_2_16x8b); 439 *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y_3_16x8b); 440 441 ht -= 4; 442 pu1_src += src_strd << 2; 443 pu1_dst += dst_strd << 2; 444 } 445 while(ht > 0); 446 } 447 else if(wd == 8) 448 { 449 __m128i y_0_8x16b, y_1_8x16b, y_2_8x16b, y_3_8x16b; 450 451 do 452 { 453 y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); 454 y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); 455 y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1))); 456 y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3)); 457 458 y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b); 459 y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b); 460 y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b); 461 y_3_8x16b = _mm_cvtepu8_epi16(y_3_16x8b); 462 463 y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b); 464 y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b); 465 y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b); 466 y_3_8x16b = _mm_mullo_epi16(y_3_8x16b, wt_8x16b); 467 468 y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b); 469 y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b); 470 y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b); 471 y_3_8x16b = _mm_adds_epi16(round_8x16b, y_3_8x16b); 472 473 y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd); 474 y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd); 475 y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd); 476 y_3_8x16b = _mm_srai_epi16(y_3_8x16b, log_wd); 477 478 y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b); 479 y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b); 480 y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b); 481 y_3_8x16b = _mm_adds_epi16(ofst_8x16b, y_3_8x16b); 482 483 y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b); 484 y_2_16x8b = _mm_packus_epi16(y_2_8x16b, y_3_8x16b); 485 y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8); 486 y_3_16x8b = _mm_srli_si128(y_2_16x8b, 8); 487 488 _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b); 489 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b); 490 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b); 491 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b); 492 493 ht -= 4; 494 pu1_src += src_strd << 2; 495 pu1_dst += dst_strd << 2; 496 } 497 while(ht > 0); 498 } 499 else // wd == 16 500 { 501 __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b; 502 __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b; 503 504 __m128i zero_16x8b; 505 zero_16x8b = _mm_set1_epi8(0); 506 507 do 508 { 509 y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); 510 y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); 511 y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1))); 512 y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3)); 513 514 y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b); 515 y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b); 516 y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b); 517 y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b); 518 y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b); 519 y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b); 520 y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b); 521 y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b); 522 523 y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b); 524 y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b); 525 y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b); 526 y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b); 527 y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b); 528 y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b); 529 y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b); 530 y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b); 531 532 y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b); 533 y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b); 534 y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b); 535 y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b); 536 y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b); 537 y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b); 538 y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b); 539 y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b); 540 541 y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd); 542 y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd); 543 y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd); 544 y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd); 545 y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd); 546 y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd); 547 y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd); 548 y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd); 549 550 y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b); 551 y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b); 552 y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b); 553 y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b); 554 y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b); 555 y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b); 556 y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b); 557 y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b); 558 559 y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b); 560 y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b); 561 y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b); 562 y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b); 563 564 _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b); 565 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b); 566 _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b); 567 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b); 568 569 ht -= 4; 570 pu1_src += src_strd << 2; 571 pu1_dst += dst_strd << 2; 572 } 573 while(ht > 0); 574 } 575} 576 577/*****************************************************************************/ 578/* */ 579/* Function Name : ih264_weighted_pred_chroma_sse42 */ 580/* */ 581/* Description : This function performs the weighted prediction as */ 582/* described in sec 8.4.2.3.2 titled "Weighted sample */ 583/* prediction process" for chroma. The function gets one */ 584/* ht x wd block, weights it, rounds it off, offsets it, */ 585/* saturates it to unsigned 8-bit and stores it in the */ 586/* destination block. (ht,wd) can be (2,2), (4,2), (2,4), */ 587/* (4,4), (8,4), (4,8) or (8,8). */ 588/* */ 589/* Inputs : pu1_src - Pointer to source */ 590/* pu1_dst - Pointer to destination */ 591/* src_strd - stride for source */ 592/* dst_strd - stride for destination */ 593/* log_wd - number of bits to be rounded off */ 594/* wt - weight values for u and v */ 595/* ofst - offset values for u and v */ 596/* ht - height of the block */ 597/* wd - width of the block */ 598/* */ 599/* Issues : None */ 600/* */ 601/* Revision History: */ 602/* */ 603/* DD MM YYYY Author(s) Changes */ 604/* 04 02 2015 Kaushik Initial Version */ 605/* Senthoor */ 606/* */ 607/*****************************************************************************/ 608void ih264_weighted_pred_chroma_sse42(UWORD8 *pu1_src, 609 UWORD8 *pu1_dst, 610 WORD32 src_strd, 611 WORD32 dst_strd, 612 WORD32 log_wd, 613 WORD32 wt, 614 WORD32 ofst, 615 WORD32 ht, 616 WORD32 wd) 617{ 618 __m128i y_0_16x8b, y_1_16x8b; 619 620 __m128i wt_8x16b, round_8x16b, ofst_8x16b; 621 622 WORD32 ofst_u, ofst_v; 623 WORD32 round_val; 624 625 ofst_u = (WORD8)(ofst & 0xff); 626 ofst_v = (WORD8)(ofst >> 8); 627 round_val = 1 << (log_wd - 1); 628 ofst = (ofst_u & 0xffff) | (ofst_v << 16); 629 630 wt_8x16b = _mm_set1_epi32(wt); 631 round_8x16b = _mm_set1_epi16(round_val); 632 ofst_8x16b = _mm_set1_epi32(ofst); 633 634 if(wd == 2) 635 { 636 __m128i y_0_8x16b; 637 638 do 639 { 640 y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); 641 y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); 642 643 y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b); 644 645 y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b); 646 647 y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b); 648 649 y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b); 650 651 y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd); 652 653 y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b); 654 655 y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_0_8x16b); 656 y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4); 657 658 *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b); 659 *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b); 660 661 ht -= 2; 662 pu1_src += src_strd << 1; 663 pu1_dst += dst_strd << 1; 664 } 665 while(ht > 0); 666 } 667 else if(wd == 4) 668 { 669 __m128i y_0_8x16b, y_1_8x16b; 670 671 do 672 { 673 y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); 674 y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); 675 676 y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b); 677 y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b); 678 679 y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b); 680 y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b); 681 682 y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b); 683 y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b); 684 685 y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd); 686 y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd); 687 688 y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b); 689 y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b); 690 691 y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b); 692 y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8); 693 694 _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b); 695 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b); 696 697 ht -= 2; 698 pu1_src += src_strd << 1; 699 pu1_dst += dst_strd << 1; 700 } 701 while(ht > 0); 702 } 703 else // wd == 16 704 { 705 __m128i y_2_16x8b, y_3_16x8b; 706 __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b; 707 __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b; 708 709 __m128i zero_16x8b; 710 zero_16x8b = _mm_set1_epi8(0); 711 712 do 713 { 714 y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); 715 y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); 716 y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1))); 717 y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3)); 718 719 y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b); 720 y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b); 721 y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b); 722 y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b); 723 y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b); 724 y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b); 725 y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b); 726 y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b); 727 728 y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b); 729 y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b); 730 y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b); 731 y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b); 732 y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b); 733 y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b); 734 y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b); 735 y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b); 736 737 y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b); 738 y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b); 739 y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b); 740 y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b); 741 y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b); 742 y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b); 743 y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b); 744 y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b); 745 746 y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd); 747 y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd); 748 y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd); 749 y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd); 750 y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd); 751 y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd); 752 y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd); 753 y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd); 754 755 y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b); 756 y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b); 757 y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b); 758 y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b); 759 y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b); 760 y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b); 761 y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b); 762 y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b); 763 764 y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b); 765 y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b); 766 y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b); 767 y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b); 768 769 _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b); 770 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b); 771 _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b); 772 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b); 773 774 ht -= 4; 775 pu1_src += src_strd << 2; 776 pu1_dst += dst_strd << 2; 777 } 778 while(ht > 0); 779 } 780} 781 782/*****************************************************************************/ 783/* */ 784/* Function Name : ih264_weighted_bi_pred_luma_sse42 */ 785/* */ 786/* Description : This function performs the weighted biprediction as */ 787/* described in sec 8.4.2.3.2 titled "Weighted sample */ 788/* prediction process" for luma. The function gets two */ 789/* ht x wd blocks, weights them, adds them, rounds off the */ 790/* sum, offsets it, saturates it to unsigned 8-bit and */ 791/* stores it in the destination block. (ht,wd) can be */ 792/* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */ 793/* */ 794/* Inputs : pu1_src1 - Pointer to source 1 */ 795/* pu1_src2 - Pointer to source 2 */ 796/* pu1_dst - Pointer to destination */ 797/* src_strd1 - stride for source 1 */ 798/* src_strd2 - stride for source 2 */ 799/* dst_strd2 - stride for destination */ 800/* log_wd - number of bits to be rounded off */ 801/* wt1 - weight value for source 1 */ 802/* wt2 - weight value for source 2 */ 803/* ofst1 - offset value for source 1 */ 804/* ofst2 - offset value for source 2 */ 805/* ht - height of the block */ 806/* wd - width of the block */ 807/* */ 808/* Issues : None */ 809/* */ 810/* Revision History: */ 811/* */ 812/* DD MM YYYY Author(s) Changes */ 813/* 04 02 2015 Kaushik Initial Version */ 814/* Senthoor */ 815/* */ 816/*****************************************************************************/ 817void ih264_weighted_bi_pred_luma_sse42(UWORD8 *pu1_src1, 818 UWORD8 *pu1_src2, 819 UWORD8 *pu1_dst, 820 WORD32 src_strd1, 821 WORD32 src_strd2, 822 WORD32 dst_strd, 823 WORD32 log_wd, 824 WORD32 wt1, 825 WORD32 wt2, 826 WORD32 ofst1, 827 WORD32 ofst2, 828 WORD32 ht, 829 WORD32 wd) 830{ 831 __m128i y1_0_16x8b, y1_1_16x8b; 832 __m128i y2_0_16x8b, y2_1_16x8b; 833 834 __m128i wt1_8x16b, wt2_8x16b; 835 __m128i ofst_8x16b, round_8x16b; 836 837 WORD32 ofst; 838 WORD32 round_val, shft; 839 840 wt1 = (WORD16)(wt1 & 0xffff); 841 wt2 = (WORD16)(wt2 & 0xffff); 842 round_val = 1 << log_wd; 843 shft = log_wd + 1; 844 ofst1 = (WORD8)(ofst1 & 0xff); 845 ofst2 = (WORD8)(ofst2 & 0xff); 846 ofst = (ofst1 + ofst2 + 1) >> 1; 847 848 wt1_8x16b = _mm_set1_epi16(wt1); 849 wt2_8x16b = _mm_set1_epi16(wt2); 850 round_8x16b = _mm_set1_epi16(round_val); 851 ofst_8x16b = _mm_set1_epi16(ofst); 852 853 if(wd == 4) 854 { 855 __m128i y1_2_16x8b, y1_3_16x8b; 856 __m128i y2_2_16x8b, y2_3_16x8b; 857 858 __m128i y1_0_8x16b, y1_2_8x16b; 859 __m128i y2_0_8x16b, y2_2_8x16b; 860 861 do 862 { 863 y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); 864 y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1)); 865 y1_2_16x8b = _mm_loadl_epi64( 866 (__m128i *)(pu1_src1 + (src_strd1 << 1))); 867 y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3)); 868 869 y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2); 870 y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2)); 871 y2_2_16x8b = _mm_loadl_epi64( 872 (__m128i *)(pu1_src2 + (src_strd2 << 1))); 873 y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3)); 874 875 y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b); 876 y1_2_16x8b = _mm_unpacklo_epi32(y1_2_16x8b, y1_3_16x8b); 877 y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b); 878 y2_2_16x8b = _mm_unpacklo_epi32(y2_2_16x8b, y2_3_16x8b); 879 880 y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b); 881 y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b); 882 y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b); 883 y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b); 884 885 y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b); 886 y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b); 887 y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b); 888 y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b); 889 890 y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b); 891 y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b); 892 893 y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b); 894 y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b); 895 896 y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft); 897 y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft); 898 899 y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b); 900 y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b); 901 902 y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_2_8x16b); 903 y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4); 904 y1_2_16x8b = _mm_srli_si128(y1_0_16x8b, 8); 905 y1_3_16x8b = _mm_srli_si128(y1_0_16x8b, 12); 906 907 *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b); 908 *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b); 909 *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y1_2_16x8b); 910 *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y1_3_16x8b); 911 912 913 ht -= 4; 914 pu1_src1 += src_strd1 << 2; 915 pu1_src2 += src_strd2 << 2; 916 pu1_dst += dst_strd << 2; 917 } 918 while(ht > 0); 919 } 920 else if(wd == 8) 921 { 922 __m128i y1_2_16x8b, y1_3_16x8b; 923 __m128i y2_2_16x8b, y2_3_16x8b; 924 925 __m128i y1_0_8x16b, y1_1_8x16b, y1_2_8x16b, y1_3_8x16b; 926 __m128i y2_0_8x16b, y2_1_8x16b, y2_2_8x16b, y2_3_8x16b; 927 928 do 929 { 930 y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); 931 y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1)); 932 y1_2_16x8b = _mm_loadl_epi64( 933 (__m128i *)(pu1_src1 + (src_strd1 << 1))); 934 y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3)); 935 936 y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2); 937 y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2)); 938 y2_2_16x8b = _mm_loadl_epi64( 939 (__m128i *)(pu1_src2 + (src_strd2 << 1))); 940 y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3)); 941 942 y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b); 943 y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b); 944 y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b); 945 y1_3_8x16b = _mm_cvtepu8_epi16(y1_3_16x8b); 946 947 y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b); 948 y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b); 949 y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b); 950 y2_3_8x16b = _mm_cvtepu8_epi16(y2_3_16x8b); 951 952 y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b); 953 y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b); 954 y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b); 955 y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b); 956 957 y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b); 958 y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b); 959 y1_3_8x16b = _mm_mullo_epi16(y1_3_8x16b, wt1_8x16b); 960 y2_3_8x16b = _mm_mullo_epi16(y2_3_8x16b, wt2_8x16b); 961 962 y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b); 963 y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b); 964 y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b); 965 y1_3_8x16b = _mm_adds_epi16(y1_3_8x16b, y2_3_8x16b); 966 967 y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b); 968 y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b); 969 y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b); 970 y1_3_8x16b = _mm_adds_epi16(round_8x16b, y1_3_8x16b); 971 972 y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft); 973 y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft); 974 y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft); 975 y1_3_8x16b = _mm_srai_epi16(y1_3_8x16b, shft); 976 977 y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b); 978 y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b); 979 y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b); 980 y1_3_8x16b = _mm_adds_epi16(ofst_8x16b, y1_3_8x16b); 981 982 y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b); 983 y1_2_16x8b = _mm_packus_epi16(y1_2_8x16b, y1_3_8x16b); 984 y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8); 985 y1_3_16x8b = _mm_srli_si128(y1_2_16x8b, 8); 986 987 _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b); 988 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b); 989 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y1_2_16x8b); 990 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y1_3_16x8b); 991 992 ht -= 4; 993 pu1_src1 += src_strd1 << 2; 994 pu1_src2 += src_strd2 << 2; 995 pu1_dst += dst_strd << 2; 996 } 997 while(ht > 0); 998 } 999 else // wd == 16 1000 { 1001 __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b; 1002 __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b; 1003 1004 __m128i zero_16x8b; 1005 zero_16x8b = _mm_set1_epi8(0); 1006 1007 do 1008 { 1009 y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1); 1010 y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1)); 1011 y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2); 1012 y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2)); 1013 1014 y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b); 1015 y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b); 1016 y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b); 1017 y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b); 1018 1019 y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b); 1020 y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b); 1021 y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b); 1022 y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b); 1023 1024 y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b); 1025 y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b); 1026 y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b); 1027 y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b); 1028 1029 y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b); 1030 y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b); 1031 y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b); 1032 y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b); 1033 1034 y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b); 1035 y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b); 1036 y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b); 1037 y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b); 1038 1039 y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b); 1040 y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b); 1041 y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b); 1042 y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b); 1043 1044 y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft); 1045 y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft); 1046 y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft); 1047 y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft); 1048 1049 y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b); 1050 y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b); 1051 y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b); 1052 y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b); 1053 1054 y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b); 1055 y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b); 1056 1057 _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b); 1058 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b); 1059 1060 ht -= 2; 1061 pu1_src1 += src_strd1 << 1; 1062 pu1_src2 += src_strd2 << 1; 1063 pu1_dst += dst_strd << 1; 1064 } 1065 while(ht > 0); 1066 } 1067} 1068 1069/*****************************************************************************/ 1070/* */ 1071/* Function Name : ih264_weighted_bi_pred_chroma_sse42 */ 1072/* */ 1073/* Description : This function performs the weighted biprediction as */ 1074/* described in sec 8.4.2.3.2 titled "Weighted sample */ 1075/* prediction process" for chroma. The function gets two */ 1076/* ht x wd blocks, weights them, adds them, rounds off the */ 1077/* sum, offsets it, saturates it to unsigned 8-bit and */ 1078/* stores it in the destination block. (ht,wd) can be */ 1079/* (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8). */ 1080/* */ 1081/* Inputs : pu1_src1 - Pointer to source 1 */ 1082/* pu1_src2 - Pointer to source 2 */ 1083/* pu1_dst - Pointer to destination */ 1084/* src_strd1 - stride for source 1 */ 1085/* src_strd2 - stride for source 2 */ 1086/* dst_strd2 - stride for destination */ 1087/* log_wd - number of bits to be rounded off */ 1088/* wt1 - weight values for u and v in source 1 */ 1089/* wt2 - weight values for u and v in source 2 */ 1090/* ofst1 - offset value for u and v in source 1 */ 1091/* ofst2 - offset value for u and v in source 2 */ 1092/* ht - height of the block */ 1093/* wd - width of the block */ 1094/* */ 1095/* Issues : None */ 1096/* */ 1097/* Revision History: */ 1098/* */ 1099/* DD MM YYYY Author(s) Changes */ 1100/* 04 02 2015 Kaushik Initial Version */ 1101/* Senthoor */ 1102/* */ 1103/*****************************************************************************/ 1104void ih264_weighted_bi_pred_chroma_sse42(UWORD8 *pu1_src1, 1105 UWORD8 *pu1_src2, 1106 UWORD8 *pu1_dst, 1107 WORD32 src_strd1, 1108 WORD32 src_strd2, 1109 WORD32 dst_strd, 1110 WORD32 log_wd, 1111 WORD32 wt1, 1112 WORD32 wt2, 1113 WORD32 ofst1, 1114 WORD32 ofst2, 1115 WORD32 ht, 1116 WORD32 wd) 1117{ 1118 __m128i y1_0_16x8b, y1_1_16x8b; 1119 __m128i y2_0_16x8b, y2_1_16x8b; 1120 1121 __m128i wt1_8x16b, wt2_8x16b; 1122 __m128i ofst_8x16b, round_8x16b; 1123 1124 WORD32 ofst1_u, ofst2_u, ofst_u; 1125 WORD32 ofst1_v, ofst2_v, ofst_v; 1126 WORD32 round_val, shft, ofst_val; 1127 1128 round_val = 1 << log_wd; 1129 shft = log_wd + 1; 1130 1131 ofst1_u = (WORD8)(ofst1 & 0xff); 1132 ofst1_v = (WORD8)(ofst1 >> 8); 1133 ofst2_u = (WORD8)(ofst2 & 0xff); 1134 ofst2_v = (WORD8)(ofst2 >> 8); 1135 1136 wt1_8x16b = _mm_set1_epi32(wt1); 1137 wt2_8x16b = _mm_set1_epi32(wt2); 1138 1139 ofst_u = (ofst1_u + ofst2_u + 1) >> 1; 1140 ofst_v = (ofst1_v + ofst2_v + 1) >> 1; 1141 ofst_val = (ofst_u & 0xffff) | (ofst_v << 16); 1142 1143 round_8x16b = _mm_set1_epi16(round_val); 1144 ofst_8x16b = _mm_set1_epi32(ofst_val); 1145 1146 if(wd == 2) 1147 { 1148 __m128i y1_0_8x16b, y2_0_8x16b; 1149 1150 do 1151 { 1152 y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); 1153 y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1)); 1154 1155 y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2); 1156 y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2)); 1157 1158 y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b); 1159 y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b); 1160 1161 y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b); 1162 y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b); 1163 1164 y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b); 1165 y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b); 1166 1167 y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b); 1168 y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b); 1169 1170 y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft); 1171 y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b); 1172 1173 y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b); 1174 y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4); 1175 1176 *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b); 1177 *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b); 1178 1179 ht -= 2; 1180 pu1_src1 += src_strd1 << 1; 1181 pu1_src2 += src_strd2 << 1; 1182 pu1_dst += dst_strd << 1; 1183 } 1184 while(ht > 0); 1185 } 1186 else if(wd == 4) 1187 { 1188 __m128i y1_0_8x16b, y1_1_8x16b; 1189 __m128i y2_0_8x16b, y2_1_8x16b; 1190 1191 do 1192 { 1193 y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); 1194 y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1)); 1195 1196 y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2); 1197 y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2)); 1198 1199 y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b); 1200 y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b); 1201 1202 y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b); 1203 y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b); 1204 1205 y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b); 1206 y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b); 1207 y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b); 1208 y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b); 1209 1210 y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b); 1211 y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b); 1212 1213 y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b); 1214 y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b); 1215 1216 y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft); 1217 y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft); 1218 1219 y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b); 1220 y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b); 1221 1222 y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b); 1223 y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8); 1224 1225 _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b); 1226 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b); 1227 1228 ht -= 2; 1229 pu1_src1 += src_strd1 << 1; 1230 pu1_src2 += src_strd2 << 1; 1231 pu1_dst += dst_strd << 1; 1232 } 1233 while(ht > 0); 1234 } 1235 else // wd == 8 1236 { 1237 __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b; 1238 __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b; 1239 1240 __m128i zero_16x8b; 1241 zero_16x8b = _mm_set1_epi8(0); 1242 1243 do 1244 { 1245 y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1); 1246 y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1)); 1247 y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2); 1248 y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2)); 1249 1250 y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b); 1251 y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b); 1252 y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b); 1253 y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b); 1254 1255 y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b); 1256 y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b); 1257 y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b); 1258 y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b); 1259 1260 y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b); 1261 y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b); 1262 y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b); 1263 y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b); 1264 1265 y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b); 1266 y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b); 1267 y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b); 1268 y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b); 1269 1270 y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b); 1271 y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b); 1272 y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b); 1273 y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b); 1274 1275 y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b); 1276 y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b); 1277 y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b); 1278 y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b); 1279 1280 y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft); 1281 y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft); 1282 y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft); 1283 y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft); 1284 1285 y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b); 1286 y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b); 1287 y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b); 1288 y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b); 1289 1290 y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b); 1291 y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b); 1292 1293 _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b); 1294 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b); 1295 1296 ht -= 2; 1297 pu1_src1 += src_strd1 << 1; 1298 pu1_src2 += src_strd2 << 1; 1299 pu1_dst += dst_strd << 1; 1300 } 1301 while(ht > 0); 1302 } 1303} 1304