/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_weighted_pred_neon_intr.c
*
* @brief
*  Contains function definitions for weighted prediction used in inter
*  prediction
*
* @author
*  Parthiban V
*
* @par List of Functions:
*  - ihevc_weighted_pred_uni_neonintr()
*  - ihevc_weighted_pred_chroma_uni_neonintr()
*  - ihevc_weighted_pred_bi_neonintr()
*  - ihevc_weighted_pred_chroma_bi_neonintr()
*  - ihevc_weighted_pred_bi_default_neonintr()
*  - ihevc_weighted_pred_chroma_bi_default_neonintr()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_inter_pred.h"
#include "arm_neon.h"


/**
*******************************************************************************
*
* @brief
*  Does uni-weighted prediction on the array pointed by pi2_src and stores
*  it at the location pointed by pu1_dst Assumptions : The function is
*  optimized considering the fact Width and height are multiple of 2.
*
* @par Description:
*    dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift +
*    offset
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied to the source
*
* @param[in] off0
*  offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_uni_neonintr(WORD16 *pi2_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 wgt0,
                                      WORD32 off0,
                                      WORD32 shift,
                                      WORD32 lvl_shift,
                                      WORD32 ht,
                                      WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src_val1;         /* 4 source samples of the current row   */
    int16x4_t pi2_src_val2;         /* 4 source samples of the next row      */
    int32x4_t i4_tmp1_t;            /* 32-bit weighted sums, current row     */
    int32x4_t i4_tmp2_t;            /* 32-bit weighted sums, next row        */
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    /* vshlq_s32 with a negative count performs the required right shift     */
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp;
    UWORD8 *pu1_dst_tmp;

    /* Fold all constant terms of
     *   dst = ((src + lvl_shift) * wgt0 + (1 << (shift - 1))) >> shift + off0
     * into one additive constant applied before the shift: the distributed
     * product lvl_shift * wgt0, the pre-shifted offset off0 << shift, and
     * the rounding term.                                                    */
    WORD32 tmp_lvl_shift = lvl_shift * wgt0 + (off0 << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    /* Used i4_tmp1_t & i4_tmp2_t to process 2 rows at a time.               */
    /* height has also been unrolled, hence 2 rows will be processed at a    */
    /* time; store also has been taken care of for the two-row process.      */
    /* vcombine_u16 has been used since after narrowing we get a 16x4 value  */
    /* which can't be saturated and narrowed directly.                       */
    /* NOTE(review): the inner loop consumes 4 samples per iteration, so wd  */
    /* is assumed to be a multiple of 4 and ht a multiple of 2 here -        */
    /* confirm with callers.                                                 */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp = pi2_src + src_strd;   /* second row of the pair    */

            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
            pi2_src += 4;

            pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
            i4_tmp1_t = vmull_n_s16(pi2_src_val1, (int16_t)wgt0);

            i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t);
            i4_tmp2_t = vmull_n_s16(pi2_src_val2, (int16_t)wgt0);

            sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
            i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t);

            /* saturating narrow s32 -> u16 -> u8 clips the result to the    */
            /* [0, 255] pixel range                                          */
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        /* the inner loop advanced the pointers by wd; step over the two     */
        /* rows processed in this iteration                                  */
        pi2_src += 2 * src_strd - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_UNI

/**
*******************************************************************************
*
* @brief
*  Chroma uni-weighted prediction on the array pointed by pi2_src and stores
*  it at the location pointed by pu1_dst Assumptions : The function is
*  optimized considering the fact Width and height are multiple of 2.
*
* @par Description:
*    dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift +
*    offset
*
* @param[in] pi2_src
*  Pointer to the source (interleaved Cb/Cr samples)
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0_cb
*  weight to be multiplied to the Cb source samples
*
* @param[in] wgt0_cr
*  weight to be multiplied to the Cr source samples
*
* @param[in] off0_cb
*  Cb offset to be added after rounding and shifting
*
* @param[in] off0_cr
*  Cr offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source (Cb/Cr sample pairs; the loop walks 2 * wd
*  interleaved samples per row)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_uni_neonintr(WORD16 *pi2_src,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd,
                                             WORD32 dst_strd,
                                             WORD32 wgt0_cb,
                                             WORD32 wgt0_cr,
                                             WORD32 off0_cb,
                                             WORD32 off0_cr,
                                             WORD32 shift,
                                             WORD32 lvl_shift,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src_val1;         /* 4 interleaved samples, current row    */
    int16x4_t pi2_src_val2;         /* 4 interleaved samples, next row       */
    int32x4_t i4_tmp1_t;
    int32x4_t i4_tmp2_t;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
    int32x4x2_t tmp_lvl_shift_t;
    /* vshlq_s32 with a negative count performs the required right shift     */
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    int16x4_t tmp_wgt0_u, tmp_wgt0_v;
    int16x4x2_t wgt0;
    WORD16 *pi2_src_tmp;
    UWORD8 *pu1_dst_tmp;

    /* Fold the Cb constant terms (lvl_shift * weight, pre-shifted offset,   */
    /* rounding) into a single additive constant, then the same for Cr.      */
    WORD32 tmp_lvl_shift = lvl_shift * wgt0_cb + (off0_cb << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift = lvl_shift * wgt0_cr + (off0_cr << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);

    /* zip the splatted Cb/Cr constants so that lane layout {cb,cr,cb,cr}    */
    /* matches the interleaved source samples                                */
    tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);

    tmp_shift_t = vmovq_n_s32(tmp_shift);

    /* same interleaving for the weights: wgt0.val[0] = {cb,cr,cb,cr}        */
    tmp_wgt0_u = vdup_n_s16(wgt0_cb);
    tmp_wgt0_v = vdup_n_s16(wgt0_cr);
    wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);

    /* Used i4_tmp1_t & i4_tmp2_t to process 2 rows at a time.               */
    /* height has also been unrolled, hence 2 rows will be processed at a    */
    /* time; store also has been taken care of for the two-row process.      */
    /* vcombine_u16 has been used since after narrowing we get a 16x4 value  */
    /* which can't be saturated and narrowed directly.                       */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp = pi2_src + src_strd;   /* second row of the pair    */

            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
            pi2_src += 4;

            pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
            i4_tmp1_t = vmull_s16(pi2_src_val1, wgt0.val[0]);

            i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t.val[0]);
            i4_tmp2_t = vmull_s16(pi2_src_val2, wgt0.val[0]);

            sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
            i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t.val[0]);

            /* saturating narrow s32 -> u16 -> u8 clips to [0, 255]          */
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        /* the inner loop advanced the pointers by 2 * wd samples; step      */
        /* over the two rows processed in this iteration                     */
        pi2_src += 2 * src_strd - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_UNI

/**
*******************************************************************************
*
* @brief
*  Does bi-weighted prediction on the arrays pointed by
pi2_src1 and
*  pi2_src2 and stores it at location pointed by pu1_dst Assumptions : The
*  function is optimized considering the fact Width and height are multiple
*  of 2.
*
* @par Description:
*    dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + (off0 +
*    off1 + 1) << (shift - 1) ) >> shift
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied to source 1
*
* @param[in] off0
*  offset 0
*
* @param[in] wgt1
*  weight to be multiplied to source 2
*
* @param[in] off1
*  offset 1
*
* @param[in] shift
*  (14 Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_bi_neonintr(WORD16 *pi2_src1,
                                     WORD16 *pi2_src2,
                                     UWORD8 *pu1_dst,
                                     WORD32 src_strd1,
                                     WORD32 src_strd2,
                                     WORD32 dst_strd,
                                     WORD32 wgt0,
                                     WORD32 off0,
                                     WORD32 wgt1,
                                     WORD32 off1,
                                     WORD32 shift,
                                     WORD32 lvl_shift1,
                                     WORD32 lvl_shift2,
                                     WORD32 ht,
                                     WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;        /* source 1, current row                 */
    int16x4_t pi2_src1_val2;        /* source 1, next row                    */
    int16x4_t pi2_src2_val1;        /* source 2, current row                 */
    int16x4_t pi2_src2_val2;        /* source 2, next row                    */
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    /* vshlq_s32 with a negative count performs the required right shift     */
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;

    /* Fold the constant terms (distributed lvl_shift products plus the      */
    /* combined offset/rounding term) into a single additive constant        */
    WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0) + (lvl_shift2 * wgt1);
    tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1));
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    /* Used i4_tmp1_t & i4_tmp2_t to process 2 rows at a time.               */
    /* height has also been unrolled, hence 2 rows will be processed at a    */
    /* time; store also has been taken care of for the two-row process.      */
    /* vcombine_u16 has been used since after narrowing we get a 16x4 value  */
    /* which can't be saturated and narrowed directly.                       */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;    /* second-row pointers   */
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vmull_n_s16(pi2_src1_val1, (int16_t)wgt0);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vmull_n_s16(pi2_src2_val1, (int16_t)wgt1);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vmull_n_s16(pi2_src1_val2, (int16_t)wgt0);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vmull_n_s16(pi2_src2_val2, (int16_t)wgt1);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            /* saturating narrow s32 -> u16 -> u8 clips to [0, 255]          */
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        /* the inner loop advanced the pointers by wd; step over the two     */
        /* rows processed in this iteration                                  */
        pi2_src1 += 2 * src_strd1 - wd;
        pi2_src2 += 2 * src_strd2 - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_BI

/**
*******************************************************************************
*
* @brief
*  Chroma bi-weighted prediction on the arrays pointed by pi2_src1 and
*  pi2_src2 and stores it at location pointed by pu1_dst Assumptions : The
*  function is optimized considering the fact Width and height are multiple
*  of 2.
*
* @par Description:
*    dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + (off0 +
*    off1 + 1) << (shift - 1) ) >> shift
*
* @param[in] pi2_src1
*  Pointer to source 1 (interleaved Cb/Cr samples)
*
* @param[in] pi2_src2
*  Pointer to source 2 (interleaved Cb/Cr samples)
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0_cb, wgt0_cr
*  weights to be multiplied to source 1 (Cb and Cr respectively)
*
* @param[in] off0_cb, off0_cr
*  offsets 0 for Cb and Cr
*
* @param[in] wgt1_cb, wgt1_cr
*  weights to be multiplied to source 2 (Cb and Cr respectively)
*
* @param[in] off1_cb, off1_cr
*  offsets 1 for Cb and Cr
*
* @param[in] shift
*  (14 Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source (Cb/Cr sample pairs; the loop walks 2 * wd
*  interleaved samples per row)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_bi_neonintr(WORD16 *pi2_src1,
                                            WORD16 *pi2_src2,
UWORD8 *pu1_dst,
                                            WORD32 src_strd1,
                                            WORD32 src_strd2,
                                            WORD32 dst_strd,
                                            WORD32 wgt0_cb,
                                            WORD32 wgt0_cr,
                                            WORD32 off0_cb,
                                            WORD32 off0_cr,
                                            WORD32 wgt1_cb,
                                            WORD32 wgt1_cr,
                                            WORD32 off1_cb,
                                            WORD32 off1_cr,
                                            WORD32 shift,
                                            WORD32 lvl_shift1,
                                            WORD32 lvl_shift2,
                                            WORD32 ht,
                                            WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;        /* source 1, current row                 */
    int16x4_t pi2_src1_val2;        /* source 1, next row                    */
    int16x4_t pi2_src2_val1;        /* source 2, current row                 */
    int16x4_t pi2_src2_val2;        /* source 2, next row                    */
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
    int32x4x2_t tmp_lvl_shift_t;
    /* vshlq_s32 with a negative count performs the required right shift     */
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    int16x4_t tmp_wgt0_u, tmp_wgt0_v, tmp_wgt1_u, tmp_wgt1_v;
    int16x4x2_t wgt0, wgt1;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;

    /* Fold the Cb constant terms (distributed lvl_shift products plus the   */
    /* combined offset/rounding term) into one constant, then same for Cr    */
    WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0_cb) + (lvl_shift2 * wgt1_cb);
    tmp_lvl_shift += ((off0_cb + off1_cb + 1) << (shift - 1));
    tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift = (lvl_shift1 * wgt0_cr) + (lvl_shift2 * wgt1_cr);
    tmp_lvl_shift += ((off0_cr + off1_cr + 1) << (shift - 1));
    tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);

    /* zip the splatted Cb/Cr constants so that lane layout {cb,cr,cb,cr}    */
    /* matches the interleaved source samples                                */
    tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);

    tmp_shift_t = vmovq_n_s32(tmp_shift);

    /* same interleaving for the weights: wgtX.val[0] = {cb,cr,cb,cr}        */
    tmp_wgt0_u = vdup_n_s16(wgt0_cb);
    tmp_wgt0_v = vdup_n_s16(wgt0_cr);
    wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);
    tmp_wgt1_u = vdup_n_s16(wgt1_cb);
    tmp_wgt1_v = vdup_n_s16(wgt1_cr);
    wgt1 = vzip_s16(tmp_wgt1_u, tmp_wgt1_v);

    /* Used i4_tmp1_t & i4_tmp2_t to process 2 rows at a time.               */
    /* height has also been unrolled, hence 2 rows will be processed at a    */
    /* time; store also has been taken care of for the two-row process.      */
    /* vcombine_u16 has been used since after narrowing we get a 16x4 value  */
    /* which can't be saturated and narrowed directly.                       */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;    /* second-row pointers   */
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vmull_s16(pi2_src1_val1, wgt0.val[0]);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vmull_s16(pi2_src2_val1, wgt1.val[0]);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vmull_s16(pi2_src1_val2, wgt0.val[0]);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t.val[0]);

            i4_tmp2_t2 = vmull_s16(pi2_src2_val2, wgt1.val[0]);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            /* saturating narrow s32 -> u16 -> u8 clips to [0, 255]          */
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t.val[0]);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        /* the inner loop advanced the pointers by 2 * wd samples; step      */
        /* over the two rows processed in this iteration                     */
        pi2_src1 += 2 * src_strd1 - 2 * wd;
        pi2_src2 += 2 * src_strd2 - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_BI

/**
*******************************************************************************
*
* @brief
*  Does default bi-weighted prediction on the arrays pointed by pi2_src1 and
*  pi2_src2 and stores it at location pointed by pu1_dst Assumptions : The
*  function is optimized considering the fact Width and height are multiple
*  of 2.
*
* @par Description:
*    dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + 1 << (shift - 1) )
*    >> shift  where shift = 15 - BitDepth
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_bi_default_neonintr(WORD16 *pi2_src1,
                                             WORD16 *pi2_src2,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd1,
                                             WORD32 src_strd2,
                                             WORD32 dst_strd,
                                             WORD32 lvl_shift1,
                                             WORD32 lvl_shift2,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;        /* source 1, current row                 */
    int16x4_t pi2_src1_val2;        /* source 1, next row                    */
    int16x4_t pi2_src2_val1;        /* source 2, current row                 */
    int16x4_t pi2_src2_val2;        /* source 2, next row                    */
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;
    WORD32 shift;

    /* shift = 15 - BitDepth, expressed via the project macro                */
    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    /* NOTE(review): C99 mixed declarations below differ from the C89-style  */
    /* declaration blocks used elsewhere in this file                        */
    /* vshlq_s32 with a negative count performs the required right shift     */
    WORD32 tmp_shift = 0 - shift;
    WORD32 tmp_lvl_shift = 1 << (shift - 1);    /* rounding term             */
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    int16x4_t lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
    int16x4_t lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);

    /* Used i4_tmp1_t & i4_tmp2_t to process 2 rows at a time.               */
    /* height has also been unrolled, hence 2 rows will be processed at a    */
    /* time; store also has been taken care of for the two-row process.      */
    /* vcombine_u16 has been used since after narrowing we get a 16x4 value  */
    /* which can't be saturated and narrowed directly.                       */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;    /* second-row pointers   */
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            /* saturating narrow s32 -> u16 -> u8 clips to [0, 255]          */
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        /* the inner loop advanced the pointers by wd; step over the two     */
        /* rows processed in this iteration                                  */
        pi2_src1 += 2 * src_strd1 - wd;
        pi2_src2 += 2 * src_strd2 - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_BI_DEFAULT

/**
*******************************************************************************
*
* @brief
*  Chroma default bi-weighted prediction on the arrays pointed by pi2_src1
*  and pi2_src2 and stores it at location pointed by pu1_dst Assumptions :
*  The function is optimized considering the fact Width and height are
*  multiple of 2.
*
* @par Description:
*    dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + 1 << (shift - 1) )
*    >> shift  where shift = 15 - BitDepth
*
* @param[in] pi2_src1
*  Pointer to source 1 (interleaved Cb/Cr samples)
*
* @param[in] pi2_src2
*  Pointer to source 2 (interleaved Cb/Cr samples)
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source (Cb/Cr sample pairs; the loop walks 2 * wd
*  interleaved samples per row)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_bi_default_neonintr(WORD16 *pi2_src1,
                                                    WORD16 *pi2_src2,
                                                    UWORD8 *pu1_dst,
                                                    WORD32 src_strd1,
                                                    WORD32 src_strd2,
                                                    WORD32 dst_strd,
                                                    WORD32 lvl_shift1,
                                                    WORD32 lvl_shift2,
                                                    WORD32 ht,
                                                    WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;        /* source 1, current row                 */
    int16x4_t pi2_src1_val2;        /* source 1, next row                    */
    int16x4_t pi2_src2_val1;        /* source 2, current row                 */
    int16x4_t pi2_src2_val2;        /* source 2, next row                    */
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t
i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;
    WORD32 shift;
    WORD32 tmp_shift;
    WORD32 tmp_lvl_shift;
    int16x4_t lvl_shift1_t;
    int16x4_t lvl_shift2_t;
    /* shift = 15 - BitDepth, expressed via the project macro                */
    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    /* vshlq_s32 with a negative count performs the required right shift     */
    tmp_shift = 0 - shift;
    tmp_lvl_shift = 1 << (shift - 1);           /* rounding term             */
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
    lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);

    /* Used i4_tmp1_t & i4_tmp2_t to process 2 rows at a time.               */
    /* height has also been unrolled, hence 2 rows will be processed at a    */
    /* time; store also has been taken care of for the two-row process.      */
    /* vcombine_u16 has been used since after narrowing we get a 16x4 value  */
    /* which can't be saturated and narrowed directly.                       */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;    /* second-row pointers   */
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            /* saturating narrow s32 -> u16 -> u8 clips to [0, 255]          */
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        /* the inner loop advanced the pointers by 2 * wd samples; step      */
        /* over the two rows processed in this iteration                     */
        pi2_src1 += 2 * src_strd1 - 2 * wd;
        pi2_src2 += 2 * src_strd2 - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_BI_DEFAULT