10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/******************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @file
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  ihevc_weighted_pred_neon_intr.c
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Contains function definitions for weighted prediction used in inter
250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* prediction
260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @author
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Parthiban V
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par List of Functions:
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  - ihevc_weighted_pred_uni()
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  - ihevc_weighted_pred_bi()
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  - ihevc_weighted_pred_bi_default()
340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* File Includes                                                             */
420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/
430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_typedefs.h"
440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_defs.h"
450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_macros.h"
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_func_selector.h"
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_inter_pred.h"
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "arm_neon.h"
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
/**
*******************************************************************************
*
* @brief
*  Does uni-weighted prediction on the array pointed to by pi2_src and stores
* the 8-bit result at the location pointed to by pu1_dst.
*
* @par Description:
*  dst = sat_u8( ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift )
*                + off0 )
*
*  The implementation folds lvl_shift, off0 and the rounding term into one
* constant that is added before the shift:
*  bias = lvl_shift * wgt0 + off0 * (1 << shift) + (1 << (shift - 1))
*  dst  = sat_u8( (src * wgt0 + bias) >> shift )
*
*  Assumptions : The block is processed two rows at a time and four samples
* per row at a time, so ht must be a multiple of 2 and wd a multiple of 4;
* otherwise the function reads and writes beyond the block. shift must be
* at least 1 because the rounding term is 1 << (shift - 1).
*
* @param[in] pi2_src
*  Pointer to the source (16-bit samples)
*
* @param[out] pu1_dst
*  Pointer to the destination (8-bit samples)
*
* @param[in] src_strd
*  Source stride, in WORD16 elements
*
* @param[in] dst_strd
*  Destination stride, in UWORD8 elements
*
* @param[in] wgt0
*  weight to be multiplied to the source; it is narrowed to 16 bits before
* the multiplication
*
* @param[in] off0
*  offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  added to the source before weighting
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_uni_neonintr(WORD16 *pi2_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 wgt0,
                                      WORD32 off0,
                                      WORD32 shift,
                                      WORD32 lvl_shift,
                                      WORD32 ht,
                                      WORD32 wd)
{
    WORD32 row, col;
    WORD32 bias;
    WORD16 *pi2_src_nxt;
    UWORD8 *pu1_dst_nxt;
    int16x4_t src_row0, src_row1;
    int32x4_t acc_row0, acc_row1;
    uint16x4_t res_row0, res_row1;
    uint8x8_t res_u8;
    int32x4_t bias_t;
    int32x4_t neg_shift_t;

    /* Constant part of the weighted sum. off0 is scaled by multiplication   */
    /* instead of a left shift because off0 is signed and left-shifting a    */
    /* negative value is undefined behaviour in C.                           */
    bias = lvl_shift * wgt0 + off0 * (1 << shift);
    bias += (1 << (shift - 1));
    bias_t = vmovq_n_s32(bias);

    /* vshlq_s32 with a negative count performs an arithmetic right shift    */
    neg_shift_t = vmovq_n_s32(0 - shift);

    /* Two rows are processed per outer iteration and four samples of each   */
    /* row per inner iteration.                                              */
    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_nxt = pi2_src + src_strd;
            pu1_dst_nxt = pu1_dst + dst_strd;

            src_row0 = vld1_s16((int16_t *)pi2_src);
            src_row1 = vld1_s16((int16_t *)pi2_src_nxt);

            /* src * wgt0 + bias, widened to 32 bits                         */
            acc_row0 = vaddq_s32(vmull_n_s16(src_row0, (int16_t)wgt0), bias_t);
            acc_row1 = vaddq_s32(vmull_n_s16(src_row1, (int16_t)wgt0), bias_t);

            /* >> shift, then saturate signed 32 bit to unsigned 16 bit      */
            res_row0 = vqmovun_s32(vshlq_s32(acc_row0, neg_shift_t));
            res_row1 = vqmovun_s32(vshlq_s32(acc_row1, neg_shift_t));

            /* Narrow both rows with a single saturating move: bytes 0-3     */
            /* (32-bit lane 0) hold the current row, bytes 4-7 (32-bit       */
            /* lane 1) hold the next row.                                    */
            res_u8 = vqmovn_u16(vcombine_u16(res_row0, res_row1));

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(res_u8), 0);
            vst1_lane_u32((uint32_t *)pu1_dst_nxt, vreinterpret_u32_u8(res_u8), 1);

            pi2_src += 4;
            pu1_dst += 4;
        }
        /* The inner loop advanced by wd; move on to the next pair of rows   */
        pi2_src += 2 * src_strd - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_UNI
1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
/**
*******************************************************************************
*
* @brief
*  Chroma uni-weighted prediction on the array pointed to by pi2_src; stores
* the 8-bit result at the location pointed to by pu1_dst. Samples are
* handled as interleaved pairs: even positions use the cb weight and offset,
* odd positions use the cr weight and offset.
*
* @par Description:
*  dst = sat_u8( ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift )
*                + off0 )
*  with (wgt0, off0) = (wgt0_cb, off0_cb) for even sample positions and
* (wgt0_cr, off0_cr) for odd sample positions.
*
*  The implementation folds lvl_shift, the offset and the rounding term into
* one per-component constant that is added before the shift:
*  bias = lvl_shift * wgt0 + off0 * (1 << shift) + (1 << (shift - 1))
*  dst  = sat_u8( (src * wgt0 + bias) >> shift )
*
*  Assumptions : Each row holds 2 * wd samples. The block is processed two
* rows at a time and four samples (two cb/cr pairs) per row at a time, so ht
* and wd must both be multiples of 2; otherwise the function reads and writes
* beyond the block. shift must be at least 1 because the rounding term is
* 1 << (shift - 1).
*
* @param[in] pi2_src
*  Pointer to the source (16-bit samples)
*
* @param[out] pu1_dst
*  Pointer to the destination (8-bit samples)
*
* @param[in] src_strd
*  Source stride, in WORD16 elements
*
* @param[in] dst_strd
*  Destination stride, in UWORD8 elements
*
* @param[in] wgt0_cb
*  weight to be multiplied to the even (cb) source samples; narrowed to 16
* bits before the multiplication
*
* @param[in] wgt0_cr
*  weight to be multiplied to the odd (cr) source samples; narrowed to 16
* bits before the multiplication
*
* @param[in] off0_cb
*  offset to be added to the cb samples after rounding and shifting
*
* @param[in] off0_cr
*  offset to be added to the cr samples after rounding and shifting
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  added to the source before weighting
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source, in cb/cr pairs (2 * wd samples per row)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_uni_neonintr(WORD16 *pi2_src,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd,
                                             WORD32 dst_strd,
                                             WORD32 wgt0_cb,
                                             WORD32 wgt0_cr,
                                             WORD32 off0_cb,
                                             WORD32 off0_cr,
                                             WORD32 shift,
                                             WORD32 lvl_shift,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    WORD32 bias_cb, bias_cr;
    WORD16 *pi2_src_nxt;
    UWORD8 *pu1_dst_nxt;
    int16x4_t src_row0, src_row1;
    int32x4_t acc_row0, acc_row1;
    uint16x4_t res_row0, res_row1;
    uint8x8_t res_u8;
    int32x4x2_t bias_zip_t;
    int32x4_t bias_t;
    int32x4_t neg_shift_t;
    int16x4x2_t wgt_zip_t;
    int16x4_t wgt_t;

    /* Constant part of the weighted sum, per component. The offsets are     */
    /* scaled by multiplication instead of a left shift because they are     */
    /* signed and left-shifting a negative value is undefined behaviour in C */
    bias_cb = lvl_shift * wgt0_cb + off0_cb * (1 << shift);
    bias_cb += (1 << (shift - 1));

    bias_cr = lvl_shift * wgt0_cr + off0_cr * (1 << shift);
    bias_cr += (1 << (shift - 1));

    /* Interleave the constants as {cb, cr, cb, cr} to match the sample      */
    /* order; only the first half of each zip result is needed.              */
    bias_zip_t = vzipq_s32(vmovq_n_s32(bias_cb), vmovq_n_s32(bias_cr));
    bias_t = bias_zip_t.val[0];

    wgt_zip_t = vzip_s16(vdup_n_s16((int16_t)wgt0_cb), vdup_n_s16((int16_t)wgt0_cr));
    wgt_t = wgt_zip_t.val[0];

    /* vshlq_s32 with a negative count performs an arithmetic right shift    */
    neg_shift_t = vmovq_n_s32(0 - shift);

    /* Two rows are processed per outer iteration and four samples (two      */
    /* cb/cr pairs) of each row per inner iteration.                         */
    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_nxt = pi2_src + src_strd;
            pu1_dst_nxt = pu1_dst + dst_strd;

            src_row0 = vld1_s16((int16_t *)pi2_src);
            src_row1 = vld1_s16((int16_t *)pi2_src_nxt);

            /* src * wgt + bias, widened to 32 bits                          */
            acc_row0 = vaddq_s32(vmull_s16(src_row0, wgt_t), bias_t);
            acc_row1 = vaddq_s32(vmull_s16(src_row1, wgt_t), bias_t);

            /* >> shift, then saturate signed 32 bit to unsigned 16 bit      */
            res_row0 = vqmovun_s32(vshlq_s32(acc_row0, neg_shift_t));
            res_row1 = vqmovun_s32(vshlq_s32(acc_row1, neg_shift_t));

            /* Narrow both rows with a single saturating move: bytes 0-3     */
            /* (32-bit lane 0) hold the current row, bytes 4-7 (32-bit       */
            /* lane 1) hold the next row.                                    */
            res_u8 = vqmovn_u16(vcombine_u16(res_row0, res_row1));

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(res_u8), 0);
            vst1_lane_u32((uint32_t *)pu1_dst_nxt, vreinterpret_u32_u8(res_u8), 1);

            pi2_src += 4;
            pu1_dst += 4;
        }
        /* The inner loop advanced by 2 * wd; move on to the next row pair   */
        pi2_src += 2 * src_strd - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_UNI
3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Does bi-weighted prediction on the arrays pointed by  pi2_src1 and
3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* pi2_src2 and stores it at location pointed  by pi2_dst   Assumptions : The
3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* function is optimized considering the fact Width and  height are multiple
3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* of 2.
3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  dst = ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +  (off0 +
3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* off1 + 1) << (shift - 1) ) >> shift
3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src1
3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to source 1
3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src2
3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to source 2
3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst
3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to destination
3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd1
3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Source stride 1
3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd2
3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Source stride 2
3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Destination stride
3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wgt0
3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  weight to be multiplied to source 1
3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] off0
3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  offset 0
3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wgt1
3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  weight to be multiplied to source 2
3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] off1
3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  offset 1
3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] shift
3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  (14 Bit depth) + log2_weight_denominator
3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift1
3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  added before shift and offset
3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift2
3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  added before shift and offset
3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] ht
3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  height of the source
3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wd
3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  width of the source
3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_weighted_pred_bi_neonintr(WORD16 *pi2_src1,
3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                     WORD16 *pi2_src2,
3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                     UWORD8 *pu1_dst,
3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                     WORD32 src_strd1,
3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                     WORD32 src_strd2,
4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                     WORD32 dst_strd,
4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                     WORD32 wgt0,
4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                     WORD32 off0,
4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                     WORD32 wgt1,
4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                     WORD32 off1,
4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                     WORD32 shift,
4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                     WORD32 lvl_shift1,
4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                     WORD32 lvl_shift2,
4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                     WORD32 ht,
4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                     WORD32 wd)
4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row, col;
4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t pi2_src1_val1;
4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t pi2_src1_val2;
4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t pi2_src2_val1;
4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t pi2_src2_val2;
4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t i4_tmp1_t1;
4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t i4_tmp1_t2;
4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t i4_tmp2_t1;
4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t i4_tmp2_t2;
4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t sto_res_tmp1;
4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    uint16x4_t sto_res_tmp2;
4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    uint16x8_t sto_res_tmp3;
4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    uint8x8_t sto_res;
4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t tmp_lvl_shift_t;
4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 tmp_shift = 0 - shift;
4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t tmp_shift_t;
4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD16 *pi2_src_tmp1;
4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD16 *pi2_src_tmp2;
4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UWORD8 *pu1_dst_tmp;
4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0) + (lvl_shift2 * wgt1);
4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1));
4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_shift_t = vmovq_n_s32(tmp_shift);
4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Used i4_tmp1_t & i4_tmp1_t to process 2 rows at a time.                                  */
4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* height has also been unrolled, hence 2 rows will processed at a time                     */
4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* store also has been taken care for two row process                                       */
4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* vcombine_u16 has been used since after narrowing we get 16x4 value which can't be        */
4400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* saturated and narrowed                                                                   */
4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    for(row = ht; row > 0; row -= 2)
4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(col = wd; col > 0; col -= 4)
4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_tmp1 = pi2_src1 + src_strd1;
4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_tmp2 = pi2_src2 + src_strd2;
4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src1 += 4;
4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst_tmp = pu1_dst + dst_strd;
4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src2 += 4;
4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp1_t1 = vmull_n_s16(pi2_src1_val1, (int16_t)wgt0);
4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp1_t2 = vmull_n_s16(pi2_src2_val1, (int16_t)wgt1);
4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);
4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp2_t1 = vmull_n_s16(pi2_src1_val2, (int16_t)wgt0);
4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);
4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp2_t2 = vmull_n_s16(pi2_src2_val2, (int16_t)wgt1);
4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);
4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res = vqmovn_u16(sto_res_tmp3);
4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += 4;
4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res = vqmovn_u16(sto_res_tmp3);
4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
4860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
4870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pi2_src1 += 2 * src_strd1 - wd;
4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pi2_src2 += 2 * src_strd2 - wd;
4890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pu1_dst += 2 * dst_strd - wd;
4900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
4910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//WEIGHTED_PRED_BI
4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
4950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
4960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
4970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
4980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Chroma bi-weighted prediction on the arrays pointed by  pi2_src1 and
4990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* pi2_src2 and stores it at location pointed  by pi2_dst   Assumptions : The
5000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* function is optimized considering the fact Width and  height are multiple
5010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* of 2.
5020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
5030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
5040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  dst = ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +  (off0 +
5050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* off1 + 1) << (shift - 1) ) >> shift
5060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
5070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src1
5080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to source 1
5090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
5100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src2
5110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to source 2
5120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
5130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst
5140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to destination
5150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
5160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd1
5170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Source stride 1
5180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
5190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd2
5200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Source stride 2
5210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
5220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
5230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Destination stride
5240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
5250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wgt0
5260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  weight to be multiplied to source 1
5270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
5280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] off0
5290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  offset 0
5300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
5310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wgt1
5320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  weight to be multiplied to source 2
5330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
5340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] off1
5350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  offset 1
5360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
5370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] shift
5380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  (14 Bit depth) + log2_weight_denominator
5390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
5400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift1
5410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  added before shift and offset
5420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
5430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift2
5440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  added before shift and offset
5450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
5460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] ht
5470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  height of the source
5480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
5490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wd
5500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  width of the source
5510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
5520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
5530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
5540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
5550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
5560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
5570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
5580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
5590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_weighted_pred_chroma_bi_neonintr(WORD16 *pi2_src1,
5610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            WORD16 *pi2_src2,
5620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            UWORD8 *pu1_dst,
5630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            WORD32 src_strd1,
5640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            WORD32 src_strd2,
5650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            WORD32 dst_strd,
5660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            WORD32 wgt0_cb,
5670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            WORD32 wgt0_cr,
5680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            WORD32 off0_cb,
5690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            WORD32 off0_cr,
5700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            WORD32 wgt1_cb,
5710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            WORD32 wgt1_cr,
5720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            WORD32 off1_cb,
5730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            WORD32 off1_cr,
5740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            WORD32 shift,
5750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            WORD32 lvl_shift1,
5760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            WORD32 lvl_shift2,
5770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            WORD32 ht,
5780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            WORD32 wd)
5790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
5800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row, col;
5810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t pi2_src1_val1;
5820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t pi2_src1_val2;
5830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t pi2_src2_val1;
5840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t pi2_src2_val2;
5850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t i4_tmp1_t1;
5860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t i4_tmp1_t2;
5870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t i4_tmp2_t1;
5880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t i4_tmp2_t2;
5890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t sto_res_tmp1;
5900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    uint16x4_t sto_res_tmp2;
5910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    uint16x8_t sto_res_tmp3;
5920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    uint8x8_t sto_res;
5930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
5940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4x2_t tmp_lvl_shift_t;
5950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 tmp_shift = 0 - shift;
5960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t tmp_shift_t;
5970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t tmp_wgt0_u, tmp_wgt0_v, tmp_wgt1_u, tmp_wgt1_v;
5980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4x2_t wgt0, wgt1;
5990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD16 *pi2_src_tmp1;
6000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD16 *pi2_src_tmp2;
6010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UWORD8 *pu1_dst_tmp;
6020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0_cb) + (lvl_shift2 * wgt1_cb);
6040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_lvl_shift += ((off0_cb + off1_cb + 1) << (shift - 1));
6050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);
6060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_lvl_shift = (lvl_shift1 * wgt0_cr) + (lvl_shift2 * wgt1_cr);
6080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_lvl_shift += ((off0_cr + off1_cr + 1) << (shift - 1));
6090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);
6100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);
6120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_shift_t = vmovq_n_s32(tmp_shift);
6140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_wgt0_u = vdup_n_s16(wgt0_cb);
6160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_wgt0_v = vdup_n_s16(wgt0_cr);
6170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);
6180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_wgt1_u = vdup_n_s16(wgt1_cb);
6190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_wgt1_v = vdup_n_s16(wgt1_cr);
6200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    wgt1 = vzip_s16(tmp_wgt1_u, tmp_wgt1_v);
6210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Used i4_tmp1_t & i4_tmp1_t to process 2 rows at a time.                                  */
6230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* height has also been unrolled, hence 2 rows will processed at a time                     */
6240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* store also has been taken care for two row process                                       */
6250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* vcombine_u16 has been used since after narrowing we get 16x4 value which can't be        */
6260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* saturated and narrowed                                                                   */
6270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    for(row = ht; row > 0; row -= 2)
6290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
6300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(col = 2 * wd; col > 0; col -= 4)
6310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
6320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_tmp1 = pi2_src1 + src_strd1;
6330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_tmp2 = pi2_src2 + src_strd2;
6340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
6360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src1 += 4;
6370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst_tmp = pu1_dst + dst_strd;
6380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
6400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src2 += 4;
6410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp1_t1 = vmull_s16(pi2_src1_val1, wgt0.val[0]);
6420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
6440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp1_t2 = vmull_s16(pi2_src2_val1, wgt1.val[0]);
6450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
6470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);
6480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp2_t1 = vmull_s16(pi2_src1_val2, wgt0.val[0]);
6500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t.val[0]);
6510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp2_t2 = vmull_s16(pi2_src2_val2, wgt1.val[0]);
6530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);
6540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
6560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
6570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t.val[0]);
6590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
6600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
6620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res = vqmovn_u16(sto_res_tmp3);
6630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
6650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
6660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
6680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += 4;
6690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res = vqmovn_u16(sto_res_tmp3);
6710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
6720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
6730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pi2_src1 += 2 * src_strd1 - 2 * wd;
6740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pi2_src2 += 2 * src_strd2 - 2 * wd;
6750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pu1_dst += 2 * dst_strd - 2 * wd;
6760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
6770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
6780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//WEIGHTED_PRED_CHROMA_BI
6790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
6810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
6820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
6830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
6840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Does default bi-weighted prediction on the arrays pointed by pi2_src1 and
6850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* pi2_src2 and stores it at location  pointed by pi2_dst Assumptions : The
6860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* function is optimized considering the fact Width and  height are multiple
6870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* of 2.
6880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
6890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
6900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  dst = ( (src1 + lvl_shift1) +  (src2 + lvl_shift2) +  1 << (shift - 1) )
6910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* >> shift  where shift = 15 - BitDepth
6920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
6930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src1
6940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to source 1
6950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
6960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src2
6970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to source 2
6980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
6990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst
7000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to destination
7010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd1
7030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Source stride 1
7040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd2
7060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Source stride 2
7070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
7090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Destination stride
7100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift1
7120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  added before shift and offset
7130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift2
7150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  added before shift and offset
7160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] ht
7180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  height of the source
7190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wd
7210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  width of the source
7220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
7240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
7260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
7270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
7290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
7300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_weighted_pred_bi_default_neonintr(WORD16 *pi2_src1,
7320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                             WORD16 *pi2_src2,
7330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                             UWORD8 *pu1_dst,
7340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                             WORD32 src_strd1,
7350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                             WORD32 src_strd2,
7360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                             WORD32 dst_strd,
7370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                             WORD32 lvl_shift1,
7380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                             WORD32 lvl_shift2,
7390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                             WORD32 ht,
7400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                             WORD32 wd)
7410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
7420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row, col;
7430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t pi2_src1_val1;
7440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t pi2_src1_val2;
7450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t pi2_src2_val1;
7460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t pi2_src2_val2;
7470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t i4_tmp1_t1;
7480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t i4_tmp1_t2;
7490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t i4_tmp2_t1;
7500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t i4_tmp2_t2;
7510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t sto_res_tmp1;
7520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    uint16x4_t sto_res_tmp2;
7530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    uint16x8_t sto_res_tmp3;
7540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    uint8x8_t sto_res;
7550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t tmp_lvl_shift_t;
7560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t tmp_shift_t;
7570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD16 *pi2_src_tmp1;
7580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD16 *pi2_src_tmp2;
7590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UWORD8 *pu1_dst_tmp;
7600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 shift;
7610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
7630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 tmp_shift = 0 - shift;
7640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 tmp_lvl_shift = 1 << (shift - 1);
7650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
7660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_shift_t = vmovq_n_s32(tmp_shift);
7670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
7690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);
7700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Used i4_tmp1_t & i4_tmp1_t to process 2 rows at a time.                                  */
7720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* height has also been unrolled, hence 2 rows will processed at a time                     */
7730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* store also has been taken care for two row process                                       */
7740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* vcombine_u16 has been used since after narrowing we get 16x4 value which can't be        */
7750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* saturated and narrowed                                                                   */
7760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    for(row = ht; row > 0; row -= 2)
7780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
7790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(col = wd; col > 0; col -= 4)
7800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
7810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_tmp1 = pi2_src1 + src_strd1;
7820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_tmp2 = pi2_src2 + src_strd2;
7830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
7850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src1 += 4;
7860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst_tmp = pu1_dst + dst_strd;
7870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
7890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src2 += 4;
7900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);
7910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
7930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);
7940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
7960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);
7970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
7990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);
8000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
8020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);
8030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
8050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
8060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
8080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
8090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
8110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res = vqmovn_u16(sto_res_tmp3);
8120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
8140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
8150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
8170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += 4;
8180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res = vqmovn_u16(sto_res_tmp3);
8200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
8210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
8220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pi2_src1 += 2 * src_strd1 - wd;
8230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pi2_src2 += 2 * src_strd2 - wd;
8240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pu1_dst += 2 * dst_strd - wd;
8250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
8260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
8270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//WEIGHTED_PRED_BI_DEFAULT
8280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
8300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
8310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
8320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
8330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Does default bi-weighted prediction on the arrays pointed by pi2_src1 and
8340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* pi2_src2 and stores it at location  pointed by pi2_dst Assumptions : The
8350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* function is optimized considering the fact Width and  height are multiple
8360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* of 2.
8370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
8380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
8390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  dst = ( (src1 + lvl_shift1) +  (src2 + lvl_shift2) +  1 << (shift - 1) )
8400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* >> shift  where shift = 15 - BitDepth
8410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
8420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src1
8430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to source 1
8440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
8450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src2
8460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to source 2
8470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
8480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst
8490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to destination
8500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
8510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd1
8520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Source stride 1
8530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
8540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd2
8550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Source stride 2
8560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
8570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
8580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Destination stride
8590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
8600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift1
8610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  added before shift and offset
8620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
8630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift2
8640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  added before shift and offset
8650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
8660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] ht
8670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  height of the source
8680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
8690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wd
8700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  width of the source
8710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
8720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
8730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
8740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
8750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
8760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
8770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
8780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
8790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_weighted_pred_chroma_bi_default_neonintr(WORD16 *pi2_src1,
8810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                                    WORD16 *pi2_src2,
8820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                                    UWORD8 *pu1_dst,
8830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                                    WORD32 src_strd1,
8840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                                    WORD32 src_strd2,
8850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                                    WORD32 dst_strd,
8860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                                    WORD32 lvl_shift1,
8870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                                    WORD32 lvl_shift2,
8880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                                    WORD32 ht,
8890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                                    WORD32 wd)
8900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
8910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row, col;
8920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t pi2_src1_val1;
8930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t pi2_src1_val2;
8940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t pi2_src2_val1;
8950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t pi2_src2_val2;
8960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t i4_tmp1_t1;
8970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t i4_tmp1_t2;
8980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t i4_tmp2_t1;
8990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t i4_tmp2_t2;
9000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t sto_res_tmp1;
9010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    uint16x4_t sto_res_tmp2;
9020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    uint16x8_t sto_res_tmp3;
9030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    uint8x8_t sto_res;
9040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t tmp_lvl_shift_t;
9050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int32x4_t tmp_shift_t;
9060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD16 *pi2_src_tmp1;
9070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD16 *pi2_src_tmp2;
9080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UWORD8 *pu1_dst_tmp;
9090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 shift;
9100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 tmp_shift;
9110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 tmp_lvl_shift;
9120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t lvl_shift1_t;
9130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    int16x4_t lvl_shift2_t;
9140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
9150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_shift = 0 - shift;
9160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_lvl_shift = 1 << (shift - 1);
9170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
9180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_shift_t = vmovq_n_s32(tmp_shift);
9190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
9210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);
9220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Used i4_tmp1_t & i4_tmp1_t to process 2 rows at a time.                                  */
9240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* height has also been unrolled, hence 2 rows will processed at a time                     */
9250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* store also has been taken care for two row process                                       */
9260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* vcombine_u16 has been used since after narrowing we get 16x4 value which can't be        */
9270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* saturated and narrowed                                                                   */
9280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    for(row = ht; row > 0; row -= 2)
9300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
9310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(col = 2 * wd; col > 0; col -= 4)
9320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
9330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_tmp1 = pi2_src1 + src_strd1;
9340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_tmp2 = pi2_src2 + src_strd2;
9350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
9370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src1 += 4;
9380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst_tmp = pu1_dst + dst_strd;
9390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
9410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src2 += 4;
9420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);
9430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
9450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);
9460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
9480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);
9490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
9510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);
9520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
9540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);
9550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
9570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
9580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
9600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
9610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
9630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res = vqmovn_u16(sto_res_tmp3);
9640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
9660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
9670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
9690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += 4;
9700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            sto_res = vqmovn_u16(sto_res_tmp3);
9720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
9730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
9740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pi2_src1 += 2 * src_strd1 - 2 * wd;
9750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pi2_src2 += 2 * src_strd2 - 2 * wd;
9760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pu1_dst += 2 * dst_strd - 2 * wd;
9770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
9780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
9790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//WEIGHTED_PRED_CHROMA_BI_DEFAULT
980