/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_weighted_pred_neon_intr.c
*
* @brief
*  Contains function definitions for weighted prediction used in inter
* prediction
*
* @author
*  Parthiban V
*
* @par List of Functions:
*  - ihevc_weighted_pred_uni_neonintr()
*  - ihevc_weighted_pred_chroma_uni_neonintr()
*  - ihevc_weighted_pred_bi_neonintr()
*  - ihevc_weighted_pred_chroma_bi_neonintr()
*  - ihevc_weighted_pred_bi_default_neonintr()
*  - ihevc_weighted_pred_chroma_bi_default_neonintr()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_inter_pred.h"
#include "arm_neon.h"


/**
*******************************************************************************
*
* @brief
*  Does uni-weighted prediction on the array pointed to by pi2_src and
* stores the result at the location pointed to by pu1_dst.
*  Assumptions: the function is optimized assuming the width is a multiple
* of 4 and the height a multiple of 2.
*
* @par Description:
*  dst = (((src + lvl_shift) * wgt0 + (1 << (shift - 1))) >> shift) + off0
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied to the source
*
* @param[in] off0
*  offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_uni_neonintr(WORD16 *pi2_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 wgt0,
                                      WORD32 off0,
                                      WORD32 shift,
                                      WORD32 lvl_shift,
                                      WORD32 ht,
                                      WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src_val1;
    int16x4_t pi2_src_val2;
    int32x4_t i4_tmp1_t;
    int32x4_t i4_tmp2_t;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp;
    UWORD8 *pu1_dst_tmp;

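    /* Fold the per-sample constants into a single add so the inner loop     */
    /* needs only a multiply, an add and a shift:                            */
    /* (((src + lvl_shift) * wgt0 + rnd) >> shift) + off0                    */
    /*     == (src * wgt0 + (lvl_shift * wgt0 + (off0 << shift) + rnd))      */
    /*            >> shift,  with rnd = 1 << (shift - 1).                    */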
    WORD32 tmp_lvl_shift = lvl_shift * wgt0 + (off0 << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    /* i4_tmp1_t and i4_tmp2_t each hold one row: the height loop is         */
    /* unrolled by 2, so two rows are processed (and stored) per iteration.  */
    /* vshlq_s32 with a negative shift count performs the right shift, as    */
    /* NEON has no variable right-shift intrinsic.                           */
    /* vcombine_u16 is used because vqmovun_s32 narrows to a uint16x4_t,     */
    /* which cannot be saturated and narrowed to 8 bits directly; combining  */
    /* into a uint16x8_t allows vqmovn_u16.                                  */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp = pi2_src + src_strd;

            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
            pi2_src += 4;

            pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
            i4_tmp1_t = vmull_n_s16(pi2_src_val1, (int16_t)wgt0);

            i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t);
            i4_tmp2_t = vmull_n_s16(pi2_src_val2, (int16_t)wgt0);

            sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
            i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src += 2 * src_strd - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_UNI
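
/* A minimal scalar sketch of the computation above, kept out of the build    */
/* with #if 0. The helper name is illustrative only (not part of the library  */
/* API). The clip to [0, 255] mirrors the vqmovun_s32/vqmovn_u16 saturation   */
/* in the intrinsics path.                                                    */
#if 0
static void weighted_pred_uni_scalar(WORD16 *pi2_src, UWORD8 *pu1_dst,
                                     WORD32 src_strd, WORD32 dst_strd,
                                     WORD32 wgt0, WORD32 off0, WORD32 shift,
                                     WORD32 lvl_shift, WORD32 ht, WORD32 wd)
{
    WORD32 row, col, i4_tmp;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + (1 << (shift - 1));
            i4_tmp = (i4_tmp >> shift) + off0;
            pu1_dst[col] = (UWORD8)(i4_tmp < 0 ? 0 : (i4_tmp > 255 ? 255 : i4_tmp));
        }
        pi2_src += src_strd;
        pu1_dst += dst_strd;
    }
}
#endif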

/**
*******************************************************************************
*
* @brief
*  Chroma uni-weighted prediction on the array pointed to by pi2_src; stores
* the result at the location pointed to by pu1_dst.
*  Assumptions: the function is optimized assuming width and height are
* multiples of 2.
*
* @par Description:
*  dst = (((src + lvl_shift) * wgt0 + (1 << (shift - 1))) >> shift) + off0,
* applied with the Cb and Cr weights/offsets on alternate (interleaved)
* samples
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0_cb
*  weight to be multiplied to the Cb source
*
* @param[in] wgt0_cr
*  weight to be multiplied to the Cr source
*
* @param[in] off0_cb
*  Cb offset to be added after rounding and shifting
*
* @param[in] off0_cr
*  Cr offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source (per chroma plane)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_uni_neonintr(WORD16 *pi2_src,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd,
                                             WORD32 dst_strd,
                                             WORD32 wgt0_cb,
                                             WORD32 wgt0_cr,
                                             WORD32 off0_cb,
                                             WORD32 off0_cr,
                                             WORD32 shift,
                                             WORD32 lvl_shift,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src_val1;
    int16x4_t pi2_src_val2;
    int32x4_t i4_tmp1_t;
    int32x4_t i4_tmp2_t;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
    int32x4x2_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    int16x4_t tmp_wgt0_u, tmp_wgt0_v;
    int16x4x2_t wgt0;
    WORD16 *pi2_src_tmp;
    UWORD8 *pu1_dst_tmp;

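    /* Cb and Cr samples are interleaved in the source, so the per-plane     */
    /* constants are zipped: val[0] of each vzip result holds the pattern    */
    /* {cb, cr, cb, cr}, matching the sample layout of every 4-sample load.  */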
    WORD32 tmp_lvl_shift = lvl_shift * wgt0_cb + (off0_cb << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift = lvl_shift * wgt0_cr + (off0_cr << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);

    tmp_shift_t = vmovq_n_s32(tmp_shift);

    tmp_wgt0_u = vdup_n_s16(wgt0_cb);
    tmp_wgt0_v = vdup_n_s16(wgt0_cr);
    wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);

    /* i4_tmp1_t and i4_tmp2_t each hold one row: the height loop is         */
    /* unrolled by 2, so two rows are processed (and stored) per iteration.  */
    /* vshlq_s32 with a negative shift count performs the right shift, as    */
    /* NEON has no variable right-shift intrinsic.                           */
    /* vcombine_u16 is used because vqmovun_s32 narrows to a uint16x4_t,     */
    /* which cannot be saturated and narrowed to 8 bits directly; combining  */
    /* into a uint16x8_t allows vqmovn_u16.                                  */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp = pi2_src + src_strd;

            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
            pi2_src += 4;

            pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
            i4_tmp1_t = vmull_s16(pi2_src_val1, wgt0.val[0]);

            i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t.val[0]);
            i4_tmp2_t = vmull_s16(pi2_src_val2, wgt0.val[0]);

            sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
            i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t.val[0]);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src += 2 * src_strd - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_UNI
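
/* A minimal scalar sketch of the interleaved-chroma computation above, kept  */
/* out of the build with #if 0; the helper name is illustrative only. Even    */
/* columns carry Cb and odd columns Cr, which is what the zipped constant     */
/* vectors encode.                                                            */
#if 0
static void weighted_pred_chroma_uni_scalar(WORD16 *pi2_src, UWORD8 *pu1_dst,
                                            WORD32 src_strd, WORD32 dst_strd,
                                            WORD32 wgt0_cb, WORD32 wgt0_cr,
                                            WORD32 off0_cb, WORD32 off0_cr,
                                            WORD32 shift, WORD32 lvl_shift,
                                            WORD32 ht, WORD32 wd)
{
    WORD32 row, col, i4_tmp, wgt, off;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < 2 * wd; col++)
        {
            wgt = (col & 1) ? wgt0_cr : wgt0_cb;
            off = (col & 1) ? off0_cr : off0_cb;
            i4_tmp = (pi2_src[col] + lvl_shift) * wgt + (1 << (shift - 1));
            i4_tmp = (i4_tmp >> shift) + off;
            pu1_dst[col] = (UWORD8)(i4_tmp < 0 ? 0 : (i4_tmp > 255 ? 255 : i4_tmp));
        }
        pi2_src += src_strd;
        pu1_dst += dst_strd;
    }
}
#endif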

/**
*******************************************************************************
*
* @brief
*  Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
* pi2_src2 and stores the result at the location pointed to by pu1_dst.
*  Assumptions: the function is optimized assuming the width is a multiple
* of 4 and the height a multiple of 2.
*
* @par Description:
*  dst = ((src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
* ((off0 + off1 + 1) << (shift - 1))) >> shift
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied to source 1
*
* @param[in] off0
*  offset 0
*
* @param[in] wgt1
*  weight to be multiplied to source 2
*
* @param[in] off1
*  offset 1
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_bi_neonintr(WORD16 *pi2_src1,
                                     WORD16 *pi2_src2,
                                     UWORD8 *pu1_dst,
                                     WORD32 src_strd1,
                                     WORD32 src_strd2,
                                     WORD32 dst_strd,
                                     WORD32 wgt0,
                                     WORD32 off0,
                                     WORD32 wgt1,
                                     WORD32 off1,
                                     WORD32 shift,
                                     WORD32 lvl_shift1,
                                     WORD32 lvl_shift2,
                                     WORD32 ht,
                                     WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;

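    /* Fold all per-sample constants into a single term:                     */
    /* ((src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1              */
    /*     + ((off0 + off1 + 1) << (shift - 1))) >> shift                    */
    /*   == (src1 * wgt0 + src2 * wgt1 + tmp_lvl_shift) >> shift             */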
    WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0) + (lvl_shift2 * wgt1);
    tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1));
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    /* The i4_tmp1_* and i4_tmp2_* accumulators each hold one row: the       */
    /* height loop is unrolled by 2, so two rows are processed (and stored)  */
    /* per iteration.                                                        */
    /* vshlq_s32 with a negative shift count performs the right shift, as    */
    /* NEON has no variable right-shift intrinsic.                           */
    /* vcombine_u16 is used because vqmovun_s32 narrows to a uint16x4_t,     */
    /* which cannot be saturated and narrowed to 8 bits directly; combining  */
    /* into a uint16x8_t allows vqmovn_u16.                                  */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vmull_n_s16(pi2_src1_val1, (int16_t)wgt0);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vmull_n_s16(pi2_src2_val1, (int16_t)wgt1);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vmull_n_s16(pi2_src1_val2, (int16_t)wgt0);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vmull_n_s16(pi2_src2_val2, (int16_t)wgt1);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - wd;
        pi2_src2 += 2 * src_strd2 - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_BI
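
/* A minimal scalar sketch of the bi-weighted computation above, kept out of  */
/* the build with #if 0; the helper name is illustrative only. The folded     */
/* constant corresponds to tmp_lvl_shift in the intrinsics path.              */
#if 0
static void weighted_pred_bi_scalar(WORD16 *pi2_src1, WORD16 *pi2_src2,
                                    UWORD8 *pu1_dst, WORD32 src_strd1,
                                    WORD32 src_strd2, WORD32 dst_strd,
                                    WORD32 wgt0, WORD32 off0, WORD32 wgt1,
                                    WORD32 off1, WORD32 shift,
                                    WORD32 lvl_shift1, WORD32 lvl_shift2,
                                    WORD32 ht, WORD32 wd)
{
    WORD32 row, col, i4_tmp;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0;
            i4_tmp += (pi2_src2[col] + lvl_shift2) * wgt1;
            i4_tmp += (off0 + off1 + 1) << (shift - 1);
            i4_tmp >>= shift;
            pu1_dst[col] = (UWORD8)(i4_tmp < 0 ? 0 : (i4_tmp > 255 ? 255 : i4_tmp));
        }
        pi2_src1 += src_strd1;
        pi2_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}
#endif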

/**
*******************************************************************************
*
* @brief
*  Chroma bi-weighted prediction on the arrays pointed to by pi2_src1 and
* pi2_src2; stores the result at the location pointed to by pu1_dst.
*  Assumptions: the function is optimized assuming width and height are
* multiples of 2.
*
* @par Description:
*  dst = ((src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
* ((off0 + off1 + 1) << (shift - 1))) >> shift, applied with the Cb and Cr
* weights/offsets on alternate (interleaved) samples
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0_cb
*  Cb weight to be multiplied to source 1
*
* @param[in] wgt0_cr
*  Cr weight to be multiplied to source 1
*
* @param[in] off0_cb
*  Cb offset 0
*
* @param[in] off0_cr
*  Cr offset 0
*
* @param[in] wgt1_cb
*  Cb weight to be multiplied to source 2
*
* @param[in] wgt1_cr
*  Cr weight to be multiplied to source 2
*
* @param[in] off1_cb
*  Cb offset 1
*
* @param[in] off1_cr
*  Cr offset 1
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source (per chroma plane)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_bi_neonintr(WORD16 *pi2_src1,
                                            WORD16 *pi2_src2,
                                            UWORD8 *pu1_dst,
                                            WORD32 src_strd1,
                                            WORD32 src_strd2,
                                            WORD32 dst_strd,
                                            WORD32 wgt0_cb,
                                            WORD32 wgt0_cr,
                                            WORD32 off0_cb,
                                            WORD32 off0_cr,
                                            WORD32 wgt1_cb,
                                            WORD32 wgt1_cr,
                                            WORD32 off1_cb,
                                            WORD32 off1_cr,
                                            WORD32 shift,
                                            WORD32 lvl_shift1,
                                            WORD32 lvl_shift2,
                                            WORD32 ht,
                                            WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
    int32x4x2_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    int16x4_t tmp_wgt0_u, tmp_wgt0_v, tmp_wgt1_u, tmp_wgt1_v;
    int16x4x2_t wgt0, wgt1;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;

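    /* As in the chroma uni path, the Cb/Cr constants are zipped so that     */
    /* val[0] holds the pattern {cb, cr, cb, cr}, matching the interleaved   */
    /* sample layout.                                                        */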
    WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0_cb) + (lvl_shift2 * wgt1_cb);
    tmp_lvl_shift += ((off0_cb + off1_cb + 1) << (shift - 1));
    tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift = (lvl_shift1 * wgt0_cr) + (lvl_shift2 * wgt1_cr);
    tmp_lvl_shift += ((off0_cr + off1_cr + 1) << (shift - 1));
    tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);

    tmp_shift_t = vmovq_n_s32(tmp_shift);

    tmp_wgt0_u = vdup_n_s16(wgt0_cb);
    tmp_wgt0_v = vdup_n_s16(wgt0_cr);
    wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);
    tmp_wgt1_u = vdup_n_s16(wgt1_cb);
    tmp_wgt1_v = vdup_n_s16(wgt1_cr);
    wgt1 = vzip_s16(tmp_wgt1_u, tmp_wgt1_v);

    /* The i4_tmp1_* and i4_tmp2_* accumulators each hold one row: the       */
    /* height loop is unrolled by 2, so two rows are processed (and stored)  */
    /* per iteration.                                                        */
    /* vshlq_s32 with a negative shift count performs the right shift, as    */
    /* NEON has no variable right-shift intrinsic.                           */
    /* vcombine_u16 is used because vqmovun_s32 narrows to a uint16x4_t,     */
    /* which cannot be saturated and narrowed to 8 bits directly; combining  */
    /* into a uint16x8_t allows vqmovn_u16.                                  */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vmull_s16(pi2_src1_val1, wgt0.val[0]);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vmull_s16(pi2_src2_val1, wgt1.val[0]);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vmull_s16(pi2_src1_val2, wgt0.val[0]);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t.val[0]);

            i4_tmp2_t2 = vmull_s16(pi2_src2_val2, wgt1.val[0]);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t.val[0]);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - 2 * wd;
        pi2_src2 += 2 * src_strd2 - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_BI

/**
*******************************************************************************
*
* @brief
*  Does default bi-weighted prediction on the arrays pointed to by pi2_src1
* and pi2_src2 and stores the result at the location pointed to by pu1_dst.
*  Assumptions: the function is optimized assuming the width is a multiple
* of 4 and the height a multiple of 2.
*
* @par Description:
*  dst = ((src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)))
* >> shift, where shift = 15 - BitDepth
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  added before shift
*
* @param[in] lvl_shift2
*  added before shift
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_bi_default_neonintr(WORD16 *pi2_src1,
                                             WORD16 *pi2_src2,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd1,
                                             WORD32 src_strd2,
                                             WORD32 dst_strd,
                                             WORD32 lvl_shift1,
                                             WORD32 lvl_shift2,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;
    WORD32 shift;
    WORD32 tmp_shift;
    WORD32 tmp_lvl_shift;
    int16x4_t lvl_shift1_t;
    int16x4_t lvl_shift2_t;

    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    tmp_shift = 0 - shift;
    tmp_lvl_shift = 1 << (shift - 1);
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
    lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);

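    /* shift == SHIFT_14_MINUS_BIT_DEPTH + 1 == 15 - BitDepth: default       */
    /* bi-prediction is just the rounded average of the two 14-bit-precision */
    /* predictions, so no weights or offsets are involved.                   */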
    /* The i4_tmp1_* and i4_tmp2_* accumulators each hold one row: the       */
    /* height loop is unrolled by 2, so two rows are processed (and stored)  */
    /* per iteration.                                                        */
    /* vshlq_s32 with a negative shift count performs the right shift, as    */
    /* NEON has no variable right-shift intrinsic.                           */
    /* vcombine_u16 is used because vqmovun_s32 narrows to a uint16x4_t,     */
    /* which cannot be saturated and narrowed to 8 bits directly; combining  */
    /* into a uint16x8_t allows vqmovn_u16.                                  */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - wd;
        pi2_src2 += 2 * src_strd2 - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_BI_DEFAULT
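
/* A minimal scalar sketch of the default bi-prediction above, kept out of    */
/* the build with #if 0; the helper name is illustrative only. It is a        */
/* rounded average of the two 14-bit-precision predictions.                   */
#if 0
static void weighted_pred_bi_default_scalar(WORD16 *pi2_src1, WORD16 *pi2_src2,
                                            UWORD8 *pu1_dst, WORD32 src_strd1,
                                            WORD32 src_strd2, WORD32 dst_strd,
                                            WORD32 lvl_shift1,
                                            WORD32 lvl_shift2,
                                            WORD32 ht, WORD32 wd)
{
    WORD32 row, col, i4_tmp;
    WORD32 shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            i4_tmp = (pi2_src1[col] + lvl_shift1) + (pi2_src2[col] + lvl_shift2);
            i4_tmp += 1 << (shift - 1);
            i4_tmp >>= shift;
            pu1_dst[col] = (UWORD8)(i4_tmp < 0 ? 0 : (i4_tmp > 255 ? 255 : i4_tmp));
        }
        pi2_src1 += src_strd1;
        pi2_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}
#endif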

/**
*******************************************************************************
*
* @brief
*  Chroma default bi-weighted prediction on the arrays pointed to by
* pi2_src1 and pi2_src2; stores the result at the location pointed to by
* pu1_dst.
*  Assumptions: the function is optimized assuming width and height are
* multiples of 2.
*
* @par Description:
*  dst = ((src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)))
* >> shift, where shift = 15 - BitDepth
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  added before shift
*
* @param[in] lvl_shift2
*  added before shift
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source (per chroma plane)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_bi_default_neonintr(WORD16 *pi2_src1,
                                                    WORD16 *pi2_src2,
                                                    UWORD8 *pu1_dst,
                                                    WORD32 src_strd1,
                                                    WORD32 src_strd2,
                                                    WORD32 dst_strd,
                                                    WORD32 lvl_shift1,
                                                    WORD32 lvl_shift2,
                                                    WORD32 ht,
                                                    WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;
    WORD32 shift;
    WORD32 tmp_shift;
    WORD32 tmp_lvl_shift;
    int16x4_t lvl_shift1_t;
    int16x4_t lvl_shift2_t;
    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    tmp_shift = 0 - shift;
    tmp_lvl_shift = 1 << (shift - 1);
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
    lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);

    /* The i4_tmp1_* and i4_tmp2_* accumulators each hold one row: the       */
    /* height loop is unrolled by 2, so two rows are processed (and stored)  */
    /* per iteration.                                                        */
    /* vshlq_s32 with a negative shift count performs the right shift, as    */
    /* NEON has no variable right-shift intrinsic.                           */
    /* vcombine_u16 is used because vqmovun_s32 narrows to a uint16x4_t,     */
    /* which cannot be saturated and narrowed to 8 bits directly; combining  */
    /* into a uint16x8_t allows vqmovn_u16.                                  */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - 2 * wd;
        pi2_src2 += 2 * src_strd2 - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_BI_DEFAULT
