/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_intra_pred_filters_neon_intr.c
*
* @brief
*  Contains function definitions for intra prediction interpolation filters
*
*
* @author
*  Yogeswaran RS
*
* @par List of Functions:
*  - ihevc_intra_pred_luma_ref_substitution()
*  - ihevc_intra_pred_ref_filtering()
*  - ihevc_intra_pred_luma_planar()
*  - ihevc_intra_pred_luma_dc()
*  - ihevc_intra_pred_luma_horz()
*  - ihevc_intra_pred_luma_ver()
*  - ihevc_intra_pred_luma_mode2()
*  - ihevc_intra_pred_luma_mode_18_34()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <stdio.h>

#include "ihevc_typedefs.h"
#include "ihevc_intra_pred.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "arm_neon.h"
#include "ihevc_platform_macros.h"
#include "ihevc_common_tables.h"

/****************************************************************************/
/* Constant Macros                                                          */
/****************************************************************************/
#define MAX_CU_SIZE 64
#define BIT_DEPTH 8
#define T32_4NT 128
#define T16_4NT 64
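
/* T16_4NT and T32_4NT are the reference-array lengths (4 * nt) for nt = 16
 * and nt = 32; the substitution loops below walk over 4 * nt + 1 samples. */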



/*****************************************************************************/
/* Table Look-up                                                             */
/*****************************************************************************/

#define GET_BITS(y, x) (((y) >> (x)) & 1)
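
/* Example: with nbr_flags = 0x10100 (top-left and one top block available),
 * GET_BITS(nbr_flags, 16) == 1 and GET_BITS(nbr_flags, 4) == 0 (left absent). */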

/*****************************************************************************/
/* Function Definition                                                       */
/*****************************************************************************/

/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for pu1_ref substitution
*
*
* @par Description:
*    Reference substitution process for samples unavailable for prediction
*    Refer to section 8.4.4.2.2
*
* @param[in] pu1_top_left
*  UWORD8 pointer to the top-left
*
* @param[in] pu1_top
*  UWORD8 pointer to the top
*
* @param[in] pu1_left
*  UWORD8 pointer to the left
*
* @param[in] src_strd
*  WORD32 source stride
*
* @param[in] nt
*  WORD32 transform block size
*
* @param[in] nbr_flags
*  WORD32 neighbor availability flags
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  WORD32 destination stride
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/


void ihevc_intra_pred_luma_ref_substitution_neonintr(UWORD8 *pu1_top_left,
                                                     UWORD8 *pu1_top,
                                                     UWORD8 *pu1_left,
                                                     WORD32 src_strd,
                                                     WORD32 nt,
                                                     WORD32 nbr_flags,
                                                     UWORD8 *pu1_dst,
                                                     WORD32 dst_strd)
{
    UWORD8 pu1_ref;
    WORD32 dc_val, i;
    WORD32 total_samples = (4 * nt) + 1;
    WORD32 two_nt = 2 * nt;
    WORD32 three_nt = 3 * nt;
    WORD32 get_bits;
    WORD32 next;
    WORD32 bot_left, left, top, tp_right, tp_left;
    WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
    UNUSED(dst_strd);
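    /* 1 << (BIT_DEPTH - 1) = 128 for 8-bit content: the mid-grey value used
     * when no neighbouring sample is available. */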
    dc_val = 1 << (BIT_DEPTH - 1);

    /* Neighbor Flag Structure */
    /*    Top-Left | Top-Right | Top | Left | Bottom-Left
              1         4         4     4         4
     */
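
    /* Example: nbr_flags = 0x11111 sets one bit in each field above, i.e.
     * top-left, one top-right, one top, one left and one bottom-left block
     * are available; nbr_flags = 0 means no neighbour can be used. */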

    /* If no neighbor flags are present, fill the neighbor samples with DC value */
    if(nbr_flags == 0)
    {
        for(i = 0; i < total_samples; i++)
        {
            pu1_dst[i] = dc_val;
        }
    }
    else
    {
        /* Else fill the corresponding samples */
        pu1_dst[two_nt] = *pu1_top_left;
        UWORD8 *pu1_dst_tmp2 = pu1_dst;
        UWORD8 *pu1_top_tmp = pu1_top;
        pu1_dst_tmp2 += two_nt + 1;

        for(i = 0; i < two_nt; i++)
            pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];

        uint8x8_t src;
        for(i = two_nt; i > 0; i -= 8)
        {
            src = vld1_u8(pu1_top_tmp);
            pu1_top_tmp += 8;
            vst1_u8(pu1_dst_tmp2, src);
            pu1_dst_tmp2 += 8;
        }

        if(nt <= 8)
        {
            /* 1 bit extraction for all the neighboring blocks */
            tp_left = (nbr_flags & 0x10000) >> 16;
            bot_left = nbr_flags & 0x1;
            left = (nbr_flags & 0x10) >> 4;
            top = (nbr_flags & 0x100) >> 8;
            tp_right = (nbr_flags & 0x1000) >> 12;

            next = 1;

            /* If bottom-left is not available, reverse substitution process */
            if(bot_left == 0)
            {
                WORD32 a_nbr_flag[5] = { bot_left, left, tp_left, top, tp_right };

                /* Check for the 1st available sample from bottom-left */
                while(!a_nbr_flag[next])
                    next++;

                /* If left or top-left is available */
                if(next <= 2)
                {
                    idx = nt * next;
                    pu1_ref = pu1_dst[idx];
                    for(i = 0; i < idx; i++)
                        pu1_dst[i] = pu1_ref;
                }
                else /* If top or top-right is available */
                {
                    /* idx is changed to copy 1 pixel value for top-left, if top-left is not available */
                    idx = (nt * (next - 1)) + 1;
                    pu1_ref = pu1_dst[idx];
                    for(i = 0; i < idx; i++)
                        pu1_dst[i] = pu1_ref;
                }
            }

            /* Forward Substitution Process */
            /* If left is unavailable, copy the last bottom-left value */

            if(left == 0)
            {
                uint8x8_t dup_pu1_dst1;
                UWORD8 *pu1_dst_const_nt = pu1_dst;
                pu1_dst_const_nt += nt;

                if(0 == (nt & 7))
                {
                    dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_nt, dup_pu1_dst1);
                        pu1_dst_const_nt += 8;
                    }
                }
                else
                {
                    dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
                    for(i = nt; i > 0; i -= 4)
                    {
                        vst1_lane_u32((uint32_t *)pu1_dst_const_nt, vreinterpret_u32_u8(dup_pu1_dst1), 0);
                        pu1_dst_const_nt += 4;
                    }
                }
            }
            if(tp_left == 0)
                pu1_dst[two_nt] = pu1_dst[two_nt - 1];
            if(top == 0)
            {
                if(0 == (nt & 7))
                {
                    uint8x8_t dup_pu1_dst2;
                    UWORD8 *pu1_dst_const_two_nt_1 = pu1_dst;
                    pu1_dst_const_two_nt_1 += (two_nt + 1);
                    dup_pu1_dst2 = vdup_n_u8(pu1_dst[two_nt]);
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_two_nt_1, dup_pu1_dst2);
                        pu1_dst_const_two_nt_1 += 8;
                    }
                }
                else
                {
                    for(i = 0; i < nt; i++)
                        pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt];
                }
            }
            if(tp_right == 0)
            {
                uint8x8_t dup_pu1_dst3;
                UWORD8 *pu1_dst_const_three_nt_1 = pu1_dst;
                pu1_dst_const_three_nt_1 += (three_nt + 1);
                /* Top-right copies from the last top sample pu1_dst[three_nt],
                 * not from the top-left sample */
                dup_pu1_dst3 = vdup_n_u8(pu1_dst[three_nt]);
                if(0 == (nt & 7))
                {
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_three_nt_1, dup_pu1_dst3);
                        pu1_dst_const_three_nt_1 += 8;
                    }
                }
                else
                {
                    for(i = nt; i > 0; i -= 4)
                    {
                        vst1_lane_u32((uint32_t *)pu1_dst_const_three_nt_1, vreinterpret_u32_u8(dup_pu1_dst3), 0);
                        pu1_dst_const_three_nt_1 += 4;
                    }
                }
            }
        }
        if(nt == 16)
        {
            WORD32 nbr_flags_temp = 0;
            nbr_flags_temp = (nbr_flags & 0x3) + ((nbr_flags & 0x30) >> 2)
                            + ((nbr_flags & 0x300) >> 4)
                            + ((nbr_flags & 0x3000) >> 6)
                            + ((nbr_flags & 0x10000) >> 8);

            /* Compute trailing zeros based on nbr_flags for the substitution process of below-left (see section 8.4.4.2.2) */
            /* Each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right, but 1 pel for top-left */
            {
                nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below-left and left */

                if(nbr_id_from_bl == 64)
                    nbr_id_from_bl = 32;

                if(nbr_id_from_bl == 32)
                {
                    /* for top-left : 1 pel per nbr bit */
                    if(!((nbr_flags_temp >> 8) & 0x1))
                    {
                        nbr_id_from_bl++;
                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top-right; 8 pels per nbr bit */
                    }
                }
                /* Reverse Substitution Process */
                if(nbr_id_from_bl)
                {
                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
                    pu1_ref = pu1_dst[nbr_id_from_bl];
                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
                    {
                        pu1_dst[i] = pu1_ref;
                    }
                }
            }

            /* Loop over the 4*nt+1 pixels (excluding pixels computed from reverse substitution) */
            while(nbr_id_from_bl < ((T16_4NT) + 1))
            {
                /* To obtain the next unavailable idx flag after reverse neighbor substitution */
                /* Divide by 8 to obtain the original index */
                frwd_nbr_flag = (nbr_id_from_bl >> 3);

                /* The top-left flag is at the last bit location of nbr_flags */
                if(nbr_id_from_bl == (T16_4NT / 2))
                {
                    get_bits = GET_BITS(nbr_flags_temp, 8);

                    /* only pel substitution for TL */
                    if(!get_bits)
                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
                }
                else
                {
                    get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag);
                    if(!get_bits)
                    {
                        /* 8 pel substitution (other than TL) */
                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
                        for(i = 0; i < 8; i++)
                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
                    }
                }
                nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8;
            }
        }

        if(nt == 32)
        {
            /* Compute trailing zeros based on nbr_flags for the substitution process of below-left (see section 8.4.4.2.2) */
            /* Each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right, but 1 pel for top-left */
            {
                nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below-left and left */

                if(nbr_id_from_bl == 64)
                {
                    /* for top-left : 1 pel per nbr bit */
                    if(!((nbr_flags >> 16) & 0x1))
                    {
                        /* top-left not available */
                        nbr_id_from_bl++;
                        /* top and top-right; 8 pels per nbr bit */
                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8;
                    }
                }
                /* Reverse Substitution Process */
                if(nbr_id_from_bl)
                {
                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
                    pu1_ref = pu1_dst[nbr_id_from_bl];
                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
                        pu1_dst[i] = pu1_ref;
                }
            }

            /* Loop over the 4*nt+1 pixels (excluding pixels computed from reverse substitution) */
            while(nbr_id_from_bl < ((T32_4NT) + 1))
            {
                /* To obtain the next unavailable idx flag after reverse neighbor substitution */
                /* Divide by 8 to obtain the original index */
                frwd_nbr_flag = (nbr_id_from_bl >> 3);

                /* The top-left flag is at the last bit location of nbr_flags */
                if(nbr_id_from_bl == (T32_4NT / 2))
                {
                    get_bits = GET_BITS(nbr_flags, 16);
                    /* only pel substitution for TL */
                    if(!get_bits)
                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
                }
                else
                {
                    get_bits = GET_BITS(nbr_flags, frwd_nbr_flag);
                    if(!get_bits)
                    {
                        /* 8 pel substitution (other than TL) */
                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
                        for(i = 0; i < 8; i++)
                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
                    }
                }
                nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8;
            }
        }
    }
}

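/* For reference, the substituted array pu1_dst produced above is one line of
 * 4 * nt + 1 samples laid out as follows (a summary of the indexing used by
 * the loops above):
 *
 *     pu1_dst[0 .. nt-1]        bottom-left column, bottom-most sample first
 *     pu1_dst[nt .. 2*nt-1]     left column, bottom to top
 *     pu1_dst[2*nt]             top-left sample
 *     pu1_dst[2*nt+1 .. 3*nt]   top row, left to right
 *     pu1_dst[3*nt+1 .. 4*nt]   top-right row, left to right
 */
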
/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for ref_filtering
*
*
* @par Description:
*    Reference DC filtering for neighboring samples dependent on TU size and
*    mode. Refer to section 8.4.4.2.3 in the standard
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] nt
*  integer transform block size
*
* @param[in] mode
*  integer intra prediction mode
*
* @param[in] strong_intra_smoothing_enable_flag
*  WORD32 flag enabling strong (bilinear) smoothing for 32x32 TUs
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/


void ihevc_intra_pred_ref_filtering_neonintr(UWORD8 *pu1_src,
                                             WORD32 nt,
                                             UWORD8 *pu1_dst,
                                             WORD32 mode,
                                             WORD32 strong_intra_smoothing_enable_flag)
{
    WORD32 filter_flag;
    WORD32 i = 0;
    WORD32 four_nt = 4 * nt;

    WORD32 src_4nt;
    WORD32 src_0nt;
    /* Naming follows the functionality each variable serves, e.g. pu1_src_tmp_1 denotes pu1_src + 1, */
    /* src_val_1 loads from pu1_src_tmp_1, and add_res holds the result of adding two values          */
    UWORD8 *pu1_src_tmp_0 = pu1_src;
    UWORD8 *pu1_src_tmp_1;
    UWORD8 *pu1_src_tmp_2;
    UWORD8 *pu1_dst_tmp_0 = pu1_dst;
    UWORD8 *pu1_dst_tmp_1;

    uint8x8_t src_val_0, src_val_2;
    uint8x8_t src_val_1, shift_res;
    uint8x8_t dup_const_2;
    uint16x8_t mul_res, add_res;
    WORD32 bi_linear_int_flag = 0;
    WORD32 abs_cond_left_flag = 0;
    WORD32 abs_cond_top_flag = 0;
    WORD32 dc_val = 1 << (BIT_DEPTH - 5);
    shift_res = vdup_n_u8(0);

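    /* gau1_intra_pred_ref_filter holds one filter-enable bit per TU size for
     * each mode; CTZ(nt) - 2 maps nt = 4/8/16/32 to bit 0/1/2/3. */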
    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));

    if(0 == filter_flag)
    {
        if(pu1_src == pu1_dst)
        {
            return;
        }
        else
        {
            for(i = four_nt; i > 0; i -= 8)
            {
                src_val_0 = vld1_u8(pu1_src_tmp_0);
                pu1_src_tmp_0 += 8;
                vst1_u8(pu1_dst_tmp_0, src_val_0);
                pu1_dst_tmp_0 += 8;
            }
            pu1_dst[four_nt] = pu1_src[four_nt];
        }
    }

    else
    {
        /* If strong intra smoothing is enabled and transform size is 32 */
        if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
        {
            /* Strong Intra Filtering */
            abs_cond_top_flag = (ABS(pu1_src[2 * nt] + pu1_src[4 * nt]
                            - (2 * pu1_src[3 * nt]))) < dc_val;
            abs_cond_left_flag = (ABS(pu1_src[2 * nt] + pu1_src[0]
                            - (2 * pu1_src[nt]))) < dc_val;

            bi_linear_int_flag = ((1 == abs_cond_left_flag)
                            && (1 == abs_cond_top_flag));
        }

        src_4nt = pu1_src[4 * nt];
        src_0nt = pu1_src[0];
        /* Strong filtering of reference samples */
        if(1 == bi_linear_int_flag)
        {
            WORD32 two_nt = four_nt >> 1;

            WORD32 pu1_src_0_val = pu1_src[0];
            WORD32 pu1_src_2_nt_val = pu1_src[2 * nt];
            WORD32 pu1_src_4_nt_val = pu1_src[4 * nt];

            WORD32 prod_two_nt_src_0_val = two_nt * pu1_src_0_val;
            uint16x8_t prod_two_nt_src_0_val_t = vdupq_n_u16(prod_two_nt_src_0_val);

            WORD32 prod_two_nt_src_2_nt_val = two_nt * pu1_src_2_nt_val;
            uint16x8_t prod_two_nt_src_2_nt_val_t = vdupq_n_u16(prod_two_nt_src_2_nt_val);

            const UWORD8 *const_col_i;
            uint8x8_t const_col_i_val;
            uint16x8_t prod_val_1;
            uint16x8_t prod_val_2;
            uint16x8_t prod_val_3;
            uint16x8_t prod_val_4;
            uint8x8_t res_val_1;
            uint8x8_t res_val_2;
            uint8x8_t pu1_src_0_val_t = vdup_n_u8(pu1_src_0_val);
            uint8x8_t pu1_src_2_nt_val_t = vdup_n_u8(pu1_src_2_nt_val);
            uint8x8_t pu1_src_4_nt_val_t = vdup_n_u8(pu1_src_4_nt_val);
            pu1_dst_tmp_0 = pu1_dst + 1;
            pu1_dst_tmp_1 = pu1_dst + two_nt + 1;

            const_col_i = gau1_ihevc_planar_factor + 1;

            for(i = two_nt; i > 0; i -= 8)
            {
                const_col_i_val = vld1_u8(const_col_i);
                const_col_i += 8;

                prod_val_1 = vmlsl_u8(prod_two_nt_src_0_val_t, const_col_i_val, pu1_src_0_val_t);
                prod_val_2 = vmlal_u8(prod_val_1, const_col_i_val, pu1_src_2_nt_val_t);

                res_val_1 = vrshrn_n_u16(prod_val_2, 6);
                prod_val_3 = vmlsl_u8(prod_two_nt_src_2_nt_val_t, const_col_i_val, pu1_src_2_nt_val_t);

                vst1_u8(pu1_dst_tmp_0, res_val_1);
                pu1_dst_tmp_0 += 8;
                prod_val_4 = vmlal_u8(prod_val_3, const_col_i_val, pu1_src_4_nt_val_t);

                res_val_2 = vrshrn_n_u16(prod_val_4, 6);
                vst1_u8(pu1_dst_tmp_1, res_val_2);
                pu1_dst_tmp_1 += 8;
            }
            pu1_dst[2 * nt] = pu1_src[2 * nt];
        }
        else
        {
            pu1_src_tmp_1 = pu1_src + 1;
            pu1_src_tmp_2 = pu1_src + 2;
            pu1_dst_tmp_0 += 1;

            dup_const_2 = vdup_n_u8(2);

            /* Extremities untouched */
            pu1_dst[0] = pu1_src[0];

            /* To avoid problems when dest and src share the same pointer, the first load is done
             * outside the loop and each subsequent load is done before the store of the previous result */

            /* Perform bilinear filtering of reference samples */
            for(i = (four_nt - 1); i > 0; i -= 8)
            {
                src_val_0 = vld1_u8(pu1_src_tmp_0);
                pu1_src_tmp_0 += 8;

                src_val_2 = vld1_u8(pu1_src_tmp_2);
                pu1_src_tmp_2 += 8;

                src_val_1 = vld1_u8(pu1_src_tmp_1);
                pu1_src_tmp_1 += 8;

                if(i < four_nt - 1)
                {
                    vst1_u8(pu1_dst_tmp_0, shift_res);
                    pu1_dst_tmp_0 += 8;
                }

                add_res = vaddl_u8(src_val_0, src_val_2);

                mul_res = vmlal_u8(add_res, src_val_1, dup_const_2);
                shift_res = vrshrn_n_u16(mul_res, 2);
            }
            vst1_u8(pu1_dst_tmp_0, shift_res);
            pu1_dst_tmp_0 += 8;
        }
        pu1_dst[4 * nt] = src_4nt;
        pu1_dst[0] = src_0nt;
    }

}
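
/* A minimal scalar sketch of the [1 2 1] / 4 smoothing performed by the NEON
 * loop above (vaddl, then vmlal by 2, then vrshrn by 2). Illustrative only;
 * the function name is hypothetical and nothing below is part of the codec:
 *
 *     static void ref_filter_121_sketch(const UWORD8 *pu1_src,
 *                                       UWORD8 *pu1_dst, WORD32 nt)
 *     {
 *         WORD32 i, four_nt = 4 * nt;
 *         pu1_dst[0] = pu1_src[0];
 *         for(i = 1; i < four_nt; i++)
 *             pu1_dst[i] = (pu1_src[i - 1] + 2 * pu1_src[i]
 *                             + pu1_src[i + 1] + 2) >> 2;
 *         pu1_dst[four_nt] = pu1_src[four_nt];
 *     }
 */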



/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for luma planar
*
* @par Description:
*    Planar intra prediction with reference neighboring samples location
*    pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer transform block size
*
* @param[in] wd
*  integer width of the array
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_planar_neonintr(UWORD8 *pu1_ref,
                                           WORD32 src_strd,
                                           UWORD8 *pu1_dst,
                                           WORD32 dst_strd,
                                           WORD32 nt,
                                           WORD32 mode)
{
    /* Variables are named after what they hold: (nt - 1 - col) --> const_nt_1_col (const denotes gau1_ihevc_planar_factor). */
    /* const_nt_1_col values are loaded into a d register.                                                                   */
    /* pu1_ref[nt - 1] --> pu1_ref_nt_1; its value duplicated to a d register is pu1_ref_nt_1_dup.                           */
    /* log2nt + 1 is taken care of while assigning the values themselves.                                                    */
    /* In the width-multiple-of-4 case the row loop is also unrolled by 2 and the store is handled accordingly.              */

    WORD32 row, col = 0;
    WORD32 log2nt_plus1 = 6;
    WORD32 two_nt, three_nt;
    UWORD8 *pu1_ref_two_nt_1;
    UWORD8 *pu1_dst_tmp;
    const UWORD8 *const_nt_1_col;
    uint8x8_t const_nt_1_col_t;
    const UWORD8 *const_col_1;
    uint8x8_t const_col_1_t;
    uint8_t const_nt_1_row;
    uint8x8_t const_nt_1_row_dup;
    uint8_t const_row_1;
    uint8x8_t const_row_1_dup;
    uint8_t const_nt = nt;
    uint16x8_t const_nt_dup;
    uint8_t pu1_ref_nt_1 = pu1_ref[nt - 1];
    uint8x8_t pu1_ref_nt_1_dup;
    uint8_t pu1_ref_two_nt_1_row;
    uint8_t pu1_ref_three_nt_1;
    uint8x8_t pu1_ref_two_nt_1_row_dup;
    uint8x8_t pu1_ref_two_nt_1_t;
    uint8x8_t pu1_ref_three_nt_1_dup;
    uint16x8_t prod_t1;
    uint16x8_t prod_t2;
    uint16x8_t sto_res_tmp;
    uint8x8_t sto_res;
    int16x8_t log2nt_dup;
    UNUSED(src_strd);
    UNUSED(mode);
    log2nt_plus1 = 32 - CLZ(nt);
    two_nt = 2 * nt;
    three_nt = 3 * nt;
    /* Loops have been unrolled considering the fact that the width is a multiple of 8 */
    if(0 == (nt & 7))
    {
        pu1_dst_tmp = pu1_dst;
        const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;

        const_col_1 = gau1_ihevc_planar_factor + 1;
        pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];

        pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
        const_nt_dup = vdupq_n_u16(const_nt);

        log2nt_dup = vdupq_n_s16(log2nt_plus1);
        log2nt_dup = vnegq_s16(log2nt_dup);

        pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);

        for(row = 0; row < nt; row++)
        {
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
            pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);

            const_nt_1_row = nt - 1 - row;
            const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);

            const_row_1 = row + 1;
            const_row_1_dup = vdup_n_u8(const_row_1);

            const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;

            const_col_1 = gau1_ihevc_planar_factor + 1;
            pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;

            for(col = nt; col > 0; col -= 8)
            {
                const_nt_1_col_t = vld1_u8(const_nt_1_col);
                const_nt_1_col -= 8;
                const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);

                const_col_1_t = vld1_u8(const_col_1);
                const_col_1 += 8;
                prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);

                pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
                pu1_ref_two_nt_1 += 8;
                prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);

                prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
                prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);
                prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
                prod_t1 = vaddq_u16(prod_t1, prod_t2);

                sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
                sto_res = vmovn_u16(sto_res_tmp);
                vst1_u8(pu1_dst_tmp, sto_res);
                pu1_dst_tmp += 8;
            }
            pu1_dst_tmp += dst_strd - nt;
        }
    }
    /* Loops have been unrolled considering the fact that the width is a multiple of 4 */
    /* If the column count is a multiple of 4, the rows are handled two at a time      */
    else
    {
        uint8x8_t const_row_1_dup1;
        uint8x8_t pu1_ref_two_nt_1_t1;
        uint8x8_t const_nt_1_col_t1;
        uint8x8_t const_col_1_t1;
        uint8x8_t pu1_ref_two_nt_1_row_dup1;
        uint8x8_t const_nt_1_row_dup1;

        pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];

        pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
        const_nt_dup = vdupq_n_u16(const_nt);

        log2nt_dup = vdupq_n_s16(log2nt_plus1);
        log2nt_dup = vnegq_s16(log2nt_dup);

        pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);

        for(row = 0; row < nt; row += 2)
        {
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
            pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 2 - row];
            pu1_ref_two_nt_1_row_dup1 = vdup_n_u8(pu1_ref_two_nt_1_row);
            pu1_ref_two_nt_1_row_dup = vext_u8(pu1_ref_two_nt_1_row_dup, pu1_ref_two_nt_1_row_dup1, 4);

            const_nt_1_row = nt - 1 - row;
            const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);
            const_nt_1_row = nt - 2 - row;
            const_nt_1_row_dup1 = vdup_n_u8(const_nt_1_row);
            const_nt_1_row_dup = vext_u8(const_nt_1_row_dup, const_nt_1_row_dup1, 4);

            const_row_1 = row + 1;
            const_row_1_dup = vdup_n_u8(const_row_1);
            const_row_1 = row + 2;
            const_row_1_dup1 = vdup_n_u8(const_row_1);
            const_row_1_dup = vext_u8(const_row_1_dup, const_row_1_dup1, 4);

            const_nt_1_col = gau1_ihevc_planar_factor + nt - 4;

            const_col_1 = gau1_ihevc_planar_factor + 1;

            pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;

            for(col = nt; col > 0; col -= 4)
            {
                const_nt_1_col_t = vld1_u8(const_nt_1_col);
                const_nt_1_col -= 4;
                const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);

                const_col_1_t = vld1_u8(const_col_1);
                const_col_1 += 4;
                const_nt_1_col_t1 = vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(const_nt_1_col_t), 32));

                pu1_dst_tmp = pu1_dst;
                const_nt_1_col_t = vext_u8(const_nt_1_col_t, const_nt_1_col_t1, 4);

                const_col_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(const_col_1_t), 32));
                prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);

                pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
                pu1_ref_two_nt_1 += 4;
                const_col_1_t = vext_u8(const_col_1_t1, const_col_1_t, 4);

                pu1_ref_two_nt_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(pu1_ref_two_nt_1_t), 32));
                prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);

                pu1_ref_two_nt_1_t = vext_u8(pu1_ref_two_nt_1_t1, pu1_ref_two_nt_1_t, 4);
                prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);

                prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
                prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
                prod_t1 = vaddq_u16(prod_t1, prod_t2);

                sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
                sto_res = vmovn_u16(sto_res_tmp);

                vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
                pu1_dst_tmp += dst_strd;

                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
                pu1_dst += 4;
            }
            pu1_dst += 2 * dst_strd - nt;
        }
    }

}
/* INTRA_PRED_LUMA_PLANAR */
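
/* Scalar form of the planar prediction computed by the NEON code above (a
 * sketch matching its arithmetic; two_nt = 2 * nt, three_nt = 3 * nt):
 *
 *     pu1_dst[row * dst_strd + col] =
 *         ((nt - 1 - col) * pu1_ref[two_nt - 1 - row]
 *            + (col + 1)    * pu1_ref[three_nt + 1]
 *          + (nt - 1 - row) * pu1_ref[two_nt + 1 + col]
 *            + (row + 1)    * pu1_ref[nt - 1]
 *            + nt) >> log2nt_plus1;
 */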

/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for luma dc
*
* @par Description:
*    Intra prediction for DC mode with reference neighboring samples location
*    pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer transform block size
*
* @param[in] wd
*  integer width of the array
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_dc_neonintr(UWORD8 *pu1_ref,
                                       WORD32 src_strd,
                                       UWORD8 *pu1_dst,
                                       WORD32 dst_strd,
                                       WORD32 nt,
                                       WORD32 mode)
{
    WORD32 dc_val = 0, two_dc_val = 0, three_dc_val = 0;
    WORD32 i = 0;
    WORD32 row = 0, col = 0, col_count;
    WORD32 log2nt_plus1 = 6;
    WORD32 two_nt = 0;
    uint16x8_t ref_load_q;
    uint16x8_t three_dc_val_t;
    uint8x8_t sto_res_tmp;
    uint8x8_t sto_res_tmp1;
    uint8x8_t sto_res_tmp2;
    uint8x8_t sto_res_tmp3;
    uint8x8_t sto_res_tmp4;
    uint8x8_t dc_val_t;

    UWORD8 *pu1_ref_tmp;
    UWORD8 *pu1_ref_tmp1;
    UWORD8 *pu1_dst_tmp;
    UWORD8 *pu1_dst_tmp1;
    UWORD8 *pu1_dst_tmp2;
    UNUSED(src_strd);
    UNUSED(mode);

    /* log2nt + 1 is taken care of while assigning the value itself          */
    log2nt_plus1 = 32 - CLZ(nt);

    /* Loops have been unrolled considering the fact that the width is a multiple of 8 */
    if(0 == (nt & 7))
    {
        uint8x8_t ref_load1;
        uint8x8_t ref_load2;
        uint16x4_t acc_dc_pair1;
        uint32x2_t acc_dc_pair2;
        uint64x1_t acc_dc = vdup_n_u64(col);

        two_nt = 2 * nt;
        pu1_ref_tmp = pu1_ref + nt;
        pu1_ref_tmp1 = pu1_ref + two_nt + 1;

        for(i = two_nt; i > nt; i -= 8)
        {
            ref_load1 = vld1_u8(pu1_ref_tmp);
            pu1_ref_tmp += 8;
            acc_dc_pair1 = vpaddl_u8(ref_load1);

            ref_load2 = vld1_u8(pu1_ref_tmp1);
            pu1_ref_tmp1 += 8;

            acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
            acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);

            acc_dc_pair1 = vpaddl_u8(ref_load2);
            acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
            acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);
        }

        dc_val = (vget_lane_u32(vreinterpret_u32_u64(acc_dc), 0) + nt) >> (log2nt_plus1);
        dc_val_t = vdup_n_u8(dc_val);
        two_dc_val = 2 * dc_val;
        three_dc_val = 3 * dc_val;
        three_dc_val += 2;

        three_dc_val_t = vdupq_n_u16((WORD16)three_dc_val);
        pu1_ref_tmp = pu1_ref + two_nt + 1;
        pu1_dst_tmp = pu1_dst;


        if(nt == 32)
        {
            for(row = 0; row < nt; row++)
            {
                for(col = nt; col > 0; col -= 8)
                {
                    vst1_u8(pu1_dst_tmp, dc_val_t);
                    pu1_dst_tmp += 8;
                }
                pu1_dst_tmp += dst_strd - nt;
            }
        }
        else
        {
            for(col = nt; col > 0; col -= 8)
            {
                ref_load1 = vld1_u8(pu1_ref_tmp);
                pu1_ref_tmp += 8;
                ref_load_q = vmovl_u8(ref_load1);
                ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
                ref_load_q = vshrq_n_u16(ref_load_q, 2);
                sto_res_tmp = vmovn_u16(ref_load_q);
                vst1_u8(pu1_dst_tmp, sto_res_tmp);
                pu1_dst_tmp += 8;
            }

            pu1_ref_tmp = pu1_ref + two_nt - 9;
            pu1_dst_tmp = pu1_dst + dst_strd;
            col_count = nt - 8;

            /* Except the first row, the remaining rows are done here                          */
            /* Both columns and rows have been unrolled by 8                                   */
            /* The stores take care of the unrolling                                           */
            /* Except for the 1st column of the remaining rows (other than the 1st row), the   */
            /* values are constant, hence extracted with a constant value and stored           */
            /* If the column count is greater than 8, the remaining values are also constant,  */
            /* which is handled in the inner for loop                                          */

            for(row = nt; row > 0; row -= 8)
            {
                pu1_dst_tmp1 = pu1_dst_tmp + 8;
                ref_load1 = vld1_u8(pu1_ref_tmp);
                pu1_ref_tmp -= 8;
                ref_load_q = vmovl_u8(ref_load1);
                ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
                ref_load_q = vshrq_n_u16(ref_load_q, 2);
                sto_res_tmp = vmovn_u16(ref_load_q);

                sto_res_tmp1 = vext_u8(sto_res_tmp, dc_val_t, 7);

                sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 8));
                sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp1);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 16));
                sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp2);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 24));
                sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp3);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 32));
                sto_res_tmp1 = vext_u8(sto_res_tmp1, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp4);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 40));
                sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp1);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 48));
                sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp2);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 56));
                sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp3);
                pu1_dst_tmp += dst_strd;
                /* For the last set of 8 rows only 7 rows need updating since the first row is already written */
                if(row != 8)
                    vst1_u8(pu1_dst_tmp, sto_res_tmp4);
                pu1_dst_tmp += dst_strd;

                for(col = col_count; col > 0; col -= 8)
                {
                    pu1_dst_tmp2 = pu1_dst_tmp1;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;

                    /* For the last set of 8 rows only 7 rows need updating since the first row is already written */
                    if(row != 8)
                        vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 = pu1_dst_tmp2 + 8;
                }
            }
            pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
        }
    }
    /* Loops have been unrolled considering the fact that the width is a multiple of 4 */
    else
    {
        WORD32 acc_dc;
        two_nt = 2 * nt;

        acc_dc = 0;
        pu1_ref_tmp = pu1_ref + nt + 1;
        for(i = nt; i < two_nt; i++)
        {
            acc_dc += pu1_ref[i];
            acc_dc += pu1_ref_tmp[i];
        }
        dc_val = (acc_dc + nt) >> (log2nt_plus1);
        two_dc_val = 2 * dc_val;
        three_dc_val = 3 * dc_val;
        three_dc_val = three_dc_val + 2;
        dc_val_t = vdup_n_u8(dc_val);

        if(nt == 32)
        {
            pu1_dst_tmp = pu1_dst;
            for(row = 0; row < nt; row++)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
                    pu1_dst_tmp += 4;
                }
                pu1_dst_tmp += dst_strd - nt;
            }
        }
        else
        {
            for(col = 1; col < nt; col++)
            {
                pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + three_dc_val) >> 2;
            }

            pu1_dst_tmp = pu1_dst + dst_strd;
            /* Since the first row is already updated above, the loop count is nt - 1 */
            for(row = nt - 1; row > 0; row -= 1)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
                    pu1_dst_tmp += 4;
                }
                pu1_dst_tmp += dst_strd - nt;
            }

            for(row = 1; row < nt; row++)
            {
                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val) >> 2;
            }
            pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
        }
    }
}
/* INTRA_PRED_LUMA_DC */
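
/* Scalar form of the DC prediction above (a sketch; two_nt = 2 * nt):
 *
 *     dc_val = (sum of pu1_ref[nt .. two_nt - 1]          // left column
 *               + sum of pu1_ref[two_nt + 1 .. 3 * nt]    // top row
 *               + nt) >> log2nt_plus1;
 *
 *     // For nt < 32 the first row and column are smoothed towards dc_val:
 *     pu1_dst[0]              = (pu1_ref[two_nt - 1] + 2 * dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
 *     pu1_dst[col]            = (pu1_ref[two_nt + 1 + col] + 3 * dc_val + 2) >> 2;  // col = 1 .. nt-1
 *     pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + 3 * dc_val + 2) >> 2;  // row = 1 .. nt-1
 */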

/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for horizontal luma variable
*
* @par Description:
*    Horizontal intra prediction with reference neighboring samples location
*    pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer transform block size
*
* @param[in] wd
*  integer width of the array
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_horz_neonintr(UWORD8 *pu1_ref,
                                         WORD32 src_strd,
                                         UWORD8 *pu1_dst,
                                         WORD32 dst_strd,
                                         WORD32 nt,
                                         WORD32 mode)
{
    WORD32 row, col;
    WORD32 two_nt;
    UNUSED(src_strd);
    UNUSED(mode);

    two_nt = 2 * nt;

    UWORD8 *pu1_dst_tmp = pu1_dst;
    UWORD32 pu1_val;
    uint8x8_t pu1_val_two_nt_1_row;
    if(nt == 32)
    {
        pu1_dst_tmp = pu1_dst;
        for(row = 0; row < nt; row++)
        {
            pu1_val = pu1_ref[two_nt - 1 - row];
            pu1_val_two_nt_1_row = vdup_n_u8(pu1_val);
            for(col = nt; col > 0; col -= 8)
            {
                vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_row);
                pu1_dst_tmp += 8;
            }
            pu1_dst_tmp += dst_strd - nt;
        }
    }
    else

    /* The row loop has been unrolled, hence the pu1_ref_val1 and pu1_ref_val2 variables  */
    /* Variables are named after the operation (instruction) they perform                 */
    /* (e.g. shift_val contains the shifted value,                                        */
    /* add_sat the add-and-saturate result)                                               */
    /* Loops are unrolled by 4 and 8 since the input width is a multiple of 4 or 8        */
    /* Rows and columns are unrolled by 4 when the width is a multiple of 4               */
    {
        if(0 != (nt & 7))      /* cond for multiple of 4 */
        {
            UWORD8 *pu1_ref_4_two_nt_plus1 = pu1_ref;
            UWORD8 *pu1_ref_4_two_nt_minus_nt = pu1_ref;
            UWORD8 *pu1_dst_4 = pu1_dst;
            UWORD8 *pu1_dst_4_tmp = pu1_dst;

            uint32x2_t pu1_ref_val1, pu1_ref_val2;
            uint8x8_t dup_sub, round_val, dup_val;
            uint16x8_t dup_add, sub_val;
            int16x8_t shift_val, add_sat;

            pu1_ref_val1 = vdup_n_u32(0);
            pu1_ref_val2 = vdup_n_u32(0);

            dup_sub = vdup_n_u8(pu1_ref[two_nt]);

            dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);

            pu1_ref_4_two_nt_plus1 += (two_nt + 1);

            pu1_ref_4_two_nt_minus_nt += (two_nt - nt);

            for(row = nt; row > 0; row -= 4)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_plus1, pu1_ref_val1, 0);
                    sub_val = vsubl_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_sub);
                    shift_val = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);

                    add_sat = vqaddq_s16(shift_val, vreinterpretq_s16_u16(dup_add));
                    round_val = vqmovun_s16(add_sat);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(round_val), 0);
                    pu1_dst_4 += dst_strd;

                    pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_minus_nt, pu1_ref_val2, 0);
                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 2);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;

                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 1);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;

                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 0);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;
                }
                /* worst cases */
                pu1_ref_4_two_nt_minus_nt += 3;
                pu1_ref_4_two_nt_plus1 += 4;
                pu1_dst_4 = (pu1_dst_4_tmp + 4);
            }
        }

        /* dup_1 - dup_8 hold the duplicated values from the loaded source               */
        /* Variables are named after the operation (instruction) they perform            */
        /* Loops are unrolled by 4 and 8 since the input width is a multiple of 4 or 8   */
        /* Rows and columns are unrolled by 8 when the width is a multiple of 8          */
        else
        {
            UWORD8 *pu1_ref_tmp_1 = pu1_ref;
            UWORD8 *pu1_ref_tmp_2 = pu1_ref;

            UWORD8 *pu1_dst_tmp_1 = pu1_dst;
            UWORD8 *pu1_dst_tmp_2 = pu1_dst + dst_strd;
            UWORD8 *pu1_dst_tmp_3 = pu1_dst + dst_strd;

            uint8x8_t dup_sub, src_tmp, src_tmp_1, round_val, dup_1, dup_2, dup_3, dup_4, dup_5, dup_6, dup_7, dup_8, rev_res;
            uint16x8_t sub_res, dup_add;
            int16x8_t shift_res, add_res;

            dup_sub = vdup_n_u8(pu1_ref[two_nt]);
            dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);

            pu1_ref_tmp_1 += (two_nt + 1);
            pu1_ref_tmp_2 += (two_nt - 1);

            for(col = nt; col > 0; col -= 8)
            {
                src_tmp = vld1_u8(pu1_ref_tmp_1);
                pu1_ref_tmp_1 += 8;

                sub_res = vsubl_u8(src_tmp, dup_sub);
                shift_res = vshrq_n_s16(vreinterpretq_s16_u16(sub_res), 1);
                add_res = vqaddq_s16(shift_res, vreinterpretq_s16_u16(dup_add));
                round_val = vqmovun_s16(add_res);
                vst1_u8(pu1_dst_tmp_1, round_val);
                pu1_dst_tmp_1 += 8;
            }

            for(row = nt; row > 0; row -= 8)
            {
                pu1_ref_tmp_2 -= 8;

                src_tmp_1 = vld1_u8(pu1_ref_tmp_2);
                rev_res = vrev64_u8(src_tmp_1); /* Reversing the loaded values */

                dup_1 = vdup_lane_u8(rev_res, 0);
                dup_2 = vdup_lane_u8(rev_res, 1);
                dup_3 = vdup_lane_u8(rev_res, 2);
                dup_4 = vdup_lane_u8(rev_res, 3);
                dup_5 = vdup_lane_u8(rev_res, 4);
                dup_6 = vdup_lane_u8(rev_res, 5);
                dup_7 = vdup_lane_u8(rev_res, 6);
                dup_8 = vdup_lane_u8(rev_res, 7);

                for(col = nt; col > 0; col -= 8)
                {
                    pu1_dst_tmp_2 = pu1_dst_tmp_3;

                    vst1_u8(pu1_dst_tmp_2, dup_1);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_3);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_4);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_5);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_6);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_7);
                    pu1_dst_tmp_2 += dst_strd;

                    /* For the last set of 8 rows only 7 rows need updating since the first row is already written */
                    if(row != 8)
                        vst1_u8(pu1_dst_tmp_2, dup_8);
                    pu1_dst_tmp_2 += dst_strd;

                    pu1_dst_tmp_3 += 8;
                }
                pu1_dst_tmp_2 -= (nt - 8);
                pu1_dst_tmp_3 = pu1_dst_tmp_2;
            }
        }
    }
}
/* INTRA_PRED_LUMA_HORZ */
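
/* Scalar form of the horizontal prediction above (a sketch; two_nt = 2 * nt,
 * where clip() denotes saturation to [0, 255] as done by vqmovun):
 *
 *     // first row (nt < 32): gradient-filtered against the top neighbours
 *     pu1_dst[col] = clip(pu1_ref[two_nt - 1]
 *                      + ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1));
 *
 *     // remaining rows: replicate the left neighbour across the row
 *     pu1_dst[row * dst_strd + col] = pu1_ref[two_nt - 1 - row];
 */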

/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for vertical luma variable
*
* @par Description:
*    Vertical intra prediction with reference neighboring samples location
*    pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer transform block size
*
* @param[in] wd
*  integer width of the array
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_ver_neonintr(UWORD8 *pu1_ref,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_dst,
                                        WORD32 dst_strd,
                                        WORD32 nt,
                                        WORD32 mode)
{
    WORD32 row, col;
    WORD32 two_nt;
    UNUSED(src_strd);
    UNUSED(mode);

    two_nt = 2 * nt;

    UWORD8 *pu1_dst_tmp = pu1_dst;
    UWORD8 *pu1_ref_tmp_1 = pu1_ref + two_nt + 1;
    uint8x8_t pu1_val_two_nt_1_col;
    if(nt == 32)
    {
        pu1_dst_tmp = pu1_dst;
        for(row = 0; row < nt; row++)
        {
            for(col = nt; col > 0; col -= 8)
            {
                pu1_val_two_nt_1_col = vld1_u8(pu1_ref_tmp_1);
                pu1_ref_tmp_1 += 8;
                vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_col);
                pu1_dst_tmp += 8;
            }
            pu1_ref_tmp_1 -= nt;
            pu1_dst_tmp += dst_strd - nt;
        }
    }
    else
    {
        /* Variables are named after the operation (instruction) they perform           */
        /* (e.g. shift_val contains the shifted value,                                  */
        /* add_sat the add-and-saturate result)                                         */
        /* Loops are unrolled by 4 and 8 since the input width is a multiple of 4 or 8  */
        /* Rows and columns are unrolled by 4 when the width is a multiple of 4         */

        if(0 != (nt & 7))
        {
            WORD32 cond_4 = 0;
            UWORD8 *pu1_ref_val1 = pu1_ref;
            UWORD8 *pu1_ref_val2 = pu1_ref;
            UWORD8 *pu1_ref_val3 = pu1_ref;

            UWORD8 *pu1_dst_val1 = pu1_dst;
            UWORD8 *pu1_dst_val2 = pu1_dst;
            UWORD8 *pu1_dst_val3 = pu1_dst;

            uint8x8_t dup_2_sub, round_val, vext_val;
            uint16x8_t dup_2_add;
            uint32x2_t src_val1, src_val2, src_val3;
            uint16x8_t sub_val;
            int16x8_t shift_val1, add_sat;
            uint64x1_t shift_val2;

            src_val1 = vdup_n_u32(0);
            src_val2 = vdup_n_u32(0);
            src_val3 = vdup_n_u32(0);
            pu1_ref_val1 += (two_nt - nt);
            pu1_ref_val3 += (two_nt + 2);
            pu1_ref_val2 += (two_nt + 1);

            dup_2_sub = vdup_n_u8(pu1_ref[two_nt]);
            dup_2_add = vdupq_n_u16(pu1_ref[two_nt + 1]);

            /* loops to store the first nt sets of values in the destination */

            for(row = nt; row > 0; row -= 4)
            {
                for(col = nt; (col > 0) && (cond_4 == 0); col -= 4)
                {
                    /* Unrolling s2_predpixel = pu1_ref[two_nt + 1] + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1); here */
                    src_val1 = vld1_lane_u32((uint32_t *)pu1_ref_val1, src_val1, 1);
                    sub_val = vsubl_u8(vreinterpret_u8_u32(src_val1), dup_2_sub);
                    shift_val1 = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
                    add_sat = vqaddq_s16(shift_val1, vreinterpretq_s16_u16(dup_2_add));
                    round_val = vqmovun_s16(add_sat);

                    /* Unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here */
                    src_val2 = vld1_lane_u32((uint32_t *)pu1_ref_val3, src_val2, 0);
                    vext_val = vext_u8(round_val, vreinterpret_u8_u32(src_val2), 7);
                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
                    pu1_dst_val1 += dst_strd;

                    shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);

                    vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
                    pu1_dst_val1 += dst_strd;

                    shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);

                    vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
                    pu1_dst_val1 += dst_strd;

                    shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);

                    vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
                    pu1_dst_val1 += dst_strd;

                    pu1_ref_val1 -= 4;
                }

                /* loop to store the next sets of eight values in the destination */

                for(col = nt - 3; (col > 0) && (cond_4 == 1); col -= 4)
                {
                    src_val3 = vld1_lane_u32((uint32_t *)pu1_ref_val2, src_val3, 0);

                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
                    pu1_dst_val2 += dst_strd;

                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
                    pu1_dst_val2 += dst_strd;

                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
                    pu1_dst_val2 += dst_strd;

                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
                    pu1_dst_val2 += dst_strd;
1545                }
1546                pu1_ref_val2 += 4;
1547                pu1_dst_val3 += 4;
1548                pu1_dst_val2 = pu1_dst_val3;
1549                cond_4 = 1;
1550            }
1551        }
1552
        /* Rows and columns are unrolled by 8 when the width is a multiple of 8         */
        else
        {
            WORD32 cond = 0, col_1;
            UWORD8 *pu1_dst_tmp_1 = pu1_dst;
            UWORD8 *pu1_dst_tmp_2 = pu1_dst;
            UWORD8 *pu1_dst_tmp_3 = pu1_dst;

            UWORD8 *pu1_ref_tmp_1 = pu1_ref;
            UWORD8 *pu1_ref_tmp_2 = pu1_ref;
            UWORD8 *pu1_ref_tmp_3 = pu1_ref;

            uint8x8_t pu1_src_tmp1;
            uint8x8_t pu1_src_tmp2;

            uint8x8_t dup_sub;
            uint16x8_t dup_add;
            int16x8_t subsh_val;
            int16x8_t addsat_val;
            uint16x8_t sub_val;
            uint8x8_t round_val;
            uint8x8_t vext_t;
            uint64x1_t shift_64;

            dup_sub = vdup_n_u8(pu1_ref[two_nt]);
            dup_add = vdupq_n_u16(pu1_ref[two_nt + 1]);

            pu1_ref_tmp_1 += (two_nt);
            pu1_ref_tmp_1 -= 8;
            pu1_ref_tmp_2 += (two_nt + 2);
            pu1_ref_tmp_3 += (two_nt + 1);

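            /* round_val packs the filtered first-column pixels for 8 rows,
               with row 0 in lane 7 (the left references sit just below
               two_nt in reverse row order). Each vext below splices lane 7
               of a progressively left-shifted copy in front of the top
               reference bytes, forming one full output row per store. */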
            /* loops to store the first nt sets of values in the destination */

            for(row = nt; row > 0; row -= 8)
            {
                for(col = (nt - 1); (col > 0) && (cond == 0); col -= 8)
                {
                    pu1_src_tmp1 = vld1_u8(pu1_ref_tmp_1);

                    sub_val = vsubl_u8(pu1_src_tmp1, dup_sub);
                    subsh_val  = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
                    addsat_val = vqaddq_s16(subsh_val, vreinterpretq_s16_u16(dup_add));
                    round_val = vqmovun_s16(addsat_val);

                    /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here*/

                    pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_2);
                    vext_t = vext_u8(round_val, pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);

                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 32);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 40);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 48);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 56);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    pu1_ref_tmp_1 -= 8;
                }

                /* loop to store next sets of eight values in the destination */

                for(col_1 = nt - 7; (col_1 > 0) && (cond == 1); col_1 -= 8)
                {
                    pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_3);

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;
                }
                pu1_ref_tmp_3 += 8;
                pu1_dst_tmp_3 += 8;
                pu1_dst_tmp_2 = pu1_dst_tmp_3;
                cond = 1;
            }
        }
    }
}
/* INTRA_PRED_LUMA_VER */

/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for luma mode 2.
*
* @par Description:
*    Intraprediction for mode 2 (sw angle) with reference neighboring samples
*    location pointed by 'pu1_ref' to the TU block location pointed by
*    'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_mode2_neonintr(UWORD8 *pu1_ref,
                                          WORD32 src_strd,
                                          UWORD8 *pu1_dst,
                                          WORD32 dst_strd,
                                          WORD32 nt,
                                          WORD32 mode)
{

    WORD32 row, col;
    WORD32 two_nt;
    UNUSED(src_strd);
    UNUSED(mode);

    /* rev_res holds the reversed result value                                             */
    /* Loops are unrolled by 4 and 8, since the input width is a multiple of 4 or 8        */
    /* Rows and columns are unrolled by 4 when the width is a multiple of 4                */
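    /* A minimal scalar sketch of the statement unrolled below (quoted from
       the comment inside the loop); for mode 2 the angle is fixed at 32,
       so idx == (((row + 1) * 32) >> 5) == row + 1 here:
           for(col = 0; col < nt; col++)
               for(row = 0; row < nt; row++)
                   pu1_dst[row + (col * dst_strd)] = pu1_ref[two_nt - col - idx - 1];
    */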

    if(0 != (nt & 7))
    {
        UWORD8 *pu1_ref_tmp = pu1_ref;
        UWORD8 *pu1_dst_tmp = pu1_dst;
        uint8x8_t pu1_src_val, rev_res;
        uint64x1_t shift_res;

        for(col = nt; col > 0; col -= 4)
        {
            for(row = nt; row > 0; row -= 4)
            {
                /* unrolling all col & rows for pu1_dst[row + (col * dst_strd)] = pu1_ref[two_nt - col - idx - 1]; */

                pu1_src_val = vld1_u8(pu1_ref_tmp);
                shift_res = vshl_n_u64(vreinterpret_u64_u8(pu1_src_val), 8);
                rev_res = vrev64_u8(vreinterpret_u8_u64(shift_res));

                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(rev_res), 0);
                pu1_dst_tmp += dst_strd;

                shift_res = vshr_n_u64(vreinterpret_u64_u8(rev_res), 8);
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
                pu1_dst_tmp += dst_strd;

                shift_res = vshr_n_u64(shift_res, 8);
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
                pu1_dst_tmp += dst_strd;

                shift_res = vshr_n_u64(shift_res, 8);
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
                pu1_dst_tmp += dst_strd;
            }
        }
    }

    /* rev_val_first and rev_val_second reverse the loaded values to put them in the right order */
    /* shift_val shifts the reversed second value to line up the sample that is needed           */
    /* Rows and columns are unrolled by 8 when the width is a multiple of 8                      */

    else
    {
        UWORD8 *pu1_ref_two_nt_minus2 = pu1_ref;
        UWORD8 *pu1_dst_tmp = pu1_dst;
        UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;

        uint8x8_t pu1_src_val1, pu1_src_val2, vext_t, rev_val_second, rev_val_first;
        uint64x1_t shift_val;

        two_nt = 2 * nt;
        pu1_ref_two_nt_minus2 += (two_nt);
        pu1_ref_two_nt_minus2 -= 8;

        for(col = nt; col > 0; col -= 8)
        {
            for(row = nt; row > 0; row -= 8)
            {
                pu1_src_val2 = vld1_u8(pu1_ref_two_nt_minus2);
                rev_val_first = vrev64_u8(pu1_src_val2);

                pu1_ref_two_nt_minus2 -= 8;
                pu1_src_val1 = vld1_u8(pu1_ref_two_nt_minus2);
                rev_val_second = vrev64_u8(pu1_src_val1);

                vext_t = vext_u8(rev_val_first, rev_val_second, 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 8);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 16);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 24);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 32);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 40);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 48);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 56);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;
            }
            pu1_dst_tmp_plus8 += 8;
            pu1_dst_tmp = pu1_dst_tmp_plus8;
            pu1_ref_two_nt_minus2 += (nt - 8);
        }
    }
}
/* INTRA_PRED_LUMA_MODE2 */

/**
*******************************************************************************
*
* @brief
*   Intra prediction interpolation filter for luma mode 18 & mode 34.
*
* @par Description:
*    Intraprediction for mode 18 (nw angle) and mode 34 (ne angle) with
*    reference neighboring samples location pointed by 'pu1_ref' to the TU
*    block location pointed by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_mode_18_34_neonintr(UWORD8 *pu1_ref,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_dst,
                                               WORD32 dst_strd,
                                               WORD32 nt,
                                               WORD32 mode)
{

    WORD32 row, col, idx;
    WORD32 intraPredAngle = 32;
    WORD32 two_nt;
    UNUSED(src_strd);
    two_nt = 2 * nt;

    UWORD8 *pu1_ref_tmp = pu1_ref;
    UWORD8 *pu1_ref_tmp1 = pu1_ref;
    UWORD8 *pu1_dst_tmp = pu1_dst;
    UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;

    uint8x8_t src_tmp_1st, src_tmp_2nd, vext1, vext2, vext3, vext4, vext5, vext6, vext7;

    /* src_tmp_1st and src_tmp_2nd hold the first eight and the next eight values loaded from the source (pu1_ref) */
    /* vext1 - vext7 hold the vext results between the two loaded vectors and help dual issue                      */
    /* Loops are unrolled by 4 and 8, since the input width is a multiple of 4 or 8                                */
    /* Rows and columns are unrolled by 8 when the width is a multiple of 8                                        */
    /* Separate loops are maintained for mode 18 and mode 34                                                       */
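    /* A scalar sketch of the mapping implemented below (both lines are
       quoted from comments in this function); intraPredAngle is +32 for
       mode 34 and -32 for mode 18, so idx reduces to +/-(row + 1):
           for(row = 0; row < nt; row++)
           {
               idx = ((row + 1) * intraPredAngle) >> 5;
               for(col = 0; col < nt; col++)
                   pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1];
           }
    */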

    /* cond to allow multiples of 8 */
    if(0 == (nt & 7))
    {
        if(mode == 34)
        {
            pu1_ref_tmp += (two_nt + 2);

            for(row = nt; row > 0; row -= 8)
            {
                for(col = nt; col > 0; col -= 8)
                {
                    /* Loading 1st eight values */
                    src_tmp_1st = vld1_u8(pu1_ref_tmp);
                    pu1_ref_tmp += 8;

                    /* Loading next eight values */
                    src_tmp_2nd = vld1_u8(pu1_ref_tmp);

                    /* UNROLLED  pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
                    vext1 = vext_u8(src_tmp_1st, src_tmp_2nd, 1);
                    vst1_u8(pu1_dst_tmp, src_tmp_1st);
                    pu1_dst_tmp += dst_strd;

                    vext2 = vext_u8(src_tmp_1st, src_tmp_2nd, 2);
                    vst1_u8(pu1_dst_tmp, vext1);
                    pu1_dst_tmp += dst_strd;

                    vext3 = vext_u8(src_tmp_1st, src_tmp_2nd, 3);
                    vst1_u8(pu1_dst_tmp, vext2);
                    pu1_dst_tmp += dst_strd;

                    vext4 = vext_u8(src_tmp_1st, src_tmp_2nd, 4);
                    vst1_u8(pu1_dst_tmp, vext3);
                    pu1_dst_tmp += dst_strd;

                    vext5 = vext_u8(src_tmp_1st, src_tmp_2nd, 5);
                    vst1_u8(pu1_dst_tmp, vext4);
                    pu1_dst_tmp += dst_strd;

                    vext6 = vext_u8(src_tmp_1st, src_tmp_2nd, 6);
                    vst1_u8(pu1_dst_tmp, vext5);
                    pu1_dst_tmp += dst_strd;

                    vext7 = vext_u8(src_tmp_1st, src_tmp_2nd, 7);
                    vst1_u8(pu1_dst_tmp, vext6);
                    pu1_dst_tmp += dst_strd;

                    vst1_u8(pu1_dst_tmp, vext7);
                    pu1_dst_tmp += dst_strd;
                }

                pu1_dst_tmp_plus8 += 8;
                pu1_dst_tmp = pu1_dst_tmp_plus8;
                pu1_ref_tmp -= (nt - 8);
            }
        }
        else /* Loop for mode 18 */
        {
            pu1_ref_tmp += (two_nt);

            for(row = nt; row > 0; row -= 8)
            {
                for(col = nt; col > 0; col -= 8)
                {
                    /* Loading 1st eight values */
                    src_tmp_1st = vld1_u8(pu1_ref_tmp);
                    pu1_ref_tmp -= 8;

                    /* Loading next eight values */
                    src_tmp_2nd = vld1_u8(pu1_ref_tmp);

                    /* UNROLLED  pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
                    vext1 = vext_u8(src_tmp_2nd, src_tmp_1st, 7);
                    vst1_u8(pu1_dst_tmp, src_tmp_1st);
                    pu1_dst_tmp += dst_strd;

                    vext2 = vext_u8(src_tmp_2nd, src_tmp_1st, 6);
                    vst1_u8(pu1_dst_tmp, vext1);
                    pu1_dst_tmp += dst_strd;

                    vext3 = vext_u8(src_tmp_2nd, src_tmp_1st, 5);
                    vst1_u8(pu1_dst_tmp, vext2);
                    pu1_dst_tmp += dst_strd;

                    vext4 = vext_u8(src_tmp_2nd, src_tmp_1st, 4);
                    vst1_u8(pu1_dst_tmp, vext3);
                    pu1_dst_tmp += dst_strd;

                    vext5 = vext_u8(src_tmp_2nd, src_tmp_1st, 3);
                    vst1_u8(pu1_dst_tmp, vext4);
                    pu1_dst_tmp += dst_strd;

                    vext6 = vext_u8(src_tmp_2nd, src_tmp_1st, 2);
                    vst1_u8(pu1_dst_tmp, vext5);
                    pu1_dst_tmp += dst_strd;

                    vext7 = vext_u8(src_tmp_2nd, src_tmp_1st, 1);
                    vst1_u8(pu1_dst_tmp, vext6);
                    pu1_dst_tmp += dst_strd;

                    vst1_u8(pu1_dst_tmp, vext7);
                    pu1_dst_tmp += dst_strd;
                }
                pu1_dst_tmp_plus8 += 8;
                pu1_dst_tmp = pu1_dst_tmp_plus8;
                pu1_ref_tmp += (nt + 8);
            }
        }
    }

    /* Rows and columns are unrolled by 4 when the width is a multiple of 4 */

    else /* loop for multiples of 4 */
    {
        uint8x8_t src_val1;
        uint8x8_t src_val2;

        if(mode == 18)
            intraPredAngle = -32;
        else if(mode == 34)
            intraPredAngle = 32;

        for(row = 0; row < nt; row += 2)
        {
            /* unrolling 2 rows */
            idx = ((row + 1) * intraPredAngle) >> 5;
            pu1_ref_tmp = pu1_ref + two_nt + idx + 1;
            src_val1 = vld1_u8(pu1_ref_tmp);

            idx = ((row + 2) * intraPredAngle) >> 5;
            pu1_ref_tmp1 = pu1_ref + two_nt + idx + 1;
            src_val2 = vld1_u8(pu1_ref_tmp1);

            /* unrolling 4 col */
            for(col = nt; col > 0; col -= 4)
            {
                pu1_dst_tmp = pu1_dst;
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val1), 0);
                pu1_dst_tmp += dst_strd;
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val2), 0);
                pu1_dst += 4;
            }
            pu1_dst += 2 * dst_strd - nt;
        }
    }
}
/* INTRA_PRED_LUMA_MODE_18_34 */

/**
 *******************************************************************************
 *
 * @brief
 *    Intra prediction interpolation filter for luma mode 3 to mode 9
 *
 * @par Description:
 *    Intraprediction for mode 3 to 9 (positive angle, horizontal mode) with
 *    reference neighboring samples location pointed by 'pu1_ref' to the TU
 *    block location pointed by 'pu1_dst'
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the reference samples
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intraprediction mode
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */


void ihevc_intra_pred_luma_mode_3_to_9_neonintr(UWORD8 *pu1_ref,
                                                WORD32 src_strd,
                                                UWORD8 *pu1_dst,
                                                WORD32 dst_strd,
                                                WORD32 nt,
                                                WORD32 mode)
{

    WORD32 row, col;
    WORD32 intra_pred_ang;
    WORD32 pos, fract = 100, fract_prev;
    UNUSED(src_strd);
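    /* Every destination sample below is a two-tap interpolation between a
       pair of adjacent reference samples, weighted by the fractional part
       of the projected position (a scalar sketch of the vector math):
           pos   = (col + 1) * intra_pred_ang;
           idx   = pos >> 5;
           fract = pos & 31;
           dst   = (UWORD8)(((32 - fract) * ref[idx]
                           + fract * ref[idx + 1] + 16) >> 5);
       vrshrn_n_u16(add_res, 5) performs the add-16-and-shift rounding in a
       single instruction. */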
    if(0 == (nt & 7))
    {

        UWORD8 *pu1_ref_main_idx = pu1_ref;
        UWORD8 *pu1_ref_main_idx_1 = pu1_ref;

        UWORD8 *pu1_dst_tmp1 = pu1_dst;
        UWORD8 *pu1_dst_tmp2 = pu1_dst;

        WORD32 two_nt = 2 * nt;

        pu1_ref_main_idx += two_nt;
        pu1_ref_main_idx_1 += two_nt - 1;

        uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
        uint8x8_t shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;

        /* Intra Pred Angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];

        pu1_ref_main_idx -= 8;
        pu1_ref_main_idx_1 -= 8;

        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;

            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);

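            /* fract wraps modulo 32; together with the unconditional -= 1
               at the bottom of this col loop, this += 1 makes the reference
               base step back one sample exactly on the columns where the
               integer part of the projected position advances. */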
            if(fract_prev < fract)
            {
                pu1_ref_main_idx += 1;
                pu1_ref_main_idx_1 += 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(row = nt; row > 0; row -= 8)
            {
                ref_main_idx = vld1_u8(pu1_ref_main_idx);
                ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);

                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
                pu1_dst_tmp1 += dst_strd;

                pu1_ref_main_idx -= 8;
                pu1_ref_main_idx_1 -= 8;
            }
            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;

            pu1_ref_main_idx += nt;
            pu1_ref_main_idx_1 += nt;

            pu1_ref_main_idx -= 1;
            pu1_ref_main_idx_1 -= 1;
        }
    }
    else
    {
        UWORD8 *pu1_ref_tmp1 = pu1_ref;
        UWORD8 *pu1_ref_tmp2 = pu1_ref;
        UWORD8 *pu1_dst_tmp1 = pu1_dst;
        UWORD8 *pu1_dst_tmp2 = pu1_dst;

        pu1_ref_tmp1 += nt;
        pu1_ref_tmp2 += (nt - 1);

        uint8x8_t dup_fract, dup_32_fract, shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;
        uint32x2_t  pu1_ref_val1, pu1_ref_val2;

        pu1_ref_val1 = vdup_n_u32(0);
        pu1_ref_val2 = vdup_n_u32(0);

        /* Intra Pred Angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];

        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;
            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);
            if(fract_prev < fract)
            {
                pu1_ref_tmp1 += 1;
                pu1_ref_tmp2 += 1;
            }
            dup_fract = vdup_n_u8((uint8_t)fract);
            dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(row = nt; row > 0; row -= 4)
            {
                pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
                pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);

                mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
            }
            pu1_ref_tmp1 -= 1;
            pu1_ref_tmp2 -= 1;

            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;
        }
    }
}

/**
 *******************************************************************************
 *
 * @brief
 *   Intra prediction interpolation filter for luma mode 11 to mode 17
 *
 * @par Description:
 *    Intraprediction for mode 11 to 17 (negative angle, horizontal mode)
 *    with reference neighboring samples location pointed by 'pu1_ref' to the
 *    TU block location pointed by 'pu1_dst'
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the reference samples
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intraprediction mode
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */


void ihevc_intra_pred_luma_mode_11_to_17_neonintr(UWORD8 *pu1_ref,
                                                  WORD32 src_strd,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd,
                                                  WORD32 nt,
                                                  WORD32 mode)
{

    WORD32 row, col, k;
    WORD32 two_nt;
    WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
    WORD32 pos, fract = 1000, fract_prev;
    WORD32 ref_idx;

    UWORD8 *ref_main;
    UWORD8 *ref_main_tmp;

    UWORD8 *pu1_ref_tmp1 = pu1_ref;
    UWORD8 *pu1_ref_tmp2 = pu1_ref;
    UWORD8 *pu1_dst_tmp1 = pu1_dst;
    UWORD8 *pu1_dst_tmp2 = pu1_dst;

    UWORD8 ref_temp[2 * MAX_CU_SIZE + 1];

    uint16x8_t mul_res1, mul_res2, add_res;
    uint8x8_t dup_const_fract, dup_const_32_fract;
    uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
    uint8x8_t ref_left_t;
    uint32x2_t ref_left_tmp;
    UNUSED(src_strd);
    ref_left_tmp = vdup_n_u32(0);

    inv_ang_sum = 128;
    two_nt = 2 * nt;

    intra_pred_ang = gai4_ihevc_ang_table[mode];

    inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
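    /* inv_ang is the HEVC inverse angle (approximately 8192 / intra_pred_ang);
       accumulating it into inv_ang_sum, seeded with 128 above for rounding,
       and taking (inv_ang_sum >> 8) projects side reference samples onto the
       main reference axis. */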

    pu1_ref_tmp1 += two_nt;

    ref_main = ref_temp + (nt - 1);
    ref_main_tmp = ref_main;
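    /* ref_temp layout (a sketch): ref_main points nt - 1 bytes into
       ref_temp, leaving room below it for up to nt - 1 negatively indexed
       samples that the inverse-angle loop further down projects from the
       side reference. */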

    if(0 == (nt & 7))
    {
        pu1_ref_tmp2 += (two_nt - 7);

        for(k = nt - 1; k >= 0; k -= 8)
        {
            ref_left_t = vld1_u8(pu1_ref_tmp2);

            ref_left_t = vrev64_u8(ref_left_t);
            vst1_u8(ref_main_tmp, ref_left_t);
            ref_main_tmp += 8;
            pu1_ref_tmp2 -= 8;
        }
    }
    else
    {
        uint8x8_t rev_val;
        pu1_ref_tmp2 += (two_nt - (nt - 1));

        for(k = nt - 1; k >= 0; k -= 8)
        {
            ref_left_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, ref_left_tmp, 1);

            rev_val = vrev64_u8(vreinterpret_u8_u32(ref_left_tmp));
            vst1_lane_u32((uint32_t *)ref_main_tmp, vreinterpret_u32_u8(rev_val), 0);
        }
    }

    ref_main[nt] = pu1_ref[two_nt - nt];

    /* For horizontal modes, (ref main = ref left) (ref side = ref above) */

    ref_idx = (nt * intra_pred_ang) >> 5;

    /* SIMD optimization can be done using a look-up table for this loop */
    /* For negative angles, derive the main reference samples from the   */
    /* side reference samples; refer to section 8.4.4.2.6                */
    for(k = -1; k > ref_idx; k--)
    {
        inv_ang_sum += inv_ang;
        ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
    }

    UWORD8 *ref_main_tmp1 = ref_main;
    UWORD8 *ref_main_tmp2 = ref_main;

    ref_main_tmp2 += 1;

    if(0 == (nt & 7))
    {
        /* For angles other than 45 degrees, interpolate between 2 neighboring */
        /* samples, weighted by distance, to obtain the destination sample     */
        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;
            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            // Do linear filtering
            for(row = nt; row > 0; row -= 8)
            {
                ref_main_idx = vld1_u8(ref_main_tmp1);

                ref_main_idx_1 = vld1_u8(ref_main_tmp2);

                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
                pu1_dst_tmp1 += dst_strd;

                ref_main_tmp1 += 8;
                ref_main_tmp2 += 8;
            }

            ref_main_tmp1 -= nt;
            ref_main_tmp2 -= nt;

            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;
        }
    }
    else
    {
        uint32x2_t ref_main_idx1, ref_main_idx2;

        ref_main_idx1 = vdup_n_u32(0);
        ref_main_idx2 = vdup_n_u32(0);

        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;
            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(row = nt; row > 0; row -= 4)
            {
                ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
                ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);

                mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;
            }

            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;
        }
    }
}

/**
 *******************************************************************************
 *
 * @brief
 *   Intra prediction interpolation filter for luma mode 19 to mode 25
 *
 * @par Description:
 *    Intraprediction for mode 19 to 25 (negative angle, vertical mode) with
 *    reference neighboring samples location pointed by 'pu1_ref' to the TU
 *    block location pointed by 'pu1_dst'
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the reference samples
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intraprediction mode
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */


void ihevc_intra_pred_luma_mode_19_to_25_neonintr(UWORD8 *pu1_ref,
                                                  WORD32 src_strd,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd,
                                                  WORD32 nt,
                                                  WORD32 mode)
{

    WORD32 row, col, k;
    WORD32 two_nt, intra_pred_ang;
    WORD32 inv_ang, inv_ang_sum, pos, fract = 1000, fract_prev;
    WORD32 ref_idx;
    UWORD8 *ref_main;
    UWORD8 *ref_main_tmp;
    UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 1];

    UWORD8 *pu1_ref_tmp1 = pu1_ref;
    UWORD8 *pu1_ref_tmp2 = pu1_ref;
    UWORD8 *pu1_dst_tmp1 = pu1_dst;

    uint16x8_t mul_res1, mul_res2, add_res;
    uint8x8_t dup_const_fract, dup_const_32_fract;
    uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
    uint8x8_t ref_above_t;
    uint32x2_t ref_above_tmp;
    UNUSED(src_strd);
    ref_above_tmp = vdup_n_u32(0);

    two_nt = 2 * nt;
    intra_pred_ang = gai4_ihevc_ang_table[mode];
    inv_ang = gai4_ihevc_inv_ang_table[mode - 12];
    /* Intermediate reference samples for negative angle modes */
    /* These have to be removed during optimization             */
    pu1_ref_tmp1 += two_nt;

    ref_main = ref_temp + (nt - 1);
    ref_main_tmp = ref_main;

    if(0 == (nt & 7))
    {
        pu1_ref_tmp2 += (two_nt - 7);
        for(k = nt - 1; k >= 0; k -= 8)
        {
            ref_above_t = vld1_u8(pu1_ref_tmp1);
            vst1_u8(ref_main_tmp, ref_above_t);
            ref_main_tmp += 8;
            pu1_ref_tmp1 += 8;
        }
    }
    else
    {
        pu1_ref_tmp2 += (two_nt - (nt - 1));

        for(k = nt - 1; k >= 0; k -= 4)
        {
            ref_above_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, ref_above_tmp, 0);
            vst1_lane_u32((uint32_t *)ref_main_tmp, ref_above_tmp, 0);
        }
    }

    ref_main[nt] = pu1_ref[two_nt + nt];
    /* For vertical modes, (ref main = ref above) (ref side = ref left) */

    ref_idx = (nt * intra_pred_ang) >> 5;
    inv_ang_sum = 128;

    /* SIMD optimization can be done using a look-up table for this loop */
    /* For negative angles, derive the main reference samples from the   */
    /* side reference samples; refer to section 8.4.4.2.6                */
    for(k = -1; k > ref_idx; k--)
    {
        inv_ang_sum += inv_ang;
        ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
    }

    UWORD8 *ref_main_tmp1 = ref_main;
    UWORD8 *ref_main_tmp2 = ref_main;

    ref_main_tmp2 += 1;

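    /* Unlike the horizontal modes above, which scatter each 8-sample result
       lane by lane down a destination column, these vertical modes store
       whole rows contiguously (vst1_u8 below, vst1_lane_u32 in the
       multiple-of-4 branch). */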
    if(0 == (nt & 7))
    {
        /* For angles other than 45 degrees, interpolate between 2 neighboring */
        /* samples, weighted by distance, to obtain the destination sample     */
        for(row = 0; row < nt; row++)
        {
            fract_prev = fract;
            pos = ((row + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            // Do linear filtering
            for(col = nt; col > 0; col -= 8)
            {
                ref_main_idx = vld1_u8(ref_main_tmp1);

                ref_main_idx_1 = vld1_u8(ref_main_tmp2);

                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_u8(pu1_dst_tmp1, shift_res);
                pu1_dst_tmp1 += 8;

                ref_main_tmp1 += 8;
                ref_main_tmp2 += 8;
            }

            ref_main_tmp1 -= nt;
            ref_main_tmp2 -= nt;

            pu1_dst_tmp1 += (dst_strd - nt);
        }
    }
    else
    {
        uint32x2_t ref_main_idx1, ref_main_idx2;

        ref_main_idx1 = vdup_n_u32(0);
        ref_main_idx2 = vdup_n_u32(0);

        for(row = 0; row < nt; row++)
        {
            fract_prev = fract;
            pos = ((row + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(col = nt; col > 0; col -= 4)
            {
                ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
                ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);

                mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
                pu1_dst_tmp1 += 4;
            }
            pu1_dst_tmp1 += (dst_strd - nt);
        }
    }
}

/**
 *******************************************************************************
 *
 * @brief
 *    Intra prediction interpolation filter for luma mode 27 to mode 33
 *
 * @par Description:
 *    Intraprediction for mode 27 to 33 (positive angle, vertical mode) with
 *    reference neighboring samples location pointed by 'pu1_ref' to the TU
 *    block location pointed by 'pu1_dst'
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the reference samples
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intraprediction mode
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */


void ihevc_intra_pred_luma_mode_27_to_33_neonintr(UWORD8 *pu1_ref,
                                                  WORD32 src_strd,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd,
                                                  WORD32 nt,
                                                  WORD32 mode)
{

    WORD32 row, col;
    WORD32 intra_pred_ang;
    WORD32 pos, fract = 0, fract_prev;

    WORD32 two_nt = 2 * nt;
    UNUSED(src_strd);
    if(0 == (nt & 7))
    {

        UWORD8 *pu1_ref_main_idx = pu1_ref;
        UWORD8 *pu1_ref_main_idx_1 = pu1_ref;

        UWORD8 *pu1_dst_tmp1 = pu1_dst;
        pu1_ref_main_idx += (two_nt + 1);
        pu1_ref_main_idx_1 += (two_nt + 2);

        uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
        uint8x8_t shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;

        /* Intra Pred Angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];

        for(row = 0; row < nt; row++)
        {
            fract_prev = fract;

            pos = ((row + 1) * intra_pred_ang);
            fract = pos & (31);

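            /* The comparison is inverted relative to the horizontal modes:
               here fract_prev > fract means the fractional part wrapped,
               i.e. the integer part of the projected position advanced, so
               the forward-moving reference pointers step ahead by one. */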
            if(fract_prev > fract)
            {
                pu1_ref_main_idx += 1;
                pu1_ref_main_idx_1 += 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(col = nt; col > 0; col -= 8)
            {
                ref_main_idx = vld1_u8(pu1_ref_main_idx);
                ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);

                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_u8(pu1_dst_tmp1, shift_res);
                pu1_dst_tmp1 += 8;

                pu1_ref_main_idx += 8;
                pu1_ref_main_idx_1 += 8;
            }

            pu1_ref_main_idx -= nt;
            pu1_ref_main_idx_1 -= nt;

            pu1_dst_tmp1 += (dst_strd - nt);
        }
    }
    else
    {
        UWORD8 *pu1_ref_tmp1 = pu1_ref;
        UWORD8 *pu1_ref_tmp2 = pu1_ref;
        UWORD8 *pu1_dst_tmp1 = pu1_dst;

        pu1_ref_tmp1 += (two_nt + 1);
        pu1_ref_tmp2 += (two_nt + 2);

        uint8x8_t dup_fract, dup_32_fract, shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;
        uint32x2_t  pu1_ref_val1, pu1_ref_val2;

        pu1_ref_val1 = vdup_n_u32(0);
        pu1_ref_val2 = vdup_n_u32(0);

        /* Intra Pred Angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];

        for(row = 0; row < nt; row++)
        {
            fract_prev = fract;
            pos = ((row + 1) * intra_pred_ang);
            fract = pos & (31);
            if(fract_prev > fract)
            {
                pu1_ref_tmp1 += 1;
                pu1_ref_tmp2 += 1;
            }
            dup_fract = vdup_n_u8((uint8_t)fract);
            dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(col = nt; col > 0; col -= 4)
            {
                pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
                pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);

                mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
                pu1_dst_tmp1 += 4;
            }

            pu1_dst_tmp1 += (dst_strd - nt);
        }
    }
}