10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///*****************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*****************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///**
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*******************************************************************************
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @file
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  ihevc_intra_pred_chroma_mode_3_to_9.s
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
//* @brief
//*  contains the function definition for chroma intra prediction,
//*  angular modes 3 to 9. the function is coded in armv8 (aarch64)
//*  neon assembly.
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @author
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  parthiban v
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @par list of functions:
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @remarks
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  none
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*******************************************************************************
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*/
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///**
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*******************************************************************************
420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
//* @brief
//*    chroma intra prediction for angular modes 3 to 9
450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @par description:
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @param[in] pu1_ref
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  uword8 pointer to the source
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @param[out] pu1_dst
520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  uword8 pointer to the destination
530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @param[in] src_strd
550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  integer source stride
560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @param[in] dst_strd
580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  integer destination stride
590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
//* @param[in] nt
//*  size of transform block
620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
//* @param[in] mode
//*  intra prediction mode (3 to 9)
650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @returns
670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @remarks
690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  none
700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*******************************************************************************
720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*/
730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//void ihevc_intra_pred_chroma_mode_3_to_9(uword8 *pu1_ref,
740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                                       word32 src_strd,
750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                                       uword8 *pu1_dst,
760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                                       word32 dst_strd,
770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                                       word32 nt,
780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                                       word32 mode)
790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd
//x2 => *pu1_dst
//x3 => dst_strd
//x4 => nt
//x5 => mode
880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.text
900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.align 4
910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.include "ihevc_neon_macros.s"
930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.globl ihevc_intra_pred_chroma_mode_3_to_9_av8
970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.extern gai4_ihevc_ang_table
980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.extern gai4_ihevc_inv_ang_table
990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.extern col_for_intra_chroma
1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.extern idx_neg_idx_chroma_3_9
1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.type ihevc_intra_pred_chroma_mode_3_to_9_av8, %function
1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarihevc_intra_pred_chroma_mode_3_to_9_av8:
1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    // stmfd sp!, {x4-x12, x14}        //stack stores the values of the arguments
1079cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy
1089cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    stp         d13,d14,[sp,#-16]!
1099cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    stp         d8,d15,[sp,#-16]!           // Storing d15 using { sub sp,sp,#8; str d15,[sp] } is giving bus error.
1109cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy                                            // d8 is used as dummy register and stored along with d15 using stp. d8 is not used in the function.
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    stp         x19, x20,[sp,#-16]!
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    adrp        x7,  :got:gai4_ihevc_ang_table
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         x7,  [x7, #:got_lo12:gai4_ihevc_ang_table]
1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    adrp        x8,  :got:gai4_ihevc_inv_ang_table
1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         x8,  [x8, #:got_lo12:gai4_ihevc_inv_ang_table]
1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x7, x7, x5, lsl #2          //gai4_ihevc_ang_table[mode]
1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         w7,  [x7]                   //intra_pred_ang
1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sxtw        x7,w7
1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v30.8b,w7                   //intra_pred_ang
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    adrp        x14,  :got:col_for_intra_chroma
1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         x14,  [x14, #:got_lo12:col_for_intra_chroma]
1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarprologue_8_16_32:
1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lsr         x10, x4, #3
1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v31.8b},[x14],#8
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mul         x10, x4, x10                //block counter (dec by #8)
1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lsl         x11, x4, #1                 //col counter to be inc/dec by #8
1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    smull       v22.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x7, x5, #3
1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    adrp        x12,  :got:idx_neg_idx_chroma_3_9 //load most idx table
1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         x12, [x12,  #:got_lo12:idx_neg_idx_chroma_3_9]
1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x12, x12, x7, lsl #4
1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x8, x12
1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x7, #8
1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x7, x7, x3, lsl #3          //x7 = 8-8x3
1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         w9,  [x8]
1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sxtw        x9,w9
1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lsl         x9, x9, #1
1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x1, x0, x4, lsl #2          //pu1_ref + 4*nt
1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn         v6.8b,  v22.8h
1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v26.8b,w9                   //most idx added to final idx values
1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x1, x1, #26                 //ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row
1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x6, x1, x9
1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v0.16b, v1.16b}, [x6]      //stores the 32 values reqd based on indices values (from most idx)
1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sshr        v22.8h, v22.8h,#5
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    movi        v29.8b, #31                 //contains #31 for vand operation
1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    movi        v28.8b, #32
1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1639cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sqxtn       v2.8b,  v22.8h
1649cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    shl         v2.8b, v2.8b,#1             // 2 * idx
1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    and         v6.8b,  v6.8b ,  v29.8b     //fract values in d1/ idx values in d0
1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    movi        v29.8b, #2                  //contains #2 for adding to get ref_main_idx + 1
1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x0,#0x302                   // idx value for v is +1 of u
1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v27.4h,w0
1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x0,#0
1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1739cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    movi        v3.8b, #22                  //row 0 to 7
1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1759cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v2.8b,  v2.8b ,  v27.8b     //ref_main_idx (sub row)
1769cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v2.8b,  v26.8b ,  v2.8b     //ref_main_idx (row 0)
1779cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    add         v2.8b,  v2.8b ,  v3.8b      //to compensate the pu1_src idx incremented by 8
1789cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v3.8b,  v2.8b ,  v29.8b     //ref_main_idx + 1 (row 0)
1799cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v25.8b, {  v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 0)
1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1829cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v13.8b, {  v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 0)
1839cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v4.8b,  v2.8b ,  v29.8b     //ref_main_idx (row 1)
1849cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v5.8b,  v3.8b ,  v29.8b     //ref_main_idx + 1 (row 1)
1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    movi        v29.8b, #4
1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tbl         v16.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1)
1899cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umull       v24.8h, v25.8b, v7.8b       //mul (row 0)
1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)
1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1)
1939cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v2.8b,  v2.8b ,  v29.8b     //ref_main_idx (row 2)
1949cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v3.8b,  v3.8b ,  v29.8b     //ref_main_idx + 1 (row 2)
1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1989cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v14.8b, {  v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 2)
1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2029cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v15.8b, {  v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 2)
2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 3)
2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 3)
2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v24.8b},[x2], x3           //st (row 0)
2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)
2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2099cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v19.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)
2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2139cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v23.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
2149cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v2.8b,  v2.8b ,  v29.8b     //ref_main_idx (row 4)
2159cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v3.8b,  v3.8b ,  v29.8b     //ref_main_idx + 1 (row 4)
2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v22.8b},[x2], x3           //st (row 1)
2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)
2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2209cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v25.8b, {  v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 4)
2219cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umull       v18.8h, v19.8b, v7.8b       //mul (row 3)
2229cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v18.8h, v23.8b, v6.8b       //mul (row 3)
2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2249cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v13.8b, {  v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 4)
2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 5)
2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 5)
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v20.8b},[x2], x3           //st (row 2)
2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)
2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tbl         v16.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5)
2329cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umull       v24.8h, v25.8b, v7.8b       //mul (row 4)
2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)
2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5)
2369cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v2.8b,  v2.8b ,  v29.8b     //ref_main_idx (row 6)
2379cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v3.8b,  v3.8b ,  v29.8b     //ref_main_idx + 1 (row 6)
2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v18.8b},[x2], x3           //st (row 3)
2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmp         x4,#4
2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         end_func
2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)
2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2449cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v14.8b, {  v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 6)
2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)
2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2489cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v15.8b, {  v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 6)
2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 7)
2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 7)
2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v24.8b},[x2], x3           //st (row 4)
2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rshrn       v22.8b, v22.8h,#5           //round shft (row 5)
2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2559cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v19.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2599cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v23.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
2609cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umull       v18.8h, v19.8b, v7.8b       //mul (row 7)
2619cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v18.8h, v23.8b, v6.8b       //mul (row 7)
2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v22.8b},[x2], x3           //st (row 5)
2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rshrn       v20.8b, v20.8h,#5           //round shft (row 6)
2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rshrn       v18.8b, v18.8h,#5           //round shft (row 7)
2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v20.8b},[x2], x3           //st (row 6)
2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        x10, x10, #4                //subtract 8 and go to end if 8x8
2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v18.8b},[x2], x3           //st (row 7)
2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         end_func
2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        x11, x11, #8                //decrement the processed col
2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x20, x8, #4
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x8, x20, x8,gt
2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x20, x2, x7
2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x2, x20, x2,gt
2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x8, x12, x8,le
2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x20, x2, x4
2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x2, x20, x2,le
2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x20, x2, #8
2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x2, x20, x2,le
2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lsl         x20, x4,  #1
2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x11,x20,x11,le
2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bgt         lbl284
2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    adrp        x14,  :got:col_for_intra_chroma
2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         x14,  [x14, #:got_lo12:col_for_intra_chroma]
2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarlbl284:
2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x20, x0, #8
2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x0, x20, x0,le
2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v31.8b},[x14],#8
2959cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smull       v25.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
2969cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    xtn         v19.8b,  v25.8h
2979cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sshr        v25.8h, v25.8h,#5
2989cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sqxtn       v23.8b,  v25.8h
2999cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    shl         v23.8b, v23.8b,#1
3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x5, #0x302                  //idx value for v is +1 of u
3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v27.4h,w5                   //row value inc or reset accordingly
3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         w9,  [x8]                   //loads index value
3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sxtw        x9,w9
3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lsl         x9, x9, #1
3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x5, #22
3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x5, x5, x0, lsl #1
3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v16.8b,w5
3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v26.8b,w9
3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x5,x2
3119cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v23.8b,  v23.8b ,  v27.8b   //ref_main_idx (sub row)
3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
// kernel_8_16_32: software-pipelined main loop.
// Each pass finishes rows 4-7 that are still in flight when the label is reached
// (tbl/mul/round/store at the top of the loop, written through x5), then looks up,
// blends and stores rows 0-3 of the next 8-row group through x2 and starts rows 4-6.
// Per row: out = rshrn(tap0 * v7 + tap1 * v6, #5), where tap0/tap1 are tbl lookups
// into the 32-byte table {v0, v1}; the tap1 index is always the tap0 index minus 2,
// and each row's indices are 2 lower than the previous row's.
// v29 is reused as a constant register: 2 (index setup), then 31 (fract mask), then 4.
// The flags written by "subs x11" are consumed by every csel/bgt below; nothing
// overwrites them until "subs x10", which provides the loop-exit condition.
// NOTE(review): v28, v30, x1, x3, x4, x7, x12 are set outside this loop - roles not visible here.
3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarkernel_8_16_32:
3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    movi        v29.8b, #2                  //v29 = 2 in every byte lane (used for the row 0 / row 1 index setup)
3159cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v2.8b,  v26.8b ,  v23.8b    //ref_main_idx (row 0) = v26 - v23
3169cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    mov         v26.8b, v19.8b              //v26 = v19; masked with 31 further down to give fract
3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        x11, x11, #8                //x11 -= 8; sets the flags tested by the csel/bgt below
3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x6, x1, x9                  //x6 = x1 - x9: address of the next 32-byte table load
3209cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v19.8b, {  v0.16b, v1.16b}, v4.8b //(in-flight rows) load from ref_main_idx (row 7), old table
3219cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    add         v2.8b,  v2.8b ,  v16.8b     //add v16 to compensate the pu1_src idx incremented by 8
3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umull       v20.8h, v14.8b, v7.8b       //(in-flight rows) mul (row 6): tap0 * v7
3249cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v23.8b, {  v0.16b, v1.16b}, v5.8b //(in-flight rows) load second tap (row 7), old table
3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v20.8h, v15.8b, v6.8b       //(in-flight rows) mul (row 6): += tap1 * v6
3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x20, x0, #8
3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x0, x20, x0,le              //x0 += 8 when the subs on x11 gave le
3299cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v3.8b,  v2.8b ,  v29.8b     //second-tap index (row 0) = ref_main_idx - 2
3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x20, x8, #4
3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x8, x20, x8,gt              //x8 += 4 when gt
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v0.16b, v1.16b}, [x6]      //load the 32 bytes at x6: tbl table for all lookups below
3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rshrn       v22.8b, v22.8h,#5           //(in-flight rows) round shft (row 5)
3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bgt         lbl326                      //gt: keep x14 as is
3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    adrp        x14,  :got:col_for_intra_chroma
3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         x14,  [x14, #:got_lo12:col_for_intra_chroma] //le: x14 = address of col_for_intra_chroma (via GOT)
3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarlbl326:
3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v24.8b},[x5], x3           //(in-flight rows) st (row 4); x5 += x3
3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x8, x12, x8,le              //x8 = x12 when le
3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x9,#0x302
3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v27.4h,w9                   //v27 = 0x0302 per halfword, i.e. byte lanes 2,3,2,3,...; subtracted from v23 at the loop tail
3459cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v4.8b,  v2.8b ,  v29.8b     //ref_main_idx (row 1) = row 0 index - 2
3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3479cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v5.8b,  v3.8b ,  v29.8b     //second-tap index (row 1)
3489cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v25.8b, {  v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 0)
3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    movi        v29.8b, #31                 //v29 = 31: mask that keeps the low 5 bits (fract)
3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3519cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umull       v18.8h, v19.8b, v7.8b       //(in-flight rows) mul (row 7): tap0 * v7
3529cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v13.8b, {  v0.16b, v1.16b}, v3.8b //load second tap (row 0)
3539cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v18.8h, v23.8b, v6.8b       //(in-flight rows) mul (row 7): += tap1 * v6
3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v31.8b},[x14],#8           //load 8 bytes from x14 (post-incremented by 8); multiplied by v30 below
3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    and         v6.8b,  v29.8b ,  v26.8b    //fract = v26 & 31 (new weights; old v6/v7 are no longer needed)
3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lsl         x20, x4,  #1
3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x11,x20,x11,le              //x11 = 2 * x4 when le
3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    movi        v29.8b, #4                  //v29 = 4: index step applied by every sub on v2..v5 below
3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         w9,  [x8]                   //load the 32-bit word at x8 ...
3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sxtw        x9,w9                       //... and sign-extend it to 64 bits
3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v22.8b},[x5], x3           //(from previous loop)st (row 5)
3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)
3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3679cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v2.8b,  v2.8b ,  v29.8b     //ref_main_idx (row 2)
3689cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v19.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1)
3699cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v3.8b,  v3.8b ,  v29.8b     //second-tap index (row 2)
3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lsl         x9, x9, #1                  //x9 *= 2
3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract, computed as v28 - v6 (presumably v28 holds 32 - set outside the loop, TODO confirm)
3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3749cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umull       v24.8h, v25.8b, v7.8b       //mul (row 0): tap0 * v7
3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //load second tap (row 1)
3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0): += tap1 * v6
3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)
3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 3)
3829cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v14.8b, {  v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 2)
3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v5.8b,  v5.8b ,  v29.8b     //second-tap index (row 3)
3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3859cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umull       v22.8h, v19.8b, v7.8b       //mul (row 1)
3869cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v15.8b, {  v0.16b, v1.16b}, v3.8b //load second tap (row 2)
3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v18.8b},[x5], x3           //(from previous loop)st (row 7)
3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3929cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v2.8b,  v2.8b ,  v29.8b     //ref_main_idx (row 4)
3939cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v19.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
3949cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v3.8b,  v3.8b ,  v29.8b     //second-tap index (row 4)
3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
3979cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v23.8b, {  v0.16b, v1.16b}, v5.8b //load second tap (row 3)
3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)
3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x5,x2,x3,lsl#2              //x5 = x2 + 4*x3: store pointer for rows 4-7 of this group
4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    smull       v14.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col): signed v30 * v31, widened to 16 bits
4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x9, x9, x0, lsl #1          //x9 += 2 * x0
4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v24.8b},[x2], x3           //st (row 0); x2 += x3
4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)
4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 5)
4089cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v25.8b, {  v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 4)
4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v5.8b,  v5.8b ,  v29.8b     //second-tap index (row 5)
4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4119cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umull       v18.8h, v19.8b, v7.8b       //mul (row 3)
4129cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v13.8b, {  v0.16b, v1.16b}, v3.8b //load second tap (row 4)
4139cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v18.8h, v23.8b, v6.8b       //mul (row 3)
4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v22.8b},[x2], x3           //st (row 1)
4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)
4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4189cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    xtn         v19.8b,  v14.8h             //low byte of each product; copied into v26 at the top of the next pass
4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sshr        v14.8h, v14.8h,#5           //idx = product >> 5 (arithmetic shift)
4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4219cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v2.8b,  v2.8b ,  v29.8b     //ref_main_idx (row 6)
4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tbl         v21.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5)
4239cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v3.8b,  v3.8b ,  v29.8b     //second-tap index (row 6)
4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4259cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umull       v24.8h, v25.8b, v7.8b       //mul (row 4)
4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //load second tap (row 5)
4279cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sqxtn       v23.8b,  v14.8h             //idx narrowed to 8 bits with signed saturation
4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v20.8b},[x2], x3           //st (row 2)
4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)
4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)
4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v26.8b,w9                   //broadcast the low byte of w9; minuend of the ref_main_idx sub at the loop top
4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 7)
4369cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v14.8b, {  v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 6)
4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v5.8b,  v5.8b ,  v29.8b     //second-tap index (row 7)
4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x6, #22                     //to compensate the 2*row value
4409cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    shl         v23.8b, v23.8b,#1           //idx *= 2
4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x6, x6, x0, lsl #1          //x6 = 22 - 2 * x0
4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umull       v22.8h, v21.8b, v7.8b       //mul (row 5)
4449cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v15.8b, {  v0.16b, v1.16b}, v3.8b //load second tap (row 6)
4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)
4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v18.8b},[x2], x3           //st (row 3)
4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rshrn       v24.8b, v24.8h,#5           //round shft (row 4); stored at the top of the next pass or in the epilogue
4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x2,x2,x3, lsl #2            //x2 += 4*x3: step over rows 4-7, which are written through x5
4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v16.8b,w6                   //v16 = low byte of (22 - 2*x0) in every lane; added to v2 at the loop top
4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x20, x7, x2
4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x2, x20, x2,gt              //x2 += x7 when gt
4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x20, x2, x4
4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x2, x20, x2,le              //x2 -= x4 when le
4579cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v23.8b,  v23.8b ,  v27.8b   //ref_main_idx (sub row): v23 -= v27
4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x20,x2,#8
4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x2, x20, x2,le              //x2 -= 8 when le
4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        x10, x10, #4                //x10 -= 4; the zero flag decides whether the loop repeats
4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bne         kernel_8_16_32              //repeat while x10 != 0, else fall into the epilogue
4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
// epil_8_16_32: pipeline drain. Finishes rows 4-7 left in flight by the code above
// (row 4 already rounded in v24, row 5 accumulated in v22, row 6 taps in v14/v15,
// row 7 indices in v4/v5) and stores them through x5, advancing by x3 per row.
4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarepil_8_16_32:
4669cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v19.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umull       v20.8h, v14.8b, v7.8b       //mul (row 6): tap0 * v7
4699cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    tbl         v23.8b, {  v0.16b, v1.16b}, v5.8b //load second tap (row 7)
4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6): += tap1 * v6
4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v24.8b},[x5], x3           //st (row 4)
4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rshrn       v24.8b, v22.8h,#5           //round shft (row 5), written into v24 now that row 4 is stored
4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4759cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umull       v18.8h, v19.8b, v7.8b       //mul (row 7)
4769cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v18.8h, v23.8b, v6.8b       //mul (row 7)
4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v24.8b},[x5], x3           //st (row 5)
4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rshrn       v20.8b, v20.8h,#5           //round shft (row 6)
4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v20.8b},[x5], x3           //st (row 6)
4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rshrn       v18.8b, v18.8h,#5           //round shft (row 7)
4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v18.8b},[x5], x3           //st (row 7)
4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
// end_func: common exit. Pops three 16-byte register pairs (x19/x20, d8/d15, d13/d14)
// with post-indexed loads and returns.
// NOTE(review): the matching pushes are in the prologue, outside this view - confirm the order mirrors it.
4860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarend_func:
4879cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    // ldmfd sp!,{x4-x12,x15}               //reload the registers from sp (old restore sequence, kept for reference)
4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldp         x19, x20,[sp],#16           //restore x19/x20; sp += 16
4899cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    ldp         d8,d15,[sp],#16             // Loading d15 alone using { ldr d15,[sp]; add sp,sp,#8 } gave a bus error
4909cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy                                            // (likely because sp would no longer be 16-byte aligned), so d8 is loaded with it as a dummy. d8 is not used in the function.
4919cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    ldp         d13,d14,[sp],#16            //restore d13/d14; sp += 16
4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ret
4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
501