10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///*****************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*****************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///**
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*******************************************************************************
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @file
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  ihevc_intra_pred_filters_planar.s
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @brief
240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  contains the function definition for luma intra prediction in planar
250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* mode. the function is coded in armv8 (aarch64) neon assembly and uses
260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* gnu assembler directives and syntax, so it needs a gas-compatible
270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* assembler rather than rvct
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @author
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  akshaya mukund
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @par list of functions:
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  - ihevc_intra_pred_luma_planar_av8()
340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @remarks
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  none
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*******************************************************************************
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*/
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///**
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*******************************************************************************
420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @brief
440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*    luma intra prediction filter for planar mode
450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @par description:
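470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  dst[row][col] = ((nt-1-col)*ref[2nt-1-row] + (col+1)*ref[3nt+1] + (nt-1-row)*ref[2nt+1+col] + (row+1)*ref[nt-1] + nt) >> (log2(nt)+1)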
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @param[in] pu1_ref
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  uword8 pointer to the source
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @param[out] pu1_dst
520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  uword8 pointer to the destination
530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @param[in] src_strd
550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  integer source stride
560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @param[in] dst_strd
580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  integer destination stride
590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @param[in] pi1_coeff
610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  word8 pointer to the planar coefficients
620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @param[in] nt
640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  size of transform block
650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @param[in] mode
670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  type of filtering
680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @returns
700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @remarks
720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  none
730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*******************************************************************************
750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*/
760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//void ihevc_intra_pred_luma_planar(uword8* pu1_ref,
780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                                  word32 src_strd,
790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                                  uword8* pu1_dst,
800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                                  word32 dst_strd,
810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                                  word32 nt,
820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                                  word32 mode,
830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                   word32 pi1_coeff)
840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//**************variables vs registers*****************************************
850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x0 => *pu1_ref
860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x1 => src_strd
870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x2 => *pu1_dst
880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x3 => dst_strd
890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x4 => nt
910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x5 => mode (overwritten before it is read)
920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x6 => pi1_coeff (overwritten before it is read)
930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//all seven arguments arrive in registers under aapcs64; none are on the stack
940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.text
960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.align 4
970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.include "ihevc_neon_macros.s"
980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.globl ihevc_intra_pred_luma_planar_av8
1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.extern gau1_ihevc_planar_factor
1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.extern gau1_ihevc_planar_factor_1
1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.type ihevc_intra_pred_luma_planar_av8, %function
1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarihevc_intra_pred_luma_planar_av8:
1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
1109cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    stp         x19, x20,[sp,#-16]!
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    adrp        x11, :got:gau1_ihevc_planar_factor //loads table of coeffs
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         x11, [x11, #:got_lo12:gau1_ihevc_planar_factor]
1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    clz         w5,w4
1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x20, x5, #32
1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    neg         x5, x20
1199cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    dup         v29.8h,w5
1209cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    neg         v29.8h, v29.8h              //shr value (so vneg)
1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v2.8b,w4                    //nt
1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v16.8h,w4                   //nt
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x6, x4, #1                  //nt-1
1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x6, x6, x0
1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         w7,  [x6]
1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sxtw        x7,w7
1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v0.8b,w7                    //src[nt-1]
1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x6, x4, x4,lsl #1           //3nt
1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x6, x6, #1                  //3nt + 1
1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x6, x6, x0
1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         w7,  [x6]
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sxtw        x7,w7
1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v1.8b,w7                    //src[3nt+1]
1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x6, x4, x4                  //2nt
1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x14, x6, #1                 //2nt+1
1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x6, x6, #1                  //2nt-1
1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x6, x6, x0                  //&src[2nt-1]
1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x14, x14, x0                //&src[2nt+1]
1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x8, #1                      //row+1 (row is first 0)
1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x9, x4, x8                  //nt-1-row (row is first 0)
1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v5.8b,w8                    //row + 1
1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v6.8b,w9                    //nt - 1 - row
1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         v7.8b, v5.8b                //mov #1 to d7 to used for inc for row+1 and dec for nt-1-row
1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x12, x11, #1                //coeffs (to be reloaded after every row)
1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x1, x4                      //nt (row counter) (dec after every row)
1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x5, x2                      //dst (to be reloaded after every row and inc by dst_strd)
1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x10, #8                     //increment for the coeffs
1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x0, x14                     //&src[2nt+1] (to be reloaded after every row)
1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmp         x4, #4
1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         tf_sz_4
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//@ ========== ***************** =====================
1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarprolog:
1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakartf_sz_8_16_32:
1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x7, x4                      //column counter (set to no of cols)
1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lsr         x9, x4, #3                  //divide nt by 8
1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mul         x7, x7, x9                  //multiply width * height
1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    adrp        x5, :got:gau1_ihevc_planar_factor_1 //loads table of coeffs
1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         x5, [x5, #:got_lo12:gau1_ihevc_planar_factor_1]
1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x6, x6, #7
1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x8, x2
1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lsl         x9, x3, #3                  //4*stride
1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x20, x9, #8                 //8-4*stride
1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    neg         x9, x20
1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x10, x4                     //nt
1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x10, x10, #8                //nt - 8
1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarcol_loop_8_16_32:
1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1789cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    ld1         {v17.8b},[x12]              //(1-8)load 8 coeffs [col+1]
1799cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    dup         v27.8h,w4                   //(1)
1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v4.8b},[x6]                //(1-8)src[2nt-1-row]
1819cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v19.8b,  v2.8b ,  v17.8b    //(1-8)[nt-1-col]
1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1849cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v27.8h, v5.8b, v0.8b        //(1)(row+1)    *    src[nt-1]
1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v3.8b},[x14]               //(1-8)load 8 src[2nt+1+col]
1879cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v27.8h, v17.8b, v1.8b       //(1)(col+1)    *    src[3nt+1]
1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v20.8b, v4.8b[7]            //(1)
1909cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v27.8h, v6.8b, v3.8b        //(1)(nt-1-row)    *    src[2nt+1+col]
1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v21.8b, v4.8b[6]            //(2)
1939cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v27.8h, v19.8b, v20.8b      //(1)(nt-1-col)    *    src[2nt-1-row]
1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v30.8h,w4                   //(2)
1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         v5.8b,  v5.8b ,  v7.8b      //(1)
1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v6.8b,  v6.8b ,  v7.8b      //(1)
1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v22.8b, v4.8b[5]            //(3)
2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v30.8h, v5.8b, v0.8b        //(2)
2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v28.8h,w4                   //(3)
2049cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v30.8h, v17.8b, v1.8b       //(2)
2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v30.8h, v6.8b, v3.8b        //(2)
2079cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v30.8h, v19.8b, v21.8b      //(2)
2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2099cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sshl        v27.8h, v27.8h, v29.8h      //(1)shr
2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         v5.8b,  v5.8b ,  v7.8b      //(2)
2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v6.8b,  v6.8b ,  v7.8b      //(2)
2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2149cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    xtn         v27.8b,  v27.8h             //(1)
2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v28.8h, v5.8b, v0.8b        //(3)
2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v23.8b, v4.8b[4]            //(4)
2189cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v28.8h, v17.8b, v1.8b       //(3)
2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2209cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    dup         v25.8h,w4                   //(4)
2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v28.8h, v6.8b, v3.8b        //(3)
2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2239cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    st1         {v27.8b},[x2], x3           //(1)str 8 values
2249cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v28.8h, v19.8b, v22.8b      //(3)
2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2269cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sshl        v30.8h, v30.8h, v29.8h      //(2)shr
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         v5.8b,  v5.8b ,  v7.8b      //(3)
2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v6.8b,  v6.8b ,  v7.8b      //(3)
2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn         v30.8b,  v30.8h             //(2)
2329cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v25.8h, v5.8b, v0.8b        //(4)
2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v20.8b, v4.8b[3]            //(5)
2359cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v25.8h, v17.8b, v1.8b       //(4)
2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v16.8h,w4                   //(5)
2389cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v25.8h, v6.8b, v3.8b        //(4)
2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v30.8b},[x2], x3           //(2)str 8 values
2419cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v25.8h, v19.8b, v23.8b      //(4)
2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2439cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sshl        v28.8h, v28.8h, v29.8h      //(3)shr
2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         v5.8b,  v5.8b ,  v7.8b      //(4)
2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v6.8b,  v6.8b ,  v7.8b      //(4)
2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn         v28.8b,  v28.8h             //(3)
2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v16.8h, v5.8b, v0.8b        //(5)
2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v21.8b, v4.8b[2]            //(6)
2529cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v16.8h, v17.8b, v1.8b       //(5)
2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v18.8h,w4                   //(6)
2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v16.8h, v6.8b, v3.8b        //(5)
2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v28.8b},[x2], x3           //(3)str 8 values
2589cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v16.8h, v19.8b, v20.8b      //(5)
2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2609cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sshl        v25.8h, v25.8h, v29.8h      //(4)shr
2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         v5.8b,  v5.8b ,  v7.8b      //(5)
2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v6.8b,  v6.8b ,  v7.8b      //(5)
2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2649cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    xtn         v25.8b,  v25.8h             //(4)
2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v18.8h, v5.8b, v0.8b        //(6)
2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v22.8b, v4.8b[1]            //(7)
2689cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v18.8h, v17.8b, v1.8b       //(6)
2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v26.8h,w4                   //(7)
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v18.8h, v6.8b, v3.8b        //(6)
2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2739cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    st1         {v25.8b},[x2], x3           //(4)str 8 values
2749cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v18.8h, v19.8b, v21.8b      //(6)
2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2769cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sshl        v16.8h, v16.8h, v29.8h      //(5)shr
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         v5.8b,  v5.8b ,  v7.8b      //(6)
2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v6.8b,  v6.8b ,  v7.8b      //(6)
2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn         v16.8b,  v16.8h             //(5)
2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v26.8h, v5.8b, v0.8b        //(7)
2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v23.8b, v4.8b[0]            //(8)
2859cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v26.8h, v17.8b, v1.8b       //(7)
2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v24.8h,w4                   //(8)
2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v26.8h, v6.8b, v3.8b        //(7)
2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v16.8b},[x2], x3           //(5)str 8 values
2919cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v26.8h, v19.8b, v22.8b      //(7)
2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2939cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sshl        v18.8h, v18.8h, v29.8h      //(6)shr
2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         v5.8b,  v5.8b ,  v7.8b      //(7)
2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v6.8b,  v6.8b ,  v7.8b      //(7)
2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn         v18.8b,  v18.8h             //(6)
2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v24.8h, v5.8b, v0.8b        //(8)
3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3029cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v24.8h, v17.8b, v1.8b       //(8)
3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v24.8h, v6.8b, v3.8b        //(8)
3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v18.8b},[x2], x3           //(6)str 8 values
3079cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v24.8h, v19.8b, v23.8b      //(8)
3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3099cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sshl        v26.8h, v26.8h, v29.8h      //(7)shr
3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        x7, x7, #8
3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         epilog
3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        x1, x1, #8                  //row counter
3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x20, x12, #8                //col inc
3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x12, x20, x12,gt
3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x20, x14, #8                //also for col inc
3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x14, x20, x14,gt
3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x1, x4, x1,le               //nt reloaded (refresh the value)
3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x20, x11, #1                //x12 reset
3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x12, x20, x12,le
3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x14, x0, x14,le             //x14 reset
3259cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    ld1         {v17.8b},[x12]              //(1n)(1-8)load 8 coeffs [col+1]
3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x20, x6, #8                 //for next set of rows
3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x6, x20, x6,le
3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v3.8b},[x14]               //(1n)(1-8)load 8 src[2nt+1+col]
3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x20, x5, #8
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x5, x20, x5,le
3339cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    dup         v27.8h,w4                   //(1n)(1)
3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v5.8b},[x5]
3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v4.8b},[x6]                //(1n)(1-8)src[2nt-1-row]
3389cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v19.8b,  v2.8b ,  v17.8b    //(1n)(1-8)[nt-1-col]
3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v20.8b, v4.8b[7]            //(1n)(1)
3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v6.8b,  v2.8b ,  v5.8b
3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         epilog
3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
// ---------------------------------------------------------------------------
// kernel_plnr: steady-state body of the software-pipelined 8-wide loop.
//
// One pass through this body:
//   * finishes rows (7) and (8) of the previous 8x8 tile (shift/narrow/store),
//   * computes, narrows and stores rows (1)..(6) of the current tile,
//   * accumulates rows (7) and (8) of the current tile (stored by the next
//     pass or by epilog),
//   * reloads the operands for the next tile (steps tagged "(1n)").
//
// Every row accumulator is seeded with w4 via dup and then receives four
// umlal products: v5*v0, v17*v1, v6*v3 and v19*<one lane of v4>; it is then
// shifted by v29 (sshl) and narrowed to bytes (xtn). The "(k)" tag in each
// trailing comment is the row of the tile that the instruction belongs to.
//
// Flag lifetimes (x20 is only a scratch for the conditional updates):
//   cmp  x1, #0      -> csel gt/le on x2 and x1
//   subs x1, x1, #8  -> csel gt/le on x12, x14, x6, x5
//   subs x7, x7, #8  -> bne kernel_plnr
//
// v0, v1, v2, v7, v29, x0, x3, x4, x9, x10 and x11 are only read here; their
// values are set before this chunk of the file.
// ---------------------------------------------------------------------------
3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarkernel_plnr:
3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmp         x1, #0                      // (cond loop) flags feed the gt/le csel's up to "subs x1"
3489cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sshl        v24.8h, v24.8h, v29.8h      //(8)shr of previous tile (v29 presumably holds a negative count - confirm)
3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn         v26.8b,  v26.8h             //(7) narrow previous tile's row 7 (already shifted)
3519cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v27.8h, v5.8b, v0.8b        //(1)(row+1)    *    src[nt-1]
3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn         v24.8b,  v24.8h             //(8) narrow previous tile's row 8
3549cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v27.8h, v17.8b, v1.8b       //(1)(col+1)    *    src[3nt+1]
3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v21.8b, v4.8b[6]            //(2) broadcast byte 6 of v4 for row 2
3579cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v27.8h, v6.8b, v3.8b        //(1)(nt-1-row)    *    src[2nt+1+col]
3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v30.8h,w4                   //(2) seed row-2 accumulator with w4
3609cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v27.8h, v19.8b, v20.8b      //(1)(nt-1-col)    *    src[2nt-1-row]
3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v26.8b},[x2], x3           //(7)str 8 values, dst += x3
3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         v5.8b,  v5.8b ,  v7.8b      //(1) step the (row+1) weights by v7
3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v24.8b},[x2], x3           //(8)str 8 values, dst += x3
3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v6.8b,  v6.8b ,  v7.8b      //(1) step the (nt-1-row) weights by v7
3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
// Previous tile is now fully stored: move dst (x2) to the start of the next
// tile, chosen by the flags from "cmp x1, #0" above.
3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x20, x2, x9                 //since more cols to fill, dst + 8 - 6*strd (cond loop)
3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x2, x20, x2,gt
3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v30.8h, v5.8b, v0.8b        //(2)
3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x20, x2, x10                //else go to next set of rows, dst - (nt-8) (cond loop)
3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x2, x20, x2,le
3749cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v30.8h, v17.8b, v1.8b       //(2)
3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v22.8b, v4.8b[5]            //(3) broadcast byte 5 of v4 for row 3
3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v30.8h, v6.8b, v3.8b        //(2)
3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v28.8h,w4                   //(3) seed row-3 accumulator with w4
3809cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v30.8h, v19.8b, v21.8b      //(2)
3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3829cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sshl        v27.8h, v27.8h, v29.8h      //(1)shr
3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         v5.8b,  v5.8b ,  v7.8b      //(2)
3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x1, x4, x1,le               //nt reloaded (refresh the value)    (cond loop) - last user of the "cmp x1, #0" flags
3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v6.8b,  v6.8b ,  v7.8b      //(2)
// NOTE(review): x1 is labelled "row counter" here, yet the gt/le selects
// treat x1 > 0 as "more cols to fill" - confirm which dimension it tracks.
3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        x1, x1, #8                  //row counter (loop); new flags feed the csel's on x12/x14/x6/x5
3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3909cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    xtn         v27.8b,  v27.8h             //(1)
3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v28.8h, v5.8b, v0.8b        //(3)
3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v23.8b, v4.8b[4]            //(4) broadcast byte 4 of v4 for row 4
3949cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v28.8h, v17.8b, v1.8b       //(3)
3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3969cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    dup         v25.8h,w4                   //(4) seed row-4 accumulator with w4
3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v28.8h, v6.8b, v3.8b        //(3)
3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3999cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    st1         {v27.8b},[x2], x3           //(1)str 8 values
4009cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v28.8h, v19.8b, v22.8b      //(3)
4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4029cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sshl        v30.8h, v30.8h, v29.8h      //(2)shr
4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         v5.8b,  v5.8b ,  v7.8b      //(3)
4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v6.8b,  v6.8b ,  v7.8b      //(3)
4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn         v30.8b,  v30.8h             //(2)
4099cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v25.8h, v5.8b, v0.8b        //(4)
4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v20.8b, v4.8b[3]            //(5) broadcast byte 3 of v4 for row 5
4129cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v25.8h, v17.8b, v1.8b       //(4)
4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v16.8h,w4                   //(5) seed row-5 accumulator with w4
4159cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v25.8h, v6.8b, v3.8b        //(4)
4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v30.8b},[x2], x3           //(2)str 8 values
4189cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v25.8h, v19.8b, v23.8b      //(4)
4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4209cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sshl        v28.8h, v28.8h, v29.8h      //(3)shr
4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         v5.8b,  v5.8b ,  v7.8b      //(4)
4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v6.8b,  v6.8b ,  v7.8b      //(4)
4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn         v28.8b,  v28.8h             //(3)
4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v16.8h, v5.8b, v0.8b        //(5)
4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v21.8b, v4.8b[2]            //(6) broadcast byte 2 of v4 for row 6
4309cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v16.8h, v17.8b, v1.8b       //(5)
4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v18.8h,w4                   //(6) seed row-6 accumulator with w4
4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v16.8h, v6.8b, v3.8b        //(5)
4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v28.8b},[x2], x3           //(3)str 8 values
4369cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v16.8h, v19.8b, v20.8b      //(5)
4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
// Pointer updates for the next tile, selected by the flags from "subs x1":
// le -> reset x12 to x11+1 and x14 to x0, and step x6 by -8 and x5 by +8;
// gt -> advance x12 and x14 by 8.
4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x20, x11, #1                //x12 reset (cond loop)
4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x12, x20, x12,le
4409cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sshl        v25.8h, v25.8h, v29.8h      //(4)shr
4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x20, x12, #8                //col inc (cond loop)
4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x12, x20, x12,gt
4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         v5.8b,  v5.8b ,  v7.8b      //(5)
4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x20, x14, #8                //also for col inc (cond loop)
4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x14, x20, x14,gt
4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v6.8b,  v6.8b ,  v7.8b      //(5)
4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4509cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    xtn         v25.8b,  v25.8h             //(4)
4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v18.8h, v5.8b, v0.8b        //(6)
4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v22.8b, v4.8b[1]            //(7) broadcast byte 1 of v4 for row 7
4549cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v18.8h, v17.8b, v1.8b       //(6)
4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v26.8h,w4                   //(7) seed row-7 accumulator with w4
4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v18.8h, v6.8b, v3.8b        //(6)
4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4599cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    st1         {v25.8b},[x2], x3           //(4)str 8 values
4609cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v18.8h, v19.8b, v21.8b      //(6)
4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x14, x0, x14,le             //x14 reset (cond loop)
4639cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sshl        v16.8h, v16.8h, v29.8h      //(5)shr
4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x20, x6, #8                 //for next set of rows (cond loop)
4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x6, x20, x6,le
4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         v5.8b,  v5.8b ,  v7.8b      //(6)
4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x20, x5, #8                 // (cond loop)
4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x5, x20, x5,le
4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v6.8b,  v6.8b ,  v7.8b      //(6)
4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn         v16.8b,  v16.8h             //(5)
4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v26.8h, v5.8b, v0.8b        //(7)
4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v23.8b, v4.8b[0]            //(8) broadcast byte 0 of v4 for row 8 (last read of the old v4)
4779cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v26.8h, v17.8b, v1.8b       //(7)
4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v24.8h,w4                   //(8) seed row-8 accumulator with w4
4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v26.8h, v6.8b, v3.8b        //(7)
4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v16.8b},[x2], x3           //(5)str 8 values
4839cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v26.8h, v19.8b, v22.8b      //(7)
4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
// From here on the next tile's operands are reloaded. Each reload is placed
// after the last use of the old value in this pass, so the order matters.
4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v4.8b},[x6]                //(1n)(1-8)src[2nt-1-row]
4869cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sshl        v18.8h, v18.8h, v29.8h      //(6)shr
4870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         v5.8b,  v5.8b ,  v7.8b      //(7)
4890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v6.8b,  v6.8b ,  v7.8b      //(7)
4910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn         v18.8b,  v18.8h             //(6)
4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v24.8h, v5.8b, v0.8b        //(8) last use of the old v5
4940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v5.8b},[x5]                //(row+1 value) for the next tile
4969cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v24.8h, v17.8b, v1.8b       //(8) last use of the old v17
4970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v20.8b, v4.8b[7]            //(1n)(1) byte 7 of the freshly loaded v4
4990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    umlal       v24.8h, v6.8b, v3.8b        //(8) last use of the old v6 and v3
5000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v18.8b},[x2], x3           //(6)str 8 values
5029cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v24.8h, v19.8b, v23.8b      //(8) last use of the old v19
5030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5049cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    ld1         {v17.8b},[x12]              //(1n)(1-8)load 8 coeffs [col+1]
5050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v6.8b,  v2.8b ,  v5.8b      //(nt-1-row) value = v2 - (row+1)
5060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        x7, x7, #8                  //col counter; sets the flags tested by the bne below
5080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v3.8b},[x14]               //(1n)(1-8)load 8 src[2nt+1+col]
5109cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sshl        v26.8h, v26.8h, v29.8h      //(7)shr; narrowed and stored by the next pass or by epilog
5110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5129cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    dup         v27.8h,w4                   //(1n)(1) seed next tile's row-1 accumulator with w4
5139cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v19.8b,  v2.8b ,  v17.8b    //(1n)(1-8)[nt-1-col] = v2 - (col+1)
5140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bne         kernel_plnr                 // loop until "subs x7" reaches zero
5160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
// ---------------------------------------------------------------------------
// epilog: drains the software pipeline. Rows (7) and (8) of the last tile are
// still held as 16-bit accumulators in v26 and v24; they are narrowed to
// bytes and stored through x2 with stride x3.
// Row (7) gets no shift here: kernel_plnr already applies "sshl v26" before
// falling through (the path that enters via "beq epilog" presumably does the
// same in the prolog, which lies outside this chunk - confirm).
// ---------------------------------------------------------------------------
5170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarepilog:
5180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn         v26.8b,  v26.8h             //(7) narrow 16-bit lanes to bytes
5200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v26.8b},[x2], x3           //(7)str 8 values, dst += x3
5210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5229cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sshl        v24.8h, v24.8h, v29.8h      //(8)shr (shift counts come from v29)
5230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn         v24.8b,  v24.8h             //(8) narrow 16-bit lanes to bytes
5240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    st1         {v24.8b},[x2], x3           //(8)str 8 values, dst += x3
5250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//@ ========== ***************** =====================
5270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
// Nothing in epilog writes the condition flags, and both visible ways in
// ("beq epilog" taken, or "bne kernel_plnr" not taken) arrive with Z set, so
// on those paths this branch is always taken and skips the 4x4 code below.
5280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         end_loop
5290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
// ---------------------------------------------------------------------------
// tf_sz_4 / loop_sz_4: 4-wide path. Produces one output row per iteration and
// stores only the low 4 bytes of the result; x1 is the remaining-row count.
// v25 and v17 are loaded once before the loop; v5/v6 are stepped by v7 after
// every row. v0, v1, v2, v5, v6 and v7 are set before this chunk of the file.
// ---------------------------------------------------------------------------
5300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakartf_sz_4:
5319cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    ld1         {v25.8b},[x14]              //load src[2nt+1+col]
5329cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    ld1         {v17.8b},[x12], x10         //load 8 coeffs [col+1]; x12 += x10 (x10 still holds its earlier value)
5330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarloop_sz_4:
// NOTE(review): x10 is rewritten on every iteration but is not read again in
// the visible code after the post-indexed load above - confirm it is needed.
5340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x10, #4                     //reduce inc to #4 for 4x4
// NOTE(review): ldr fetches 32 bits, but the dup below replicates only the
// low byte of w7 and reads nothing above bit 31, so the sxtw has no effect
// on it; a byte load would appear sufficient - confirm before changing.
5350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         w7,  [x6], #-1              //src[2nt-1-row] (dec to take into account row)
5360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sxtw        x7,w7
5370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v4.8b,w7                    //src[2nt-1-row] broadcast to every byte lane
5380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5399cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v19.8b,  v2.8b ,  v17.8b    //[nt-1-col] (v2 and v17 do not change inside this loop)
5400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5419cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umull       v27.8h, v5.8b, v0.8b        //(row+1)    *    src[nt-1]  (umull starts a fresh accumulator)
5429cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v27.8h, v6.8b, v25.8b       //(nt-1-row)    *    src[2nt+1+col]
5439cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v27.8h, v17.8b, v1.8b       //(col+1)    *    src[3nt+1]
5449cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    umlal       v27.8h, v19.8b, v4.8b       //(nt-1-col)    *    src[2nt-1-row]
// The three commented-out ARM32 NEON lines below show an add / shift / narrow
// sequence; the rshrn that follows does round, shift right and narrow at once.
5450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//    vadd.i16    q6, q6, q8            @add (nt)
5460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//    vshl.s16     q6, q6, q7            @shr
5470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//    vmovn.i16     d12, q6
5489cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    rshrn       v27.8b, v27.8h,#3           // (sum + 4) >> 3, narrowed to bytes
5499cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    st1         {v27.s}[0],[x2], x3         // store the low 4 bytes of the row, dst += x3
5500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         v5.8b,  v5.8b ,  v7.8b      //row++ [(row+1)++]
5520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         v6.8b,  v6.8b ,  v7.8b      //[nt-1-row]--
5530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        x1, x1, #1                  // one row done; flags feed the bne below
5540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bne         loop_sz_4                   // repeat until x1 reaches zero
5560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
// ---------------------------------------------------------------------------
// end_loop: common exit. Pops x19/x20 from the stack (16 bytes, post-indexed)
// and returns to the caller. The matching save is presumably an stp at
// function entry, which lies outside this chunk - confirm.
// ---------------------------------------------------------------------------
5570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarend_loop:
5580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp (ARM32-style ldmfd, left disabled)
5590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldp         x19, x20,[sp],#16           // restore x19/x20, sp += 16
5609cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy
5610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ret
5620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
570