10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///***************************************************************************** 20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Licensed under the Apache License, Version 2.0 (the "License"); 60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* you may not use this file except in compliance with the License. 70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* You may obtain a copy of the License at: 80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* http://www.apache.org/licenses/LICENSE-2.0 100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Unless required by applicable law or agreed to in writing, software 120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* distributed under the License is distributed on an "AS IS" BASIS, 130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* See the License for the specific language governing permissions and 150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* limitations under the License. 
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*****************************************************************************/ 180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///** 190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//******************************************************************************* 200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @file 210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* ihevc_intra_pred_filters_planar.s 220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @brief 240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* contains function definitions for luma planar intra prediction. 250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* functions are coded using neon assembly and can be compiled using 260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* rvct 280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @author 300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* akshaya mukund 310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @par list of functions: 330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @remarks 360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* none 370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//******************************************************************************* 390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar//*/ 400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///** 410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//******************************************************************************* 420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @brief 440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* luma intraprediction filter for planar input 450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @par description: 470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @param[in] pu1_ref 490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* uword8 pointer to the source 500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @param[out] pu1_dst 520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* uword8 pointer to the destination 530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @param[in] src_strd 550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* integer source stride 560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @param[in] dst_strd 580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* integer destination stride 590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @param[in] pi1_coeff 610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* word8 pointer to the planar coefficients 620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @param[in] nt 
640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* size of transform block 650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @param[in] mode 670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* type of filtering 680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @returns 700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @remarks 720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* none 730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//******************************************************************************* 750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*/ 760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//void ihevc_intra_pred_luma_planar(uword8* pu1_ref, 780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// word32 src_strd, 790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// uword8* pu1_dst, 800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// word32 dst_strd, 810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// word32 nt, 820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// word32 mode, 830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// word32 pi1_coeff) 840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//**************variables vs registers***************************************** 850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x0 => *pu1_ref 860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x1 => src_strd 870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x2 => *pu1_dst 880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar//x3 => dst_strd 890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//remaining arguments arrive in registers on aarch64, not on the stack 910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x4 => nt 920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x5 => mode (overwritten before being read) 930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x6 => pi1_coeff (overwritten before being read) 940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.text 960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.align 4 970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.include "ihevc_neon_macros.s" 980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.globl ihevc_intra_pred_luma_planar_av8 1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.extern gau1_ihevc_planar_factor 1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.extern gau1_ihevc_planar_factor_1 1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.type ihevc_intra_pred_luma_planar_av8, %function 1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarihevc_intra_pred_luma_planar_av8: 1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments 1109cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy 1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar stp x19, x20,[sp,#-16]! 
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar adrp x11, :got:gau1_ihevc_planar_factor //loads table of coeffs 1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldr x11, [x11, #:got_lo12:gau1_ihevc_planar_factor] 1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar clz w5,w4 1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub x20, x5, #32 1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar neg x5, x20 1199cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy dup v29.8h,w5 1209cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy neg v29.8h, v29.8h //shr value (so vneg) 1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v2.8b,w4 //nt 1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v16.8h,w4 //nt 1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub x6, x4, #1 //nt-1 1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add x6, x6, x0 1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldr w7, [x6] 1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sxtw x7,w7 1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v0.8b,w7 //src[nt-1] 1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add x6, x4, x4,lsl #1 //3nt 1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add x6, x6, #1 //3nt + 1 1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add x6, x6, x0 1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldr w7, [x6] 1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sxtw x7,w7 1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v1.8b,w7 //src[3nt+1] 1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add x6, x4, x4 //2nt 1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add x14, x6, #1 //2nt+1 1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub x6, x6, #1 //2nt-1 1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add x6, x6, x0 //&src[2nt-1] 1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add x14, x14, x0 //&src[2nt+1] 1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x8, #1 //row+1 (row is first 0) 1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub x9, x4, x8 //nt-1-row (row is first 0) 1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v5.8b,w8 //row + 1 1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v6.8b,w9 //nt - 1 - row 1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v7.8b, v5.8b //mov #1 to d7 to used for inc for row+1 and dec for nt-1-row 1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add x12, x11, #1 //coeffs (to be reloaded after every row) 1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x1, x4 //nt (row counter) (dec after every row) 1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x5, x2 //dst (to be reloaded after every row and inc by dst_strd) 1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x10, #8 //increment for the coeffs 1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x0, x14 //&src[2nt+1] (to be reloaded after every row) 1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar cmp x4, #4 1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar beq tf_sz_4 
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//@ ========== ***************** ===================== 1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarprolog: 1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakartf_sz_8_16_32: 1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x7, x4 //column counter (set to no of cols) 1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lsr x9, x4, #3 //divide nt by 8 1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mul x7, x7, x9 //multiply width * height 1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar adrp x5, :got:gau1_ihevc_planar_factor_1 //loads table of coeffs 1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldr x5, [x5, #:got_lo12:gau1_ihevc_planar_factor_1] 1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub x6, x6, #7 1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x8, x2 1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lsl x9, x3, #3 //4*stride 1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub x20, x9, #8 //8-4*stride 1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar neg x9, x20 1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x10, x4 //nt 1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub x10, x10, #8 //nt - 8 1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarcol_loop_8_16_32: 1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1789cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ld1 {v17.8b},[x12] //(1-8)load 8 coeffs [col+1] 1799cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy dup v27.8h,w4 //(1) 1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ld1 {v4.8b},[x6] 
//(1-8)src[2nt-1-row] 1819cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy sub v19.8b, v2.8b , v17.8b //(1-8)[nt-1-col] 1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1849cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v27.8h, v5.8b, v0.8b //(1)(row+1) * src[nt-1] 1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ld1 {v3.8b},[x14] //(1-8)load 8 src[2nt+1+col] 1879cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v27.8h, v17.8b, v1.8b //(1)(col+1) * src[3nt+1] 1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v20.8b, v4.8b[7] //(1) 1909cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v27.8h, v6.8b, v3.8b //(1)(nt-1-row) * src[2nt+1+col] 1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v21.8b, v4.8b[6] //(2) 1939cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v27.8h, v19.8b, v20.8b //(1)(nt-1-col) * src[2nt-1-row] 1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v30.8h,w4 //(2) 1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add v5.8b, v5.8b , v7.8b //(1) 1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub v6.8b, v6.8b , v7.8b //(1) 1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v22.8b, v4.8b[5] //(3) 2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v30.8h, v5.8b, v0.8b //(2) 2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v28.8h,w4 
//(3) 2049cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v30.8h, v17.8b, v1.8b //(2) 2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v30.8h, v6.8b, v3.8b //(2) 2079cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v30.8h, v19.8b, v21.8b //(2) 2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2099cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy sshl v27.8h, v27.8h, v29.8h //(1)shr 2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add v5.8b, v5.8b , v7.8b //(2) 2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub v6.8b, v6.8b , v7.8b //(2) 2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2149cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy xtn v27.8b, v27.8h //(1) 2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v28.8h, v5.8b, v0.8b //(3) 2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v23.8b, v4.8b[4] //(4) 2189cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v28.8h, v17.8b, v1.8b //(3) 2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2209cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy dup v25.8h,w4 //(4) 2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v28.8h, v6.8b, v3.8b //(3) 2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2239cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy st1 {v27.8b},[x2], x3 //(1)str 8 values 2249cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v28.8h, v19.8b, v22.8b //(3) 2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2269cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy sshl v30.8h, v30.8h, v29.8h //(2)shr 
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add v5.8b, v5.8b , v7.8b //(3) 2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub v6.8b, v6.8b , v7.8b //(3) 2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v30.8b, v30.8h //(2) 2329cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v25.8h, v5.8b, v0.8b //(4) 2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v20.8b, v4.8b[3] //(5) 2359cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v25.8h, v17.8b, v1.8b //(4) 2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v16.8h,w4 //(5) 2389cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v25.8h, v6.8b, v3.8b //(4) 2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar st1 {v30.8b},[x2], x3 //(2)str 8 values 2419cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v25.8h, v19.8b, v23.8b //(4) 2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2439cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy sshl v28.8h, v28.8h, v29.8h //(3)shr 2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add v5.8b, v5.8b , v7.8b //(4) 2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub v6.8b, v6.8b , v7.8b //(4) 2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v28.8b, v28.8h //(3) 2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v16.8h, v5.8b, v0.8b //(5) 2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v21.8b, v4.8b[2] //(6) 2529cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v16.8h, v17.8b, v1.8b //(5) 2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v18.8h,w4 //(6) 2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v16.8h, v6.8b, v3.8b //(5) 2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar st1 {v28.8b},[x2], x3 //(3)str 8 values 2589cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v16.8h, v19.8b, v20.8b //(5) 2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2609cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy sshl v25.8h, v25.8h, v29.8h //(4)shr 2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add v5.8b, v5.8b , v7.8b //(5) 2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub v6.8b, v6.8b , v7.8b //(5) 2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2649cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy xtn v25.8b, v25.8h //(4) 2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v18.8h, v5.8b, v0.8b //(6) 2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v22.8b, v4.8b[1] //(7) 2689cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v18.8h, v17.8b, v1.8b //(6) 2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v26.8h,w4 //(7) 2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v18.8h, v6.8b, v3.8b //(6) 2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2739cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy st1 {v25.8b},[x2], x3 //(4)str 8 values 2749cbd70a2930875be59d7df68136ac9a1a949a13dNaveen 
Kumar Ponnusamy umlal v18.8h, v19.8b, v21.8b //(6) 2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2769cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy sshl v16.8h, v16.8h, v29.8h //(5)shr 2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add v5.8b, v5.8b , v7.8b //(6) 2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub v6.8b, v6.8b , v7.8b //(6) 2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v16.8b, v16.8h //(5) 2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v26.8h, v5.8b, v0.8b //(7) 2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v23.8b, v4.8b[0] //(8) 2859cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v26.8h, v17.8b, v1.8b //(7) 2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v24.8h,w4 //(8) 2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v26.8h, v6.8b, v3.8b //(7) 2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar st1 {v16.8b},[x2], x3 //(5)str 8 values 2919cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v26.8h, v19.8b, v22.8b //(7) 2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2939cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy sshl v18.8h, v18.8h, v29.8h //(6)shr 2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add v5.8b, v5.8b , v7.8b //(7) 2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub v6.8b, v6.8b , v7.8b //(7) 2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
xtn v18.8b, v18.8h //(6) 2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v24.8h, v5.8b, v0.8b //(8) 3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3029cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v24.8h, v17.8b, v1.8b //(8) 3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v24.8h, v6.8b, v3.8b //(8) 3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar st1 {v18.8b},[x2], x3 //(6)str 8 values 3079cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v24.8h, v19.8b, v23.8b //(8) 3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3099cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy sshl v26.8h, v26.8h, v29.8h //(7)shr 3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar subs x7, x7, #8 3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar beq epilog 3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar subs x1, x1, #8 //row counter 3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add x20, x12, #8 //col inc 3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x12, x20, x12,gt 3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add x20, x14, #8 //also for col inc 3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x14, x20, x14,gt 3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x1, x4, x1,le //nt reloaded (refresh the value) 3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add x20, x11, #1 //x12 reset 3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x12, x20, 
x12,le 3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x14, x0, x14,le //x14 reset 3259cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ld1 {v17.8b},[x12] //(1n)(1-8)load 8 coeffs [col+1] 3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub x20, x6, #8 //for next set of rows 3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x6, x20, x6,le 3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ld1 {v3.8b},[x14] //(1n)(1-8)load 8 src[2nt+1+col] 3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add x20, x5, #8 3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x5, x20, x5,le 3339cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy dup v27.8h,w4 //(1n)(1) 3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ld1 {v5.8b},[x5] 3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ld1 {v4.8b},[x6] //(1n)(1-8)src[2nt-1-row] 3389cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy sub v19.8b, v2.8b , v17.8b //(1n)(1-8)[nt-1-col] 3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v20.8b, v4.8b[7] //(1n)(1) 3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub v6.8b, v2.8b , v5.8b 3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar beq epilog 3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarkernel_plnr: 3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar cmp x1, #0 // (cond loop) 3489cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy sshl v24.8h, v24.8h, v29.8h //(8)shr 3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v26.8b, v26.8h //(7) 3519cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v27.8h, v5.8b, v0.8b //(1)(row+1) * src[nt-1] 3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v24.8b, v24.8h //(8) 3549cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v27.8h, v17.8b, v1.8b //(1)(col+1) * src[3nt+1] 3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v21.8b, v4.8b[6] //(2) 3579cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v27.8h, v6.8b, v3.8b //(1)(nt-1-row) * src[2nt+1+col] 3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v30.8h,w4 //(2) 3609cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v27.8h, v19.8b, v20.8b //(1)(nt-1-col) * src[2nt-1-row] 3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar st1 {v26.8b},[x2], x3 //(7)str 8 values 3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add v5.8b, v5.8b , v7.8b //(1) 3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar st1 {v24.8b},[x2], x3 //(8)str 8 values 3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub v6.8b, v6.8b , v7.8b //(1) 3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add x20, x2, x9 //since more cols to fill, dst + 8 - 6*strd (cond loop) 3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
csel x2, x20, x2,gt 3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v30.8h, v5.8b, v0.8b //(2) 3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub x20, x2, x10 //else go to next set of rows, dst - (nt-8) (cond loop) 3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x2, x20, x2,le 3749cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v30.8h, v17.8b, v1.8b //(2) 3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v22.8b, v4.8b[5] //(3) 3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v30.8h, v6.8b, v3.8b //(2) 3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v28.8h,w4 //(3) 3809cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v30.8h, v19.8b, v21.8b //(2) 3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3829cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy sshl v27.8h, v27.8h, v29.8h //(1)shr 3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add v5.8b, v5.8b , v7.8b //(2) 3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x1, x4, x1,le //nt reloaded (refresh the value) (cond loop) 3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub v6.8b, v6.8b , v7.8b //(2) 3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar subs x1, x1, #8 //row counter (loop) 3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3909cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy xtn v27.8b, v27.8h //(1) 3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v28.8h, v5.8b, v0.8b //(3) 3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v23.8b, v4.8b[4] //(4) 3949cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v28.8h, v17.8b, v1.8b //(3) 3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3969cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy dup v25.8h,w4 //(4) 3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v28.8h, v6.8b, v3.8b //(3) 3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3999cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy st1 {v27.8b},[x2], x3 //(1)str 8 values 4009cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v28.8h, v19.8b, v22.8b //(3) 4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4029cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy sshl v30.8h, v30.8h, v29.8h //(2)shr 4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add v5.8b, v5.8b , v7.8b //(3) 4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub v6.8b, v6.8b , v7.8b //(3) 4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v30.8b, v30.8h //(2) 4099cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v25.8h, v5.8b, v0.8b //(4) 4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v20.8b, v4.8b[3] //(5) 4129cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v25.8h, v17.8b, v1.8b //(4) 4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v16.8h,w4 //(5) 4159cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v25.8h, v6.8b, v3.8b //(4) 4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar st1 {v30.8b},[x2], x3 //(2)str 8 values 4189cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v25.8h, v19.8b, v23.8b //(4) 4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4209cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy sshl v28.8h, v28.8h, v29.8h //(3)shr 4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add v5.8b, v5.8b , v7.8b //(4) 4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub v6.8b, v6.8b , v7.8b //(4) 4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v28.8b, v28.8h //(3) 4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v16.8h, v5.8b, v0.8b //(5) 4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v21.8b, v4.8b[2] //(6) 4309cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v16.8h, v17.8b, v1.8b //(5) 4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v18.8h,w4 //(6) 4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v16.8h, v6.8b, v3.8b //(5) 4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar st1 {v28.8b},[x2], x3 //(3)str 8 values 4369cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v16.8h, v19.8b, v20.8b //(5) 4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add x20, x11, #1 //x12 reset (cond loop) 4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x12, x20, x12,le 4409cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy sshl 
v25.8h, v25.8h, v29.8h //(4)shr 4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add x20, x12, #8 //col inc (cond loop) 4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x12, x20, x12,gt 4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add v5.8b, v5.8b , v7.8b //(5) 4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add x20, x14, #8 //also for col inc (cond loop) 4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x14, x20, x14,gt 4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub v6.8b, v6.8b , v7.8b //(5) 4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4509cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy xtn v25.8b, v25.8h //(4) 4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v18.8h, v5.8b, v0.8b //(6) 4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v22.8b, v4.8b[1] //(7) 4549cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v18.8h, v17.8b, v1.8b //(6) 4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v26.8h,w4 //(7) 4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v18.8h, v6.8b, v3.8b //(6) 4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4599cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy st1 {v25.8b},[x2], x3 //(4)str 8 values 4609cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v18.8h, v19.8b, v21.8b //(6) 4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x14, x0, x14,le //x14 reset (cond loop) 4639cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy sshl v16.8h, v16.8h, v29.8h //(5)shr 
4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub x20, x6, #8 //for next set of rows (cond loop) 4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x6, x20, x6,le 4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add v5.8b, v5.8b , v7.8b //(6) 4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add x20, x5, #8 // (cond loop) 4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x5, x20, x5,le 4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub v6.8b, v6.8b , v7.8b //(6) 4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v16.8b, v16.8h //(5) 4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v26.8h, v5.8b, v0.8b //(7) 4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v23.8b, v4.8b[0] //(8) 4779cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v26.8h, v17.8b, v1.8b //(7) 4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v24.8h,w4 //(8) 4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v26.8h, v6.8b, v3.8b //(7) 4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar st1 {v16.8b},[x2], x3 //(5)str 8 values 4839cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v26.8h, v19.8b, v22.8b //(7) 4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ld1 {v4.8b},[x6] //(1n)(1-8)src[2nt-1-row] 4869cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy sshl v18.8h, v18.8h, v29.8h //(6)shr 4870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add v5.8b, v5.8b , v7.8b //(7) 4890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub v6.8b, v6.8b , v7.8b //(7) 4910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v18.8b, v18.8h //(6) 4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v24.8h, v5.8b, v0.8b //(8) 4940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ld1 {v5.8b},[x5] //(row+1 value) 4969cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v24.8h, v17.8b, v1.8b //(8) 4970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v20.8b, v4.8b[7] //(1n)(1) 4990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar umlal v24.8h, v6.8b, v3.8b //(8) 5000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar st1 {v18.8b},[x2], x3 //(6)str 8 values 5029cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v24.8h, v19.8b, v23.8b //(8) 5030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5049cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ld1 {v17.8b},[x12] //(1n)(1-8)load 8 coeffs [col+1] 5050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub v6.8b, v2.8b , v5.8b //(nt-1-row) value 5060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar subs x7, x7, #8 //col counter 5080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ld1 {v3.8b},[x14] //(1n)(1-8)load 8 src[2nt+1+col] 5109cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy sshl v26.8h, v26.8h, v29.8h //(7)shr 
5110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5129cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy dup v27.8h,w4 //(1n)(1) 5139cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy sub v19.8b, v2.8b , v17.8b //(1n)(1-8)[nt-1-col] 5140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar bne kernel_plnr 5160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarepilog: 5180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v26.8b, v26.8h //(7) 5200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar st1 {v26.8b},[x2], x3 //(7)str 8 values 5210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5229cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy sshl v24.8h, v24.8h, v29.8h //(8)shr 5230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v24.8b, v24.8h //(8) 5240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar st1 {v24.8b},[x2], x3 //(8)str 8 values 5250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//@ ========== ***************** ===================== 5270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar beq end_loop 5290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakartf_sz_4: 5319cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ld1 {v25.8b},[x14] //load src[2nt+1+col] 5329cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ld1 {v17.8b},[x12], x10 //load 8 coeffs [col+1] 5330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarloop_sz_4: 5340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x10, #4 //reduce inc to #4 for 4x4 
5350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldr w7, [x6], #-1 //src[2nt-1-row] (dec to take into account row) 5360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sxtw x7,w7 5370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dup v4.8b,w7 //src[2nt-1-row] 5380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5399cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy sub v19.8b, v2.8b , v17.8b //[nt-1-col] 5400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5419cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umull v27.8h, v5.8b, v0.8b //(row+1) * src[nt-1] 5429cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v27.8h, v6.8b, v25.8b //(nt-1-row) * src[2nt+1+col] 5439cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v27.8h, v17.8b, v1.8b //(col+1) * src[3nt+1] 5449cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy umlal v27.8h, v19.8b, v4.8b //(nt-1-col) * src[2nt-1-row] 5450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// vadd.i16 q6, q6, q8 @add (nt) 5460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// vshl.s16 q6, q6, q7 @shr 5470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// vmovn.i16 d12, q6 5489cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy rshrn v27.8b, v27.8h,#3 5499cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy st1 {v27.s}[0],[x2], x3 5500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add v5.8b, v5.8b , v7.8b //row++ [(row+1)++] 5520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar sub v6.8b, v6.8b , v7.8b //[nt-1-row]-- 5530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar subs x1, x1, #1 5540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar bne loop_sz_4 5560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
5570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarend_loop: 5580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // ldmfd sp!,{x4-x12,x15} //reload the registers from sp 5590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldp x19, x20,[sp],#16 5609cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy 5610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ret 5620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 570