10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///*****************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*****************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///**
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//******************************************************************************
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* //file
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  ihevc_inter_pred_filters_luma_vert_w16inp.s
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* //brief
//*  contains function definitions for inter prediction interpolation.
//*  functions are coded using neon assembly instructions and can be compiled
//*  using rvct
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* //author
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  yogeswaran rs
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* //par list of functions:
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
//*  - ihevc_inter_pred_luma_vert_w16inp_w16out_av8()
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* //remarks
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  none
380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*******************************************************************************
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*/
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///* include reconstruction */
440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//
450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///**
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*******************************************************************************
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* //brief
//*    luma vertical filter for 16bit input and 16bit output.
//*
//* //par description:
//*     applies a vertical 8-tap filter with coefficients pointed to by
//*     'pi1_coeff' to the 16-bit elements pointed to by 'pi2_src' and writes
//*     16-bit results to the location pointed to by 'pu1_dst'. an offset of
//*     0x80000 is subtracted from each 32-bit filter sum, which is then
//*     downshifted by 6 and narrowed to 16 bits with a plain (non-saturating)
//*     shift. assumptions : the function is optimized considering the fact
//*     that width is a multiple of 4 and height is a multiple of 2.
590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* //param[in] pi2_src
610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  word16 pointer to the source
620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
//* //param[out] pu1_dst
//*  pointer to the destination; results are stored as 16-bit elements
//*
//* //param[in] src_strd
//*  integer source stride, in 16-bit elements (doubled to bytes internally)
//*
//* //param[in] dst_strd
//*  integer destination stride, in 16-bit elements (doubled to bytes internally)
710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* //param[in] pi1_coeff
730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  word8 pointer to the filter coefficients
740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* //param[in] ht
760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  integer height of the array
770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* //param[in] wd
790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  integer width of the array
800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* //returns
820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* //remarks
840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  none
850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*******************************************************************************
870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*/
880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
//void ihevc_inter_pred_luma_vert_w16inp_w16out_av8(word16 *pi2_src,
//                                                  word16 *pu1_dst,
//                                                  word32 src_strd,
//                                                  word32 dst_strd,
//                                                  word8 *pi1_coeff,
//                                                  word32 ht,
//                                                  word32 wd   )
960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//**************variables vs registers*****************************************
//  x0 => *pi2_src
//  x1 => *pu1_dst
//  x2 =>  src_strd
//  x3 =>  dst_strd
//  x4 => *pi1_coeff
//  x5 =>  ht
//  x6 =>  wd
1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.text
1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.align 4
1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.include "ihevc_neon_macros.s"
1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.globl ihevc_inter_pred_luma_vert_w16inp_w16out_av8
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.type ihevc_inter_pred_luma_vert_w16inp_w16out_av8, %function
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarihevc_inter_pred_luma_vert_w16inp_w16out_av8:
1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //stmfd     sp!, {r4-r12, r14}  //stack stores the values of the arguments
1179cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy
1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    stp         x19,x20,[sp, #-16]!
1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x15,x4 // pi1_coeff
1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x16,x5 // ht
1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x17,x6 // wd
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x12,x15                     //load pi1_coeff
1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lsl         x6,x3,#1
1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x5,x17                      //load wd
1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v0.8b},[x12]               //coeff = ld1_s8(pi1_coeff)
1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lsl         x2, x2,#1
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x12,x2,x2,lsl #2            //src_ctrd & pi1_coeff
1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //vabs.s8   d0,d0               //vabs_s8(coeff)
1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x0,x0,x12                   //r0->pu1_src   r12->pi1_coeff
1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x3,x16                      //load ht
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        x7,x3,#0                    //r3->ht
1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //ble       end_loops           //end loop jump
1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sxtl        v0.8h,v0.8b
1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v22.4h,v0.h[0]              //coeffabs_0 = vdup_lane_u8(coeffabs, 0)//
1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v23.4h,v0.h[1]              //coeffabs_1 = vdup_lane_u8(coeffabs, 1)//
1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v24.4h,v0.h[2]              //coeffabs_2 = vdup_lane_u8(coeffabs, 2)//
1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v25.4h,v0.h[3]              //coeffabs_3 = vdup_lane_u8(coeffabs, 3)//
1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v26.4h,v0.h[4]              //coeffabs_4 = vdup_lane_u8(coeffabs, 4)//
1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v27.4h,v0.h[5]              //coeffabs_5 = vdup_lane_u8(coeffabs, 5)//
1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v28.4h,v0.h[6]              //coeffabs_6 = vdup_lane_u8(coeffabs, 6)//
1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v29.4h,v0.h[7]              //coeffabs_7 = vdup_lane_u8(coeffabs, 7)//
1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    movi        v30.4s,#8, lsl #16
1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x9,x5,x6,lsl #2             //r6->dst_strd  r5  ->wd
1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    neg         x9,x9
1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x8,x5,x2,lsl #2             //r2->src_strd
1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    neg         x8,x8
1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x8,x8,x5
1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x9,x9,x5
1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lsr         x3, x5, #2                  //divide by 4
1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mul         x7, x7, x3                  //multiply height by width
1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x7, x7, #4                  //subtract by one for epilog
1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x4,x5                       //r5 ->wd
1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //mov           r2, r2, lsl #1
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarprolog:
1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x3,x0,x2                    //pu1_src_tmp += src_strd//
1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v1.4h},[x3],x2             //src_tmp2 = ld1_u8(pu1_src_tmp)//
1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v0.4h},[x0], #8            //src_tmp1 = ld1_u8(pu1_src_tmp)//
1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        x4,x4,#4
1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v2.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
1669cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smull       v19.4s,v1.4h,v23.4h         //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v3.4h},[x3],x2             //src_tmp4 = ld1_u8(pu1_src_tmp)//
1689cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v0.4h,v22.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v4.4h},[x3],x2             //src_tmp1 = ld1_u8(pu1_src_tmp)//
1709cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v2.4h,v24.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v5.4h},[x3],x2             //src_tmp2 = ld1_u8(pu1_src_tmp)//
1729cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v3.4h,v25.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v6.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
1749cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v4.4h,v26.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v7.4h},[x3],x2             //src_tmp4 = ld1_u8(pu1_src_tmp)//
1769cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v5.4h,v27.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
1779cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v6.4h,v28.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
1789cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v7.4h,v29.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v16.4h},[x3],x2            //src_tmp1 = ld1_u8(pu1_src_tmp)//
1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1829cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smull       v20.4s,v2.4h,v23.4h         //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x20,x0,x8,lsl #0
1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x0,x20,x0,le
1859cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v1.4h,v22.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x4,x5,x4,le
1879cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v3.4h,v24.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v17.4h},[x3],x2            //src_tmp2 = ld1_u8(pu1_src_tmp)//
1899cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v4.4h,v25.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v18.4h},[x3],x2            //src_tmp3 = ld1_u8(pu1_src_tmp)//
1919cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v5.4h,v26.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x3,x0,x2                    //pu1_src_tmp += src_strd//
1939cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v6.4h,v27.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
1949cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v7.4h,v28.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
1959cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v16.4h,v29.4h        //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
1969cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v19.4s, v19.4s, v30.4s
1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v1.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
1999cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smull       v21.4s,v3.4h,v23.4h
2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v0.4h},[x0],#8             //src_tmp1 = ld1_u8(pu1_src_tmp)//
2019cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v2.4h,v22.4h
2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v2.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
2039cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v4.4h,v24.4h
2049cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v5.4h,v25.4h
2059cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v6.4h,v26.4h
2069cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v7.4h,v27.4h
2079cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v16.4h,v28.4h
2089cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v17.4h,v29.4h
2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x14,x1,x6
2109cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v20.4s, v20.4s, v30.4s
2119cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    shrn        v19.4h, v19.4s, #6
2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //vqrshrun d8,q4,#6         //sto_res = vqmovun_s16(sto_res_tmp)//
2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2149cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smull       v31.4s,v4.4h,v23.4h
2159cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v3.4h,v22.4h
2169cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v5.4h,v24.4h
2179cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v6.4h,v25.4h
2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v3.4h},[x3],x2             //src_tmp4 = ld1_u8(pu1_src_tmp)//
2199cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v7.4h,v26.4h
2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v4.4h},[x3],x2             //src_tmp1 = ld1_u8(pu1_src_tmp)//
2219cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v16.4h,v27.4h
2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v5.4h},[x3],x2             //src_tmp2 = ld1_u8(pu1_src_tmp)//
2239cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v17.4h,v28.4h
2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v6.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
2259cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v18.4h,v29.4h
2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v7.4h},[x3],x2             //src_tmp4 = ld1_u8(pu1_src_tmp)//
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2289cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    st1         {v19.2s},[x1],#8            //st1_u8(pu1_dst,sto_res)//
2299cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v21.4s, v21.4s, v30.4s
2309cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    shrn        v20.4h, v20.4s, #6
2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //vqrshrun d10,q5,#6            //sto_res = vqmovun_s16(sto_res_tmp)//
2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x20, x1, x9
2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x1, x20, x1, le
2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        x7,x7,#4
2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    blt         epilog_end                  //jumps to epilog_end
2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         epilog                      //jumps to epilog
2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarkernel_8:
2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2439cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smull       v19.4s,v1.4h,v23.4h         //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        x4,x4,#4
2459cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v0.4h,v22.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x20,x0,x8,lsl #0
2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x0,x20,x0,le
2489cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v2.4h,v24.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
2499cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v3.4h,v25.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
2509cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v4.4h,v26.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
2519cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v5.4h,v27.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
2529cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v6.4h,v28.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
2539cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v7.4h,v29.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
2549cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    st1         {v20.2s},[x14],x6           //st1_u8(pu1_dst_tmp,sto_res)//
2559cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy
2569cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v31.4S, v31.4s, v30.4s
2579cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    shrn        v21.4h, v21.4s, #6
2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //vqrshrun d12,q6,#6
2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v16.4h},[x3],x2            //src_tmp1 = ld1_u8(pu1_src_tmp)//
2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2619cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smull       v20.4s,v2.4h,v23.4h         //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
2629cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v1.4h,v22.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
2639cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v3.4h,v24.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
2649cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v4.4h,v25.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
2659cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v5.4h,v26.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
2669cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v6.4h,v27.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
2679cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    st1         {v21.2s},[x14],x6
2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2699cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v7.4h,v28.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v17.4h},[x3],x2            //src_tmp2 = ld1_u8(pu1_src_tmp)//
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2729cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v16.4h,v29.4h        //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2749cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v19.4s, v19.4s, v30.4s
2759cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    shrn        v31.4h, v31.4s, #6
2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //vqrshrun d14,q7,#6
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2789cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smull       v21.4s,v3.4h,v23.4h
2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x4,x5,x4,le
2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2819cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v2.4h,v22.4h
2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v18.4h},[x3],x2            //src_tmp3 = ld1_u8(pu1_src_tmp)//
2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2849cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v4.4h,v24.4h
2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x3,x0,x2                    //pu1_src_tmp += src_strd//
2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2879cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v5.4h,v25.4h
2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2899cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v6.4h,v26.4h
2909cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    st1         {v31.2s},[x14],x6
2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2929cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v7.4h,v27.4h
2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v1.4h},[x3],x2             //src_tmp2 = ld1_u8(pu1_src_tmp)//
2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2959cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v16.4h,v28.4h
2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x14,x1,x6
2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2989cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v17.4h,v29.4h
2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v0.4h},[x0],#8             //src_tmp1 = ld1_u8(pu1_src_tmp)//
3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3019cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v20.4s, v20.4s, v30.4s
3029cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    shrn        v19.4h, v19.4s, #6
3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //vqrshrun d8,q4,#6         //sto_res = vqmovun_s16(sto_res_tmp)//
3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v2.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3069cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smull       v31.4s,v4.4h,v23.4h
3079cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v3.4h,v22.4h
3089cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v5.4h,v24.4h
3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v3.4h},[x3],x2             //src_tmp4 = ld1_u8(pu1_src_tmp)//
3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3119cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v6.4h,v25.4h
3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v4.4h},[x3],x2             //src_tmp1 = ld1_u8(pu1_src_tmp)//
3139cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v7.4h,v26.4h
3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v5.4h},[x3],x2             //src_tmp2 = ld1_u8(pu1_src_tmp)//
3159cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v16.4h,v27.4h
3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v6.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
3179cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v17.4h,v28.4h
3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v7.4h},[x3],x2             //src_tmp4 = ld1_u8(pu1_src_tmp)//
3199cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v18.4h,v29.4h
3209cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    st1         {v19.2s},[x1],#8            //st1_u8(pu1_dst,sto_res)//
3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3229cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v21.4s, v21.4s, v30.4s
3239cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    shrn        v20.4h, v20.4s, #6
3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x20, x1, x9
3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x1, x20, x1, le
3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //vqrshrun d10,q5,#6            //sto_res = vqmovun_s16(sto_res_tmp)//
3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        x7,x7,#4
3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bgt         kernel_8                    //jumps to kernel_8
3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarepilog:
    //Pipeline drain: computes the last group of four output rows (out0..out3)
    //while storing the three rows still pending in v20, v21 and v31.
    //Each output row is an 8-tap sum: smull/smlal multiply signed 16-bit
    //lanes (.4h) of eight consecutive source rows by v22..v29 (tap0..tap7)
    //and accumulate in 32-bit lanes (.4s). A row is finished by subtracting
    //v30, then shrn #6 (truncating shift, narrow to 16 bit) and an 8-byte st1.
    //On fall-through from the kernel_8 loop, v0-v7 hold source rows r0..r7.
    //NOTE(review): v22-v29, v30 and x6 are set up before this section;
    //v30 is presumably a constant offset and x6 the destination stride in
    //bytes - confirm against the function prologue.
3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3349cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smull       v19.4s,v1.4h,v23.4h         //out0  = r1 * tap1 (signed 16x16->32)
3359cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v0.4h,v22.4h         //out0 += r0 * tap0
3369cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v2.4h,v24.4h         //out0 += r2 * tap2
3379cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v3.4h,v25.4h         //out0 += r3 * tap3
3389cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v4.4h,v26.4h         //out0 += r4 * tap4
3399cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v5.4h,v27.4h         //out0 += r5 * tap5
3409cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v6.4h,v28.4h         //out0 += r6 * tap6
3419cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v19.4s,v7.4h,v29.4h         //out0 += r7 * tap7
3429cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    st1         {v20.2s},[x14],x6           //store pending row in v20 at x14, x14 += x6
3439cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy
3449cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v31.4s, v31.4s, v30.4s      //pending row in v31: subtract v30
3459cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    shrn        v21.4h, v21.4s, #6          //pending row in v21: >>6 and narrow to 16 bit
3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //vqrshrun d12,q6,#6
3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v16.4h},[x3],x2            //r8 = four halfwords at x3, x3 += x2
3499cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smull       v20.4s,v2.4h,v23.4h         //out1  = r2 * tap1
3509cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v1.4h,v22.4h         //out1 += r1 * tap0
3519cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v3.4h,v24.4h         //out1 += r3 * tap2
3529cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v4.4h,v25.4h         //out1 += r4 * tap3
3539cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v5.4h,v26.4h         //out1 += r5 * tap4
3549cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v6.4h,v27.4h         //out1 += r6 * tap5
3559cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v7.4h,v28.4h         //out1 += r7 * tap6
3569cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v20.4s,v16.4h,v29.4h        //out1 += r8 * tap7
3579cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    st1         {v21.2s},[x14],x6           //store pending row in v21 at x14, x14 += x6
3589cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy
3599cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v19.4s, v19.4s, v30.4s      //out0 -= v30
3609cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    shrn        v31.4h, v31.4s, #6          //pending row in v31: >>6 and narrow to 16 bit
3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //vqrshrun d14,q7,#6
3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v17.4h},[x3],x2            //r9 = four halfwords at x3, x3 += x2
3649cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smull       v21.4s,v3.4h,v23.4h         //out2  = r3 * tap1
3659cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v2.4h,v22.4h         //out2 += r2 * tap0
3669cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v4.4h,v24.4h         //out2 += r4 * tap2
3679cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v5.4h,v25.4h         //out2 += r5 * tap3
3689cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v6.4h,v26.4h         //out2 += r6 * tap4
3699cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v7.4h,v27.4h         //out2 += r7 * tap5
3709cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v16.4h,v28.4h        //out2 += r8 * tap6
3719cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v21.4s,v17.4h,v29.4h        //out2 += r9 * tap7
3729cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    st1         {v31.2s},[x14],x6           //store pending row in v31 at x14, x14 += x6
3739cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v20.4s, v20.4s, v30.4s      //out1 -= v30
3749cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    shrn        v19.4h, v19.4s, #6          //out0: >>6 and narrow to 16 bit
3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //vqrshrun d8,q4,#6         //sto_res = vqmovun_s16(sto_res_tmp)//
3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ld1         {v18.4h},[x3],x2            //r10 = four halfwords at x3, x3 += x2
3789cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smull       v31.4s,v4.4h,v23.4h         //out3  = r4 * tap1
3799cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v3.4h,v22.4h         //out3 += r3 * tap0
3809cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v5.4h,v24.4h         //out3 += r5 * tap2
3819cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v6.4h,v25.4h         //out3 += r6 * tap3
3829cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v7.4h,v26.4h         //out3 += r7 * tap4
3839cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v16.4h,v27.4h        //out3 += r8 * tap5
3849cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v17.4h,v28.4h        //out3 += r9 * tap6
3859cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    smlal       v31.4s,v18.4h,v29.4h        //out3 += r10 * tap7
3869cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v21.4s, v21.4s, v30.4s      //out2 -= v30
3879cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    shrn        v20.4h, v20.4s, #6          //out1: >>6 and narrow to 16 bit
3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //vqrshrun d10,q5,#6            //sto_res = vqmovun_s16(sto_res_tmp)//
3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         x14,x1,x6                   //x14 = x1 + x6, where out1..out3 are stored next
3919cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    st1         {v19.2s},[x1],#8            //store out0 (8 bytes) at x1, x1 += 8
3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarepilog_end:
    //Flushes the three rows still in flight to successive addresses at x14,
    //advancing x14 by x6 after each 8-byte store. When reached by falling
    //through from epilog: v20 is already narrowed, v21 still needs the
    //shift/narrow, and v31 still needs both the v30 subtraction and the
    //shift/narrow.
    //NOTE(review): any other branch into this label is outside the visible
    //code and must provide the same register state - confirm.
3949cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    st1         {v20.2s},[x14],x6           //store row in v20 at x14, x14 += x6
3959cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    shrn        v21.4h, v21.4s, #6          //row in v21: truncating >>6, narrow to 16 bit
3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //vqrshrun d12,q6,#6
3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3989cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    st1         {v21.2s},[x14],x6           //store row in v21 at x14, x14 += x6
3999cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    sub         v31.4s, v31.4s, v30.4s      //row in v31: subtract v30
4009cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    shrn        v31.4h, v31.4s, #6          //row in v31: truncating >>6, narrow to 16 bit
4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //vqrshrun d14,q7,#6
4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4039cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    st1         {v31.2s},[x14],x6           //store row in v31 at x14, x14 += x6
4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarend_loops:
    //Common exit: restores x19/x20 from the stack and returns.
    //NOTE(review): the matching save of x19/x20 is presumably in the
    //function prologue, outside the visible code - confirm it pushes
    //exactly 16 bytes so sp is balanced here.
4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //legacy 32-bit ARM form, kept for reference: ldmfd sp!,{r4-r12,r15}
4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldp         x19, x20,[sp], #16          //reload x19 and x20 from [sp], then sp += 16
4109cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy
4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ret                                     //return to the address in x30
4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
419