18d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//******************************************************************************
28d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
38d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* Copyright (C) 2015 The Android Open Source Project
48d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
58d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* Licensed under the Apache License, Version 2.0 (the "License");
68d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* you may not use this file except in compliance with the License.
78d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* You may obtain a copy of the License at:
88d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
98d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* http://www.apache.org/licenses/LICENSE-2.0
108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* Unless required by applicable law or agreed to in writing, software
128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* distributed under the License is distributed on an "AS IS" BASIS,
138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* See the License for the specific language governing permissions and
158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* limitations under the License.
168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*****************************************************************************
188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*/
208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S///**
218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//******************************************************************************
228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @file
238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*  ih264_inter_pred_luma_vert_qpel_av8.s
248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @brief
268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*  Contains function definitions for inter prediction vertical quarter pel interpolation.
278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @author
298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*  Mohit
308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @par List of Functions:
328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*  - ih264_inter_pred_luma_vert_qpel_av8()
348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @remarks
368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*  None
378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*******************************************************************************
398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*/
408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S///* All the functions here are replicated from ih264_inter_pred_filters.c
428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//
438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S///**
458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S///**
468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*******************************************************************************
478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @brief
498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*     Quarter pel interprediction luma filter for vertical input
508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @par Description:
528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* Applies a 6 tap vertical filter. The output is clipped to 8 bits
538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* sec 8.4.2.2.1 titled "Luma sample interpolation process"
548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @param[in] pu1_src
568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*  UWORD8 pointer to the source
578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @param[out] pu1_dst
598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*  UWORD8 pointer to the destination
608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @param[in] src_strd
628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*  integer source stride
638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @param[in] dst_strd
658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*  integer destination stride
668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @param[in] ht
688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*  integer height of the array
698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @param[in] wd
718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*  integer width of the array
728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @param[in] pu1_tmp: temporary buffer: UNUSED in this function
748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @param[in] dydx: x and y reference offset for qpel calculations.
768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @returns
778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @remarks
798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*  None
808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*
818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*******************************************************************************
828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*/
838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//void ih264_inter_pred_luma_vert_qpel_av8 (
858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//                            UWORD8 *pu1_src,
868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//                            UWORD8 *pu1_dst,
878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//                            WORD32 src_strd,
888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//                            WORD32 dst_strd,
898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//                            WORD32 ht,
908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//                            WORD32 wd,
918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//                              UWORD8* pu1_tmp,
928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//                             UWORD32 dydx)
938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//**************Variables Vs Registers*****************************************
958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//    x0 => *pu1_src
968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//    x1 => *pu1_dst
97d91f49ad65795b8d3223f1aba481bf3931b291e6Martin Storsjo//    w2 =>  src_strd
98d91f49ad65795b8d3223f1aba481bf3931b291e6Martin Storsjo//    w3 =>  dst_strd
99d91f49ad65795b8d3223f1aba481bf3931b291e6Martin Storsjo//    w4 =>  ht
100d91f49ad65795b8d3223f1aba481bf3931b291e6Martin Storsjo//    w5 =>  wd
101d91f49ad65795b8d3223f1aba481bf3931b291e6Martin Storsjo//    w7 =>  dydx
1028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S.text
1048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S.p2align 2
1058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S.include "ih264_neon_macros.s"
1068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    .global ih264_inter_pred_luma_vert_qpel_av8
1108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sih264_inter_pred_luma_vert_qpel_av8:
1128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    push_v_regs
1148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    stp       x19, x20, [sp, #-16]!
115d91f49ad65795b8d3223f1aba481bf3931b291e6Martin Storsjo    sxtw      x2, w2
116d91f49ad65795b8d3223f1aba481bf3931b291e6Martin Storsjo    sxtw      x3, w3
117d91f49ad65795b8d3223f1aba481bf3931b291e6Martin Storsjo    sxtw      x4, w4
118d91f49ad65795b8d3223f1aba481bf3931b291e6Martin Storsjo    sxtw      x5, w5
1198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    and       x7, x7, #12               //Finds y-offset
1228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    lsr       x7, x7, #3                //dydx>>3
1238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mul       x7, x2, x7
1248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    add       x7, x0, x7                //pu1_src + (y_offset>>1)*src_strd
1258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub       x14, x4, #16
1268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    movi      v22.8h, #20               // Filter coeff 0x14 into Q11
1278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub       x0, x0, x2, lsl #1        //pu1_src-2*src_strd
1288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    subs      x12, x5, #8               //if wd=8 branch to loop_8
1298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    movi      v24.8h, #5                // Filter coeff 0x4  into Q12
1308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    beq       loop_8_start
1318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    subs      x12, x5, #4               //if wd=4 branch to loop_4
1338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    beq       loop_4_start
1348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v0.2s, v1.2s}, [x0], x2  // Vector load from src[0_0]
1378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v2.2s, v3.2s}, [x0], x2  // Vector load from src[1_0]
1388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v4.2s, v5.2s}, [x0], x2  // Vector load from src[2_0]
1398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v6.2s, v7.2s}, [x0], x2  // Vector load from src[3_0]
1408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    add       x14, x14, #1              //for checking loop
1418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v8.2s, v9.2s}, [x0], x2  // Vector load from src[4_0]
1428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v12.8h, v4.8b, v6.8b      // temp1 = src[2_0] + src[3_0]
1438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0]
1448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sloop_16:                                //when  wd=16
1468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v14.8h, v0.8b, v10.8b     // temp = src[0_0] + src[5_0]
1488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v16.8h, v2.8b, v8.8b      // temp2 = src[1_0] + src[4_0]
1498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v14.8h, v12.8h , v22.8h   // temp += temp1 * 20
1508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v20.8h, v1.8b, v11.8b     // temp4 = src[0_8] + src[5_8]
1518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v18.8h, v5.8b, v7.8b      // temp3 = src[2_8] + src[3_8]
1528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v20.8h, v18.8h , v22.8h   // temp4 += temp3 * 20
1538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v0.2s, v1.2s}, [x0], x2
1548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v26.8h, v3.8b, v9.8b      // temp5 = src[1_8] + src[4_8]
1558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v12.8h, v6.8b, v8.8b
1568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v14.8h, v16.8h , v24.8h   // temp -= temp2 * 5
1578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v16.8h, v2.8b, v0.8b
1588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v18.8h, v4.8b, v10.8b
1598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v16.8h, v12.8h , v22.8h
1608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v20.8h, v26.8h , v24.8h   // temp4 -= temp5 * 5
1618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v26.8h, v5.8b, v11.8b
1628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v12.8h, v7.8b, v9.8b
1638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v30.8b, v14.8h, #5        // dst[0_0] = CLIP_U8((temp +16) >> 5)
1648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v14.8h, v3.8b, v1.8b
1658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v2.2s, v3.2s}, [x0], x2
1668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v14.8h, v12.8h , v22.8h
1678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v16.8h, v18.8h , v24.8h
1688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v31.8b, v20.8h, #5        // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
1698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v20.2s, v21.2s}, [x7], x2 // Load for interpolation row 0
1708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v30.16b, v20.16b , v30.16b // Interpolation to obtain qpel value
1718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v31.16b, v21.16b , v31.16b // Interpolation to obtain qpel value
1728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v18.8h, v4.8b, v2.8b
1738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v12.8h, v8.8b, v10.8b
1748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v30.2s, v31.2s}, [x1], x3 // Vector store to dst[0_0]
1758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v18.8h, v12.8h , v22.8h
1768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v20.8h, v6.8b, v0.8b
1778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v14.8h, v26.8h , v24.8h
1788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v30.8b, v16.8h, #5
1798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v12.8h, v9.8b, v11.8b
1808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v16.8h, v5.8b, v3.8b
1818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v26.8h, v7.8b, v1.8b
1828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v16.8h, v12.8h , v22.8h
1838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v18.8h, v20.8h , v24.8h
1848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v4.2s, v5.2s}, [x0], x2
1858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v31.8b, v14.8h, #5
1868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v14.2s, v15.2s}, [x7], x2 // Load for interpolation row 1
1878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v12.8h, v10.8b, v0.8b
1888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v30.16b, v14.16b , v30.16b // Interpolation to obtain qpel value
1898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v31.16b, v15.16b , v31.16b // Interpolation to obtain qpel value
1908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v14.8h, v6.8b, v4.8b
1918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v20.8h, v8.8b, v2.8b
1928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v14.8h, v12.8h , v22.8h
1938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v16.8h, v26.8h , v24.8h
1948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v30.2s, v31.2s}, [x1], x3 //store row 1
1958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v30.8b, v18.8h, #5
1968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v18.8h, v7.8b, v5.8b
1978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v12.8h, v11.8b, v1.8b
1988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v18.8h, v12.8h , v22.8h
1998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v26.8h, v9.8b, v3.8b
2008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v14.8h, v20.8h , v24.8h
2018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v6.2s, v7.2s}, [x0], x2
2028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v31.8b, v16.8h, #5
2038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v16.2s, v17.2s}, [x7], x2 // Load for interpolation row 2
2048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v18.8h, v26.8h , v24.8h
2058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v30.16b, v16.16b , v30.16b // Interpolation to obtain qpel value
2068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v31.16b, v17.16b , v31.16b // Interpolation to obtain qpel value
2078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v12.8h, v0.8b, v2.8b      // temp1 = src[2_0] + src[3_0]
2088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v30.2s, v31.2s}, [x1], x3 //store row 2
2098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v16.8h, v10.8b, v4.8b     // temp2 = src[1_0] + src[4_0]
2108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v20.8h, v9.8b, v7.8b      // temp4 = src[0_8] + src[5_8]
2118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v30.8b, v14.8h, #5
2128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v26.8h, v5.8b, v11.8b     // temp5 = src[1_8] + src[4_8]
2138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v14.8h, v8.8b, v6.8b      // temp = src[0_0] + src[5_0]
2148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v31.8b, v18.8h, #5
2158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v18.2s, v19.2s}, [x7], x2 // Load for interpolation row 3
2168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v14.8h, v12.8h , v22.8h   // temp += temp1 * 20
2178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v30.16b, v18.16b , v30.16b // Interpolation to obtain qpel value
2188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v31.16b, v19.16b , v31.16b // Interpolation to obtain qpel value
2198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v18.8h, v1.8b, v3.8b      // temp3 = src[2_8] + src[3_8]
2208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v30.2s, v31.2s}, [x1], x3 //store row 3
2218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // 4 rows processed
2228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v20.8h, v18.8h , v22.8h   // temp4 += temp3 * 20
2238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v8.2s, v9.2s}, [x0], x2
2248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v12.8h, v2.8b, v4.8b
2258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v18.8h, v3.8b, v5.8b
2268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v14.8h, v16.8h , v24.8h   // temp -= temp2 * 5
2278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v28.8h, v9.8b, v11.8b
2288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v16.8h, v6.8b, v0.8b
2298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v28.8h, v18.8h , v22.8h   // temp4 += temp3 * 20
2308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v20.8h, v26.8h , v24.8h   // temp4 -= temp5 * 5
2318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v26.8h, v1.8b, v7.8b
2328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v18.8h, v5.8b, v7.8b
2338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v30.8b, v14.8h, #5        // dst[0_0] = CLIP_U8((temp +16) >> 5)
2348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v14.8h, v8.8b, v10.8b
2358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v31.8b, v20.8h, #5        // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
2368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v20.2s, v21.2s}, [x7], x2 // Load for interpolation row 4
2378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v10.2s, v11.2s}, [x0], x2
2388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v30.16b, v20.16b , v30.16b // Interpolation to obtain qpel value
2398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v31.16b, v21.16b , v31.16b // Interpolation to obtain qpel value
2408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v28.8h, v26.8h , v24.8h   // temp4 -= temp5 * 5
2418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v30.2s, v31.2s}, [x1], x3 //  store row 4
2428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v14.8h, v12.8h , v22.8h   // temp += temp1 * 20
2438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v20.8h, v11.8b, v1.8b
2448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v26.8h, v3.8b, v9.8b
2458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v20.8h, v18.8h , v22.8h   // temp4 += temp3 * 20
2468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v12.8h, v6.8b, v4.8b
2478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v18.8h, v7.8b, v9.8b
2488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v31.8b, v28.8h, #5        // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
2498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v14.8h, v16.8h , v24.8h   // temp -= temp2 * 5
2508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v16.8h, v8.8b, v2.8b
2518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v30.8b, v14.8h, #5        // dst[0_0] = CLIP_U8((temp +16) >> 5)
2528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v14.2s, v15.2s}, [x7], x2 // Load for interpolation row 5
2538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v20.8h, v26.8h , v24.8h   // temp4 -= temp5 * 5
2548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v30.16b, v14.16b , v30.16b // Interpolation to obtain qpel value
2558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v31.16b, v15.16b , v31.16b // Interpolation to obtain qpel value
2568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v14.8h, v10.8b, v0.8b
2578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v30.2s, v31.2s}, [x1], x3 //  store row 5
2588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v14.8h, v12.8h , v22.8h   // temp += temp1 * 20
2598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v0.2s, v1.2s}, [x0], x2
2608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v26.8h, v5.8b, v11.8b
2618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v12.8h, v8.8b, v6.8b
2628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v28.8h, v0.8b, v2.8b
2638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v31.8b, v20.8h, #5        // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
2648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v28.8h, v12.8h , v22.8h   // temp += temp1 * 20
2658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v20.8h, v1.8b, v3.8b
2668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v14.8h, v16.8h , v24.8h   // temp -= temp2 * 5
2678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v20.8h, v18.8h , v22.8h   // temp4 += temp3 * 20
2688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v16.8h, v10.8b, v4.8b
2698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v30.8b, v14.8h, #5        // dst[0_0] = CLIP_U8((temp +16) >> 5)
2708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v14.2s, v15.2s}, [x7], x2 // Load for interpolation row 6
2718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mov       v2.8b, v6.8b
2728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mov       v3.8b, v7.8b
2738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v30.16b, v14.16b , v30.16b // Interpolation to obtain qpel value
2748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v31.16b, v15.16b , v31.16b // Interpolation to obtain qpel value
2758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v28.8h, v16.8h , v24.8h   // temp -= temp2 * 5
2778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v30.2s, v31.2s}, [x1], x3 //  store row 6
2788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v30.8b, v28.8h, #5        // dst[0_0] = CLIP_U8((temp +16) >> 5)
2798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    swp       v0.8b, v4.8b              // swapping registers to put it in order
2808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    swp       v1.8b, v5.8b              // swapping registers to put it in order
2818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v20.8h, v26.8h , v24.8h   // temp4 -= temp5 * 5
2838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mov       v6.8b, v10.8b
2848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mov       v7.8b, v11.8b
2858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    subs      x12, x14, #1              // if height==16  - looping
2868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    swp       v4.8b, v8.8b
2878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    swp       v5.8b, v9.8b
2888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v31.8b, v20.8h, #5        // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
2898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v20.2s, v21.2s}, [x7], x2 // Load for interpolation row 7
2908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v30.16b, v20.16b , v30.16b // Interpolation to obtain qpel value
2918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v31.16b, v21.16b , v31.16b // Interpolation to obtain qpel value
2928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v30.2s, v31.2s}, [x1], x3 //  store row 7
2938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bne       end_func                  //if height =8  end function
2948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    add       x14, x14, #1              //for checking loop
2958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v10.2s, v11.2s}, [x0], x2
2968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v12.8h, v4.8b, v6.8b      // temp1 = src[2_0] + src[3_0]
2978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    b         loop_16                   // looping if height =16
2998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sloop_8_start:
3018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//// Processing row0 and row1
3028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v0.2s}, [x0], x2         // Vector load from src[0_0]
3048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v1.2s}, [x0], x2         // Vector load from src[1_0]
3058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v2.2s}, [x0], x2         // Vector load from src[2_0]
3068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v3.2s}, [x0], x2         // Vector load from src[3_0]
3078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    add       x14, x14, #1              //for checking loop
3088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v4.2s}, [x0], x2         // Vector load from src[4_0]
3098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v5.2s}, [x0], x2         // Vector load from src[5_0]
3108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sloop_8:
// 8-pixel-wide path: one pass produces four output rows, or eight when x4 != 4.
// Registers as used in this block (their set-up is outside this chunk):
//   x0 = source row pointer, post-incremented by x2 on every row load
//   x7 = pointer to the rows that are averaged with the filter output,
//        also post-incremented by x2
//   x1 = destination pointer, post-incremented by x3 on every store
//   x4 = block height (compared against 4 below)
//   x14 = pass counter, initialised outside this chunk
//   v22.8h / v24.8h = multipliers; 20 and 5 according to the comments below
//   v0-v5 = six consecutive source rows, src[0_0] .. src[5_0], on entry
// Per output row n (src[k_0] is numbered from the start of the pass):
//   temp = (src[n] + src[n+5]) + 20 * (src[n+2] + src[n+3]) - 5 * (src[n+1] + src[n+4])
//   out  = urhadd(CLIP_U8((temp + 16) >> 5), row loaded from x7)
// New rows are loaded into v6, v7, v0, v1, v2, v3, v4, v5 in that order, so
// after eight rows v0-v5 again hold six consecutive rows for a second pass.
3128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                        //re-entered from the branch at the bottom for a second pass
3138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v6.8h, v2.8b, v3.8b       // temp1 = src[2_0] + src[3_0]
3148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v8.8h, v0.8b, v5.8b       // temp = src[0_0] + src[5_0]
3158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v10.8h, v1.8b, v4.8b      // temp2 = src[1_0] + src[4_0]
3168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v8.8h, v6.8h , v22.8h     // temp += temp1 * 20
3178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v6.2s}, [x0], x2         // load src[6_0]; v6 is free once temp1 has been consumed
3188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v14.8h, v3.8b, v4.8b      // row 1: centre pair src[3_0] + src[4_0]
3198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v16.8h, v1.8b, v6.8b      // row 1: outer pair src[1_0] + src[6_0] (accumulator)
3208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v18.8h, v2.8b, v5.8b      // row 1: inner pair src[2_0] + src[5_0]
3218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v8.8h, v10.8h , v24.8h    // temp -= temp2 * 5
3228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v16.8h, v14.8h , v22.8h   // row 1: += centre * 20
3238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v7.2s}, [x0], x2         // load src[7_0]
3248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v20.8h, v4.8b, v5.8b      // row 2: centre pair src[4_0] + src[5_0]
3258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v12.8h, v2.8b, v7.8b      // row 2: outer pair src[2_0] + src[7_0] (accumulator)
3268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v10.8h, v3.8b, v6.8b      // row 2: inner pair src[3_0] + src[6_0]
3278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v16.8h, v18.8h , v24.8h   // row 1: -= inner * 5
3288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v26.8b, v8.8h, #5         // dst[0_0] = CLIP_U8( (temp + 16) >> 5)
3298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v12.8h, v20.8h , v22.8h   // row 2: += centre * 20
3308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v8.2s}, [x7], x2         //Load value for interpolation            (row0)
3318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v9.2s}, [x7], x2         //Load value for interpolation            (row1)
3328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v0.2s}, [x0], x2         // load src[8_0]; the old v0 (src[0_0]) is no longer needed
3338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v14.8h, v5.8b, v6.8b      // row 3: centre pair src[5_0] + src[6_0]
3348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v27.8b, v16.8h, #5        // row 1: rounding shift right by 5, saturate to u8
3358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v26.16b, v8.16b , v26.16b // row 0: rounding average (a + b + 1) >> 1 with the x7 row
3368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v27.16b, v9.16b , v27.16b // row 1: rounding average with the x7 row
3378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v20.8h, v3.8b, v0.8b      // row 3: outer pair src[3_0] + src[8_0] (accumulator)
3398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v12.8h, v10.8h , v24.8h   // row 2: -= inner * 5
3408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v26.2s}, [x1], x3        // Vector store to dst[0_0]
3418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v18.8h, v4.8b, v7.8b      // row 3: inner pair src[4_0] + src[7_0]
3428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v20.8h, v14.8h , v22.8h   // row 3: += centre * 20
3438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v27.2s}, [x1], x3        // Vector store to dst[1_0]
3448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v28.8b, v12.8h, #5        // row 2: rounding shift right by 5, saturate to u8
3458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v20.8h, v18.8h , v24.8h   // row 3: -= inner * 5
3468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v12.2s}, [x7], x2        //Load value for interpolation            (row2)
3478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v13.2s}, [x7], x2        //Load value for interpolation            (row3)
3488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v1.2s}, [x0], x2         // load src[9_0]; issued before the height check below
3498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v29.8b, v20.8h, #5        // row 3: rounding shift right by 5, saturate to u8
3508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    subs      x9, x4, #4                // sets Z when x4 == 4; x9 itself is not read in this block
3518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v28.16b, v12.16b , v28.16b // row 2: rounding average with the x7 row
3528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v29.16b, v13.16b , v29.16b // row 3: rounding average with the x7 row
3538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v28.2s}, [x1], x3        //store row 2
3548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v29.2s}, [x1], x3        //store row 3
3558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    beq       end_func                  // Branch if height==4 (flags from the subs above; NEON ops leave them intact)
3568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v14.8h, v6.8b, v7.8b      // row 4: centre pair src[6_0] + src[7_0]
3578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v16.8h, v0.8b, v5.8b      // row 4: inner pair src[5_0] + src[8_0]
3588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v18.8h, v1.8b, v4.8b      // row 4: outer pair src[4_0] + src[9_0] (accumulator)
3598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v18.8h, v14.8h , v22.8h   // row 4: += centre * 20
3608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v2.2s}, [x0], x2         // load src[10_0]
3618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v18.8h, v16.8h , v24.8h   // row 4: -= inner * 5
3628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v8.8h, v0.8b, v7.8b       // row 5: centre pair src[7_0] + src[8_0]
3638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v10.8h, v1.8b, v6.8b      // row 5: inner pair src[6_0] + src[9_0]
3648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v12.8h, v2.8b, v5.8b      // row 5: outer pair src[5_0] + src[10_0] (accumulator)
3658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v26.8b, v18.8h, #5        // row 4: rounding shift right by 5, saturate to u8
3668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v12.8h, v8.8h , v22.8h    // row 5: += centre * 20
3678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v18.2s}, [x7], x2        //Load value for interpolation            (row4)
3688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v19.2s}, [x7], x2        //Load value for interpolation            (row5)
3698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v3.2s}, [x0], x2         // load src[11_0]
3708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v12.8h, v10.8h , v24.8h   // row 5: -= inner * 5
3718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v27.8b, v12.8h, #5        // row 5: rounding shift right by 5, saturate to u8
3728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v26.16b, v18.16b , v26.16b // row 4: rounding average with the x7 row
3738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v27.16b, v19.16b , v27.16b // row 5: rounding average with the x7 row
3748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v26.2s}, [x1], x3        // store row 4
3768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v27.2s}, [x1], x3        // store row 5
3778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v14.8h, v0.8b, v1.8b      // row 6: centre pair src[8_0] + src[9_0]
3788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v16.8h, v2.8b, v7.8b      // row 6: inner pair src[7_0] + src[10_0]
3798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v18.8h, v3.8b, v6.8b      // row 6: outer pair src[6_0] + src[11_0] (accumulator)
3808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v18.8h, v14.8h , v22.8h   // row 6: += centre * 20
3818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v4.2s}, [x0], x2         // load src[12_0]
3828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v18.8h, v16.8h , v24.8h   // row 6: -= inner * 5
3838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v8.8h, v2.8b, v1.8b       // row 7: centre pair src[9_0] + src[10_0]
3848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v10.8h, v3.8b, v0.8b      // row 7: inner pair src[8_0] + src[11_0]
3858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v12.8h, v4.8b, v7.8b      // row 7: outer pair src[7_0] + src[12_0] (accumulator)
3868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v26.8b, v18.8h, #5        // row 6: rounding shift right by 5, saturate to u8
3878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v12.8h, v8.8h , v22.8h    // row 7: += centre * 20
3888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v18.2s}, [x7], x2        //Load value for interpolation            (row6)
3898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v19.2s}, [x7], x2        //Load value for interpolation            (row7)
3908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v5.2s}, [x0], x2         // load src[13_0]; v0-v5 now hold src[8_0]..src[13_0] for a possible second pass
3918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v12.8h, v10.8h , v24.8h   // row 7: -= inner * 5
3928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v27.8b, v12.8h, #5        // row 7: rounding shift right by 5, saturate to u8
3938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v26.16b, v18.16b , v26.16b // row 6: rounding average with the x7 row
3948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v27.16b, v19.16b , v27.16b // row 7: rounding average with the x7 row
3958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    subs      x12, x14, #1              // sets Z when x14 == 1; consumed by the beq below
3978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v26.2s}, [x1], x3        // store row 6
3988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v27.2s}, [x1], x3        // store row 7
3998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    add       x14, x14, #1              // bump the pass counter; add (without s) leaves the flags alone
4008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    beq       loop_8                    // second pass only if x14 was 1 above; original note: looping if height == 16 (x14 set-up not visible here - TODO confirm)
4018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    b         end_func                  // otherwise done: go to the shared epilogue
4038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sloop_4_start:
// 4-pixel-wide path: produces four output rows, or eight when x4 != 4, then
// falls through to end_func. There is no loop-back in this path.
// Registers as used in this block (their set-up is outside this chunk):
//   x0 = source row pointer, post-incremented by x2 on every row load
//   x7 = pointer to the rows that are averaged with the filter output,
//        also post-incremented by x2
//   x1 = destination pointer, post-incremented by x3 on every store
//   x4 = block height (compared against 4 below)
//   v22.8h / v24.8h = multipliers; 20 and 5 according to the comments below
// Per output row n (src[k_0] is numbered from the first load below):
//   temp = (src[n] + src[n+5]) + 20 * (src[n+2] + src[n+3]) - 5 * (src[n+1] + src[n+4])
//   out  = urhadd(CLIP_U8((temp + 16) >> 5), row loaded from x7)
// Only lane s[0] (4 bytes) of each result is stored, so every row is loaded
// with a 4-byte lane load; the remaining vector lanes carry don't-care data.
4068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//// Load the first six source rows, then process rows 0-3
4078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v0.s}[0], [x0], x2       // Vector load from src[0_0]
4108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v1.s}[0], [x0], x2       // Vector load from src[1_0]
4118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v2.s}[0], [x0], x2       // Vector load from src[2_0]
4128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v3.s}[0], [x0], x2       // Vector load from src[3_0]
4138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v4.s}[0], [x0], x2       // Vector load from src[4_0]
4148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v5.s}[0], [x0], x2       // Vector load from src[5_0]
4158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v6.8h, v2.8b, v3.8b       // temp1 = src[2_0] + src[3_0]
4178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v8.8h, v0.8b, v5.8b       // temp = src[0_0] + src[5_0]
4188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v10.8h, v1.8b, v4.8b      // temp2 = src[1_0] + src[4_0]
4198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v8.8h, v6.8h , v22.8h     // temp += temp1 * 20
4208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v6.s}[0], [x0], x2       // load src[6_0]: 4 bytes only, like every other row load in this path
4218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v14.8h, v3.8b, v4.8b      // row 1: centre pair src[3_0] + src[4_0]
4228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v16.8h, v1.8b, v6.8b      // row 1: outer pair src[1_0] + src[6_0] (accumulator)
4238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v18.8h, v2.8b, v5.8b      // row 1: inner pair src[2_0] + src[5_0]
4248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v8.8h, v10.8h , v24.8h    // temp -= temp2 * 5
4258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v7.s}[0], [x0], x2       // load src[7_0]
4268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v16.8h, v14.8h , v22.8h   // row 1: += centre * 20
4278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v20.8h, v4.8b, v5.8b      // row 2: centre pair src[4_0] + src[5_0]
4288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v12.8h, v2.8b, v7.8b      // row 2: outer pair src[2_0] + src[7_0] (accumulator)
4298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v10.8h, v3.8b, v6.8b      // row 2: inner pair src[3_0] + src[6_0]
4308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v16.8h, v18.8h , v24.8h   // row 1: -= inner * 5
4318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v26.8b, v8.8h, #5         // dst[0_0] = CLIP_U8( (temp + 16) >> 5)
4328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v8.s}[0], [x7], x2       //Load value for interpolation - row 0
4338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v9.s}[0], [x7], x2       //Load value for interpolation - row 1
4348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v12.8h, v20.8h , v22.8h   // row 2: += centre * 20
4358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v0.s}[0], [x0], x2       // load src[8_0]; the old v0 (src[0_0]) is no longer needed
4368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v14.8h, v5.8b, v6.8b      // row 3: centre pair src[5_0] + src[6_0]
4378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v27.8b, v16.8h, #5        // row 1: rounding shift right by 5, saturate to u8
4388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v20.8h, v3.8b, v0.8b      // row 3: outer pair src[3_0] + src[8_0] (accumulator)
4398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v26.16b, v26.16b , v8.16b // row 0: rounding average (a + b + 1) >> 1 with the x7 row
4408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v27.16b, v27.16b , v9.16b // row 1: rounding average with the x7 row
4418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v12.8h, v10.8h , v24.8h   // row 2: -= inner * 5
4438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v26.s}[0], [x1], x3      // Vector store to dst[0_0]
4448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v18.8h, v4.8b, v7.8b      // row 3: inner pair src[4_0] + src[7_0]
4458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v20.8h, v14.8h , v22.8h   // row 3: += centre * 20
4468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v27.s}[0], [x1], x3      // store row 1
4478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v28.8b, v12.8h, #5        // row 2: rounding shift right by 5, saturate to u8
4488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v12.s}[0], [x7], x2      //Load value for interpolation - row 2
4498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v13.s}[0], [x7], x2      //Load value for interpolation - row 3
4508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v20.8h, v18.8h , v24.8h   // row 3: -= inner * 5
4528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v1.s}[0], [x0], x2       // load src[9_0]; issued before the height check below
4538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v29.8b, v20.8h, #5        // row 3: rounding shift right by 5, saturate to u8
4548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v28.16b, v12.16b , v28.16b // row 2: rounding average with the x7 row
4558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v29.16b, v13.16b , v29.16b // row 3: rounding average with the x7 row
4568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v28.s}[0], [x1], x3      //store row 2
4588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v29.s}[0], [x1], x3      //store row 3
4598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    subs      x9, x4, #4                // sets Z when x4 == 4; x9 itself is not read in this block
4618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    beq       end_func                  // Branch if height==4
4628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v14.8h, v6.8b, v7.8b      // row 4: centre pair src[6_0] + src[7_0]
4658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v16.8h, v0.8b, v5.8b      // row 4: inner pair src[5_0] + src[8_0]
4668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v18.8h, v1.8b, v4.8b      // row 4: outer pair src[4_0] + src[9_0] (accumulator)
4678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v18.8h, v14.8h , v22.8h   // row 4: += centre * 20
4688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v2.s}[0], [x0], x2       // load src[10_0]
4698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v18.8h, v16.8h , v24.8h   // row 4: -= inner * 5
4708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v8.8h, v0.8b, v7.8b       // row 5: centre pair src[7_0] + src[8_0]
4718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v10.8h, v1.8b, v6.8b      // row 5: inner pair src[6_0] + src[9_0]
4728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v12.8h, v2.8b, v5.8b      // row 5: outer pair src[5_0] + src[10_0] (accumulator)
4738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v26.8b, v18.8h, #5        // row 4: rounding shift right by 5, saturate to u8
4748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v18.s}[0], [x7], x2      //Load value for interpolation - row 4
4758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v19.s}[0], [x7], x2      //Load value for interpolation - row 5
4768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v12.8h, v8.8h , v22.8h    // row 5: += centre * 20
4778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v3.s}[0], [x0], x2       // load src[11_0]
4788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v12.8h, v10.8h , v24.8h   // row 5: -= inner * 5
4798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v27.8b, v12.8h, #5        // row 5: rounding shift right by 5, saturate to u8
4808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v26.16b, v18.16b , v26.16b // row 4: rounding average with the x7 row
4818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v27.16b, v27.16b , v19.16b // row 5: rounding average with the x7 row
4828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v26.s}[0], [x1], x3      //store row 4
4848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v27.s}[0], [x1], x3      // store row 5
4858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v14.8h, v0.8b, v1.8b      // row 6: centre pair src[8_0] + src[9_0]
4868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v16.8h, v2.8b, v7.8b      // row 6: inner pair src[7_0] + src[10_0]
4878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v18.8h, v3.8b, v6.8b      // row 6: outer pair src[6_0] + src[11_0] (accumulator)
4888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v18.8h, v14.8h , v22.8h   // row 6: += centre * 20
4898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v4.s}[0], [x0], x2       // load src[12_0]
4908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v18.8h, v16.8h , v24.8h   // row 6: -= inner * 5
4918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v8.8h, v2.8b, v1.8b       // row 7: centre pair src[9_0] + src[10_0]
4928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v10.8h, v3.8b, v0.8b      // row 7: inner pair src[8_0] + src[11_0]
4938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    uaddl     v12.8h, v4.8b, v7.8b      // row 7: outer pair src[7_0] + src[12_0] (accumulator)
4948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v26.8b, v18.8h, #5        // row 6: rounding shift right by 5, saturate to u8
4958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v18.s}[0], [x7], x2      //Load value for interpolation - row 6
4968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v19.s}[0], [x7], x2      //Load value for interpolation - row 7
4978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mla       v12.8h, v8.8h , v22.8h    // row 7: += centre * 20
4988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v5.s}[0], [x0], x2       // load src[13_0]; NOTE(review): not read again in this path - looks removable, confirm nothing relies on x0 afterwards
4998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mls       v12.8h, v10.8h , v24.8h   // row 7: -= inner * 5
5008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sqrshrun  v27.8b, v12.8h, #5        // row 7: rounding shift right by 5, saturate to u8
5018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v26.16b, v18.16b , v26.16b // row 6: rounding average with the x7 row
5028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    urhadd    v27.16b, v19.16b , v27.16b // row 7: rounding average with the x7 row
5038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v26.s}[0], [x1], x3      // store row 6
5058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v27.s}[0], [x1], x3      // store row 7; execution then falls through to end_func
5068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Send_func:
// Shared exit for the paths above: reached by branch from loop_8 and
// loop_4_start, and by fall-through from the end of loop_4_start.
5098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Commented-out legacy form of the epilogue: LDMFD sp!,{x4-x12,PC} (restoring registers from stack)
5108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldp       x19, x20, [sp], #16       // reload x19/x20 and release their 16-byte stack slot
5118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pop_v_regs                          // macro defined outside this chunk; presumably restores the SIMD registers saved on entry - TODO confirm
5128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ret                                 // return to the caller
5138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
516