18d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//****************************************************************************** 28d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 38d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* Copyright (C) 2015 The Android Open Source Project 48d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 58d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* Licensed under the Apache License, Version 2.0 (the "License"); 68d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* you may not use this file except in compliance with the License. 78d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* You may obtain a copy of the License at: 88d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 98d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* http://www.apache.org/licenses/LICENSE-2.0 108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* Unless required by applicable law or agreed to in writing, software 128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* distributed under the License is distributed on an "AS IS" BASIS, 138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* See the License for the specific language governing permissions and 158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* limitations under the License. 168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//***************************************************************************** 188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore 198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*/ 208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S///** 218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//****************************************************************************** 228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @file 238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* ih264_inter_pred_luma_vert_qpel_av8.s 248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @brief 268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* Contains function definitions for inter prediction vertical quarter pel interpolation. 278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @author 298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* Mohit 308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @par List of Functions: 328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* - ih264_inter_pred_luma_vert_qpel_av8() 348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @remarks 368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* None 378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//******************************************************************************* 398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*/ 408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S///* All the functions here are replicated from ih264_inter_pred_filters.c 428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S// 438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S///** 
458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S///** 468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//******************************************************************************* 478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @brief 498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* Quarter pel interprediction luma filter for vertical input 508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @par Description: 528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* Applies a 6 tap vertical filter. The output is clipped to 8 bits 538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* sec 8.4.2.2.1 titled "Luma sample interpolation process" 548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @param[in] pu1_src 568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* UWORD8 pointer to the source 578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @param[out] pu1_dst 598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* UWORD8 pointer to the destination 608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @param[in] src_strd 628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* integer source stride 638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @param[in] dst_strd 658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* integer destination stride 668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @param[in] ht 688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* integer height of the array 698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 
708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @param[in] wd 718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* integer width of the array 728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @param[in] pu1_tmp: temporary buffer: UNUSED in this function 748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @param[in] dydx: x and y reference offset for qpel calculations. 768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @returns 778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* @remarks 798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* None 808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//* 818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//******************************************************************************* 828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//*/ 838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//void ih264_inter_pred_luma_vert_qpel_av8 ( 858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S// UWORD8 *pu1_src, 868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S// UWORD8 *pu1_dst, 878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S// WORD32 src_strd, 888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S// WORD32 dst_strd, 898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S// WORD32 ht, 908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S// WORD32 wd, 918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S// UWORD8* pu1_tmp, 928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S// UWORD32 dydx) 938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//**************Variables Vs Registers***************************************** 958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S// x0 => 
*pu1_src 968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S// x1 => *pu1_dst 97d91f49ad65795b8d3223f1aba481bf3931b291e6Martin Storsjo// w2 => src_strd 98d91f49ad65795b8d3223f1aba481bf3931b291e6Martin Storsjo// w3 => dst_strd 99d91f49ad65795b8d3223f1aba481bf3931b291e6Martin Storsjo// w4 => ht 100d91f49ad65795b8d3223f1aba481bf3931b291e6Martin Storsjo// w5 => wd 101d91f49ad65795b8d3223f1aba481bf3931b291e6Martin Storsjo// w7 => dydx 1028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S.text 1048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S.p2align 2 1058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S.include "ih264_neon_macros.s" 1068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S .global ih264_inter_pred_luma_vert_qpel_av8 1108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sih264_inter_pred_luma_vert_qpel_av8: 1128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S push_v_regs 1148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S stp x19, x20, [sp, #-16]! 
115d91f49ad65795b8d3223f1aba481bf3931b291e6Martin Storsjo sxtw x2, w2 116d91f49ad65795b8d3223f1aba481bf3931b291e6Martin Storsjo sxtw x3, w3 117d91f49ad65795b8d3223f1aba481bf3931b291e6Martin Storsjo sxtw x4, w4 118d91f49ad65795b8d3223f1aba481bf3931b291e6Martin Storsjo sxtw x5, w5 1198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S and x7, x7, #12 //Finds y-offset 1228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lsr x7, x7, #3 //dydx>>3 1238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mul x7, x2, x7 1248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S add x7, x0, x7 //pu1_src + (y_offset>>1)*src_strd 1258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sub x14, x4, #16 1268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S movi v22.8h, #20 // Filter coeff 0x14 into Q11 1278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sub x0, x0, x2, lsl #1 //pu1_src-2*src_strd 1288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S subs x12, x5, #8 //if wd=8 branch to loop_8 1298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S movi v24.8h, #5 // Filter coeff 0x4 into Q12 1308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S beq loop_8_start 1318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S subs x12, x5, #4 //if wd=4 branch to loop_4 1338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S beq loop_4_start 1348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[0_0] 1378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[1_0] 1388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[2_0] 
1398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v6.2s, v7.2s}, [x0], x2 // Vector load from src[3_0] 1408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S add x14, x14, #1 //for checking loop 1418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v8.2s, v9.2s}, [x0], x2 // Vector load from src[4_0] 1428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v12.8h, v4.8b, v6.8b // temp1 = src[2_0] + src[3_0] 1438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0] 1448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sloop_16: //when wd=16 1468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v14.8h, v0.8b, v10.8b // temp = src[0_0] + src[5_0] 1488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v16.8h, v2.8b, v8.8b // temp2 = src[1_0] + src[4_0] 1498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v14.8h, v12.8h , v22.8h // temp += temp1 * 20 1508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v20.8h, v1.8b, v11.8b // temp4 = src[0_8] + src[5_8] 1518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v18.8h, v5.8b, v7.8b // temp3 = src[2_8] + src[3_8] 1528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v20.8h, v18.8h , v22.8h // temp4 += temp3 * 20 1538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v0.2s, v1.2s}, [x0], x2 1548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v26.8h, v3.8b, v9.8b // temp5 = src[1_8] + src[4_8] 1558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v12.8h, v6.8b, v8.8b 1568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v14.8h, v16.8h , v24.8h // temp -= temp2 * 5 1578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v16.8h, v2.8b, v0.8b 1588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v18.8h, v4.8b, v10.8b 
1598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v16.8h, v12.8h , v22.8h 1608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v20.8h, v26.8h , v24.8h // temp4 -= temp5 * 5 1618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v26.8h, v5.8b, v11.8b 1628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v12.8h, v7.8b, v9.8b 1638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v30.8b, v14.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5) 1648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v14.8h, v3.8b, v1.8b 1658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v2.2s, v3.2s}, [x0], x2 1668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v14.8h, v12.8h , v22.8h 1678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v16.8h, v18.8h , v24.8h 1688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5) 1698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v20.2s, v21.2s}, [x7], x2 // Load for interpolation row 0 1708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v30.16b, v20.16b , v30.16b // Interpolation to obtain qpel value 1718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v31.16b, v21.16b , v31.16b // Interpolation to obtain qpel value 1728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v18.8h, v4.8b, v2.8b 1738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v12.8h, v8.8b, v10.8b 1748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S st1 {v30.2s, v31.2s}, [x1], x3 // Vector store to dst[0_0] 1758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v18.8h, v12.8h , v22.8h 1768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v20.8h, v6.8b, v0.8b 1778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v14.8h, v26.8h , v24.8h 1788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v30.8b, v16.8h, #5 1798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl 
v12.8h, v9.8b, v11.8b 1808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v16.8h, v5.8b, v3.8b 1818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v26.8h, v7.8b, v1.8b 1828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v16.8h, v12.8h , v22.8h 1838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v18.8h, v20.8h , v24.8h 1848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v4.2s, v5.2s}, [x0], x2 1858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v31.8b, v14.8h, #5 1868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v14.2s, v15.2s}, [x7], x2 // Load for interpolation row 1 1878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v12.8h, v10.8b, v0.8b 1888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v30.16b, v14.16b , v30.16b // Interpolation to obtain qpel value 1898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v31.16b, v15.16b , v31.16b // Interpolation to obtain qpel value 1908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v14.8h, v6.8b, v4.8b 1918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v20.8h, v8.8b, v2.8b 1928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v14.8h, v12.8h , v22.8h 1938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v16.8h, v26.8h , v24.8h 1948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S st1 {v30.2s, v31.2s}, [x1], x3 //store row 1 1958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v30.8b, v18.8h, #5 1968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v18.8h, v7.8b, v5.8b 1978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v12.8h, v11.8b, v1.8b 1988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v18.8h, v12.8h , v22.8h 1998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v26.8h, v9.8b, v3.8b 2008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v14.8h, v20.8h , v24.8h 2018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v6.2s, v7.2s}, 
[x0], x2 2028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v31.8b, v16.8h, #5 2038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v16.2s, v17.2s}, [x7], x2 // Load for interpolation row 2 2048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v18.8h, v26.8h , v24.8h 2058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v30.16b, v16.16b , v30.16b // Interpolation to obtain qpel value 2068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v31.16b, v17.16b , v31.16b // Interpolation to obtain qpel value 2078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v12.8h, v0.8b, v2.8b // temp1 = src[2_0] + src[3_0] 2088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S st1 {v30.2s, v31.2s}, [x1], x3 //store row 2 2098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v16.8h, v10.8b, v4.8b // temp2 = src[1_0] + src[4_0] 2108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v20.8h, v9.8b, v7.8b // temp4 = src[0_8] + src[5_8] 2118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v30.8b, v14.8h, #5 2128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v26.8h, v5.8b, v11.8b // temp5 = src[1_8] + src[4_8] 2138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v14.8h, v8.8b, v6.8b // temp = src[0_0] + src[5_0] 2148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v31.8b, v18.8h, #5 2158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v18.2s, v19.2s}, [x7], x2 // Load for interpolation row 3 2168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v14.8h, v12.8h , v22.8h // temp += temp1 * 20 2178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v30.16b, v18.16b , v30.16b // Interpolation to obtain qpel value 2188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v31.16b, v19.16b , v31.16b // Interpolation to obtain qpel value 2198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v18.8h, v1.8b, v3.8b // temp3 = src[2_8] + src[3_8] 
2208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S st1 {v30.2s, v31.2s}, [x1], x3 //store row 3 2218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // 4 rows processed 2228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v20.8h, v18.8h , v22.8h // temp4 += temp3 * 20 2238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v8.2s, v9.2s}, [x0], x2 2248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v12.8h, v2.8b, v4.8b 2258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v18.8h, v3.8b, v5.8b 2268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v14.8h, v16.8h , v24.8h // temp -= temp2 * 5 2278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v28.8h, v9.8b, v11.8b 2288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v16.8h, v6.8b, v0.8b 2298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v28.8h, v18.8h , v22.8h // temp4 += temp3 * 20 2308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v20.8h, v26.8h , v24.8h // temp4 -= temp5 * 5 2318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v26.8h, v1.8b, v7.8b 2328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v18.8h, v5.8b, v7.8b 2338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v30.8b, v14.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5) 2348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v14.8h, v8.8b, v10.8b 2358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5) 2368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v20.2s, v21.2s}, [x7], x2 // Load for interpolation row 4 2378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v10.2s, v11.2s}, [x0], x2 2388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v30.16b, v20.16b , v30.16b // Interpolation to obtain qpel value 2398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v31.16b, v21.16b , v31.16b // Interpolation to obtain qpel value 
2408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v28.8h, v26.8h , v24.8h // temp4 -= temp5 * 5 2418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S st1 {v30.2s, v31.2s}, [x1], x3 // store row 4 2428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v14.8h, v12.8h , v22.8h // temp += temp1 * 20 2438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v20.8h, v11.8b, v1.8b 2448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v26.8h, v3.8b, v9.8b 2458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v20.8h, v18.8h , v22.8h // temp4 += temp3 * 20 2468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v12.8h, v6.8b, v4.8b 2478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v18.8h, v7.8b, v9.8b 2488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v31.8b, v28.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5) 2498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v14.8h, v16.8h , v24.8h // temp -= temp2 * 5 2508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v16.8h, v8.8b, v2.8b 2518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v30.8b, v14.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5) 2528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v14.2s, v15.2s}, [x7], x2 // Load for interpolation row 5 2538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v20.8h, v26.8h , v24.8h // temp4 -= temp5 * 5 2548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v30.16b, v14.16b , v30.16b // Interpolation to obtain qpel value 2558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v31.16b, v15.16b , v31.16b // Interpolation to obtain qpel value 2568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v14.8h, v10.8b, v0.8b 2578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S st1 {v30.2s, v31.2s}, [x1], x3 // store row 5 2588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v14.8h, v12.8h , v22.8h // temp += temp1 * 20 
2598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v0.2s, v1.2s}, [x0], x2 2608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v26.8h, v5.8b, v11.8b 2618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v12.8h, v8.8b, v6.8b 2628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v28.8h, v0.8b, v2.8b 2638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5) 2648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v28.8h, v12.8h , v22.8h // temp += temp1 * 20 2658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v20.8h, v1.8b, v3.8b 2668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v14.8h, v16.8h , v24.8h // temp -= temp2 * 5 2678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v20.8h, v18.8h , v22.8h // temp4 += temp3 * 20 2688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v16.8h, v10.8b, v4.8b 2698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v30.8b, v14.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5) 2708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v14.2s, v15.2s}, [x7], x2 // Load for interpolation row 6 2718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mov v2.8b, v6.8b 2728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mov v3.8b, v7.8b 2738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v30.16b, v14.16b , v30.16b // Interpolation to obtain qpel value 2748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v31.16b, v15.16b , v31.16b // Interpolation to obtain qpel value 2758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v28.8h, v16.8h , v24.8h // temp -= temp2 * 5 2778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S st1 {v30.2s, v31.2s}, [x1], x3 // store row 6 2788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v30.8b, v28.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5) 
2798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S swp v0.8b, v4.8b // swapping registers to put it in order 2808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S swp v1.8b, v5.8b // swapping registers to put it in order 2818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v20.8h, v26.8h , v24.8h // temp4 -= temp5 * 5 2838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mov v6.8b, v10.8b 2848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mov v7.8b, v11.8b 2858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S subs x12, x14, #1 // if height==16 - looping 2868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S swp v4.8b, v8.8b 2878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S swp v5.8b, v9.8b 2888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5) 2898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v20.2s, v21.2s}, [x7], x2 // Load for interpolation row 7 2908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v30.16b, v20.16b , v30.16b // Interpolation to obtain qpel value 2918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v31.16b, v21.16b , v31.16b // Interpolation to obtain qpel value 2928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S st1 {v30.2s, v31.2s}, [x1], x3 // store row 7 2938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S bne end_func //if height =8 end function 2948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S add x14, x14, #1 //for checking loop 2958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v10.2s, v11.2s}, [x0], x2 2968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v12.8h, v4.8b, v6.8b // temp1 = src[2_0] + src[3_0] 2978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S b loop_16 // looping if height =16 2998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
3008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sloop_8_start: 3018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//// Processing row0 and row1 3028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v0.2s}, [x0], x2 // Vector load from src[0_0] 3048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v1.2s}, [x0], x2 // Vector load from src[1_0] 3058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v2.2s}, [x0], x2 // Vector load from src[2_0] 3068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v3.2s}, [x0], x2 // Vector load from src[3_0] 3078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S add x14, x14, #1 //for checking loop 3088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v4.2s}, [x0], x2 // Vector load from src[4_0] 3098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v5.2s}, [x0], x2 // Vector load from src[5_0] 3108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sloop_8: 3128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //for checking loop 3138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v6.8h, v2.8b, v3.8b // temp1 = src[2_0] + src[3_0] 3148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v8.8h, v0.8b, v5.8b // temp = src[0_0] + src[5_0] 3158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v10.8h, v1.8b, v4.8b // temp2 = src[1_0] + src[4_0] 3168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v8.8h, v6.8h , v22.8h // temp += temp1 * 20 3178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v6.2s}, [x0], x2 3188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v14.8h, v3.8b, v4.8b 3198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v16.8h, v1.8b, v6.8b 3208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v18.8h, v2.8b, v5.8b 3218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v8.8h, v10.8h , v24.8h // 
temp -= temp2 * 5 3228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v16.8h, v14.8h , v22.8h 3238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v7.2s}, [x0], x2 3248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v20.8h, v4.8b, v5.8b 3258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v12.8h, v2.8b, v7.8b 3268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v10.8h, v3.8b, v6.8b 3278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v16.8h, v18.8h , v24.8h 3288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v26.8b, v8.8h, #5 // dst[0_0] = CLIP_U8( (temp + 16) >> 5) 3298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v12.8h, v20.8h , v22.8h 3308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v8.2s}, [x7], x2 //Load value for interpolation (row0) 3318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v9.2s}, [x7], x2 //Load value for interpolation (row1) 3328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v0.2s}, [x0], x2 3338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v14.8h, v5.8b, v6.8b 3348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v27.8b, v16.8h, #5 3358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v26.16b, v8.16b , v26.16b // Interpolation step for qpel calculation 3368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v27.16b, v9.16b , v27.16b // Interpolation step for qpel calculation 3378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v20.8h, v3.8b, v0.8b 3398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v12.8h, v10.8h , v24.8h 3408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S st1 {v26.2s}, [x1], x3 // Vector store to dst[0_0] 3418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v18.8h, v4.8b, v7.8b 3428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v20.8h, v14.8h , v22.8h 
3438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S st1 {v27.2s}, [x1], x3 // Vector store to dst[1_0] 3448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v28.8b, v12.8h, #5 3458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v20.8h, v18.8h , v24.8h 3468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v12.2s}, [x7], x2 //Load value for interpolation (row2) 3478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v13.2s}, [x7], x2 //Load value for interpolation (row3) 3488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v1.2s}, [x0], x2 3498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v29.8b, v20.8h, #5 3508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S subs x9, x4, #4 3518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v28.16b, v12.16b , v28.16b 3528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v29.16b, v13.16b , v29.16b 3538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S st1 {v28.2s}, [x1], x3 //store row 2 3548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S st1 {v29.2s}, [x1], x3 //store row 3 3558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S beq end_func // Branch if height==4 3568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v14.8h, v6.8b, v7.8b // temp1 = src[2_0] + src[3_0] 3578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v16.8h, v0.8b, v5.8b // temp = src[0_0] + src[5_0] 3588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v18.8h, v1.8b, v4.8b // temp2 = src[1_0] + src[4_0] 3598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v18.8h, v14.8h , v22.8h // temp += temp1 * 20 3608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v2.2s}, [x0], x2 3618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v18.8h, v16.8h , v24.8h // temp -= temp2 * 5 3628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v8.8h, v0.8b, v7.8b 3638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v10.8h, v1.8b, v6.8b 
3648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v12.8h, v2.8b, v5.8b 3658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v26.8b, v18.8h, #5 3668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v12.8h, v8.8h , v22.8h 3678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v18.2s}, [x7], x2 //Load value for interpolation (row4) 3688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v19.2s}, [x7], x2 //Load value for interpolation (row5) 3698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v3.2s}, [x0], x2 3708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v12.8h, v10.8h , v24.8h 3718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sqrshrun v27.8b, v12.8h, #5 3728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v26.16b, v18.16b , v26.16b // Interpolation step for qpel calculation 3738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S urhadd v27.16b, v19.16b , v27.16b // Interpolation step for qpel calculation 3748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S st1 {v26.2s}, [x1], x3 // store row 4 3768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S st1 {v27.2s}, [x1], x3 // store row 5 3778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v14.8h, v0.8b, v1.8b // temp1 = src[2_0] + src[3_0] 3788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v16.8h, v2.8b, v7.8b // temp = src[0_0] + src[5_0] 3798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v18.8h, v3.8b, v6.8b // temp2 = src[1_0] + src[4_0] 3808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mla v18.8h, v14.8h , v22.8h // temp += temp1 * 20 3818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ld1 {v4.2s}, [x0], x2 3828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mls v18.8h, v16.8h , v24.8h // temp -= temp2 * 5 3838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S uaddl v8.8h, v2.8b, v1.8b 
//// Tail of the 8-pixel-wide path: finish output rows 6 and 7.
//// This continues the instruction stream directly above:
////   v18 holds the 16-bit filter sum for row 6, v8 = v2 + v1 (centre pair
////   for row 7). x0 and x7 are read with post-increment x2, x1 is written
////   with post-increment x3. v22 / v24 are the multipliers the existing
////   comments identify as 20 and 5 (they are set up outside this chunk).

    uaddl     v10.8h, v3.8b, v0.8b      // row 7: pair that is scaled by v24 and subtracted
    uaddl     v12.8h, v4.8b, v7.8b      // row 7: outer pair, used as the accumulator
    sqrshrun  v26.8b, v18.8h, #5        // row 6: rounding shift right by 5, saturate to u8
    mla       v12.8h, v8.8h , v22.8h    // row 7: += centre pair * v22
    ld1       {v18.2s}, [x7], x2        // Load value for interpolation (row6)
    ld1       {v19.2s}, [x7], x2        // Load value for interpolation (row7)
    ld1       {v5.2s}, [x0], x2         // next source row
    mls       v12.8h, v10.8h , v24.8h   // row 7: -= v10 * v24
    sqrshrun  v27.8b, v12.8h, #5        // row 7: rounding shift right by 5, saturate to u8
    urhadd    v26.16b, v18.16b , v26.16b // Interpolation step for qpel calculation
    urhadd    v27.16b, v19.16b , v27.16b // Interpolation step for qpel calculation

    subs      x12, x14, #1              // Z is set when x14 == 1 (x14 is initialised outside this chunk)
    st1       {v26.2s}, [x1], x3        // store row 6
    st1       {v27.2s}, [x1], x3        // store row 7
    add       x14, x14, #1              // plain add: leaves the flags from subs intact
    beq       loop_8                    // looping if height ==16

    b         end_func


loop_4_start:
//// 4-pixel-wide path. Every row is moved through lane .s[0] (4 bytes) of a
//// vector register; the remaining lanes carry don't-care values that are
//// computed but never stored.
//// Source rows are referred to as r0, r1, ... in load order from x0.
//// Each output row n is
////     sat_u8( ((r[n] + r[n+5]) + v22*(r[n+2] + r[n+3]) - v24*(r[n+1] + r[n+4]) + 16) >> 5 )
//// followed by a rounding average (urhadd) with the matching row read from x7.

//// Processing row0 and row1

    ld1       {v0.s}[0], [x0], x2       // r0
    ld1       {v1.s}[0], [x0], x2       // r1
    ld1       {v2.s}[0], [x0], x2       // r2
    ld1       {v3.s}[0], [x0], x2       // r3
    ld1       {v4.s}[0], [x0], x2       // r4
    ld1       {v5.s}[0], [x0], x2       // r5

    uaddl     v6.8h, v2.8b, v3.8b       // row 0: centre pair  r2 + r3
    uaddl     v8.8h, v0.8b, v5.8b       // row 0: outer pair   r0 + r5 (accumulator)
    uaddl     v10.8h, v1.8b, v4.8b      // row 0: inner pair   r1 + r4
    mla       v8.8h, v6.8h , v22.8h     // row 0: += centre * v22
    // Fix: this was "ld1 {v6.2s}", an 8-byte load. Only lane .s[0] of every
    // result is stored, so the extra 4 bytes were read past the 4-pixel row
    // for nothing. Load 4 bytes like every other row in this path.
    ld1       {v6.s}[0], [x0], x2       // r6 (v6's previous value was consumed by the mla above)
    uaddl     v14.8h, v3.8b, v4.8b      // row 1: centre pair  r3 + r4
    uaddl     v16.8h, v1.8b, v6.8b      // row 1: outer pair   r1 + r6 (accumulator)
    uaddl     v18.8h, v2.8b, v5.8b      // row 1: inner pair   r2 + r5
    mls       v8.8h, v10.8h , v24.8h    // row 0: -= inner * v24
    ld1       {v7.s}[0], [x0], x2       // r7
    mla       v16.8h, v14.8h , v22.8h   // row 1: += centre * v22
    uaddl     v20.8h, v4.8b, v5.8b      // row 2: centre pair  r4 + r5
    uaddl     v12.8h, v2.8b, v7.8b      // row 2: outer pair   r2 + r7 (accumulator)
    uaddl     v10.8h, v3.8b, v6.8b      // row 2: inner pair   r3 + r6
    mls       v16.8h, v18.8h , v24.8h   // row 1: -= inner * v24
    sqrshrun  v26.8b, v8.8h, #5         // dst[0_0] = CLIP_U8( (temp + 16) >> 5)
    ld1       {v8.s}[0], [x7], x2       // Load value for interpolation - row 0
    ld1       {v9.s}[0], [x7], x2       // Load value for interpolation - row 1
    mla       v12.8h, v20.8h , v22.8h   // row 2: += centre * v22
    ld1       {v0.s}[0], [x0], x2       // r8 (v0 is free: r0 is no longer needed)
    uaddl     v14.8h, v5.8b, v6.8b      // row 3: centre pair  r5 + r6
    sqrshrun  v27.8b, v16.8h, #5        // row 1: rounding shift right by 5, saturate to u8
    uaddl     v20.8h, v3.8b, v0.8b      // row 3: outer pair   r3 + r8 (accumulator)
    urhadd    v26.16b, v26.16b , v8.16b // Interpolation step for qpel calculation
    urhadd    v27.16b, v27.16b , v9.16b // Interpolation step for qpel calculation

    mls       v12.8h, v10.8h , v24.8h   // row 2: -= inner * v24
    st1       {v26.s}[0], [x1], x3      // store row 0
    uaddl     v18.8h, v4.8b, v7.8b      // row 3: inner pair   r4 + r7
    mla       v20.8h, v14.8h , v22.8h   // row 3: += centre * v22
    st1       {v27.s}[0], [x1], x3      // store row 1
    sqrshrun  v28.8b, v12.8h, #5        // row 2: rounding shift right by 5, saturate to u8
    ld1       {v12.s}[0], [x7], x2      // Load value for interpolation - row 2
    ld1       {v13.s}[0], [x7], x2      // Load value for interpolation - row 3

    mls       v20.8h, v18.8h , v24.8h   // row 3: -= inner * v24
    ld1       {v1.s}[0], [x0], x2       // r9 (first consumed by row 4 below)
    sqrshrun  v29.8b, v20.8h, #5        // row 3: rounding shift right by 5, saturate to u8
    urhadd    v28.16b, v12.16b , v28.16b // Interpolation step for qpel calculation
    urhadd    v29.16b, v13.16b , v29.16b // Interpolation step for qpel calculation

    st1       {v28.s}[0], [x1], x3      // store row 2
    st1       {v29.s}[0], [x1], x3      // store row 3

    subs      x9, x4, #4                // only the flags matter; x9 is not read again in this path
    beq       end_func                  // Branch if height==4


//// Processing row4 to row7

    uaddl     v14.8h, v6.8b, v7.8b      // row 4: centre pair  r6 + r7
    uaddl     v16.8h, v0.8b, v5.8b      // row 4: inner pair   r8 + r5
    uaddl     v18.8h, v1.8b, v4.8b      // row 4: outer pair   r9 + r4 (accumulator)
    mla       v18.8h, v14.8h , v22.8h   // row 4: += centre * v22
    ld1       {v2.s}[0], [x0], x2       // r10
    mls       v18.8h, v16.8h , v24.8h   // row 4: -= inner * v24
    uaddl     v8.8h, v0.8b, v7.8b       // row 5: centre pair  r8 + r7
    uaddl     v10.8h, v1.8b, v6.8b      // row 5: inner pair   r9 + r6
    uaddl     v12.8h, v2.8b, v5.8b      // row 5: outer pair   r10 + r5 (accumulator)
    sqrshrun  v26.8b, v18.8h, #5        // row 4: rounding shift right by 5, saturate to u8
    ld1       {v18.s}[0], [x7], x2      // Load value for interpolation - row 4
    ld1       {v19.s}[0], [x7], x2      // Load value for interpolation - row 5
    mla       v12.8h, v8.8h , v22.8h    // row 5: += centre * v22
    ld1       {v3.s}[0], [x0], x2       // r11
    mls       v12.8h, v10.8h , v24.8h   // row 5: -= inner * v24
    sqrshrun  v27.8b, v12.8h, #5        // row 5: rounding shift right by 5, saturate to u8
    urhadd    v26.16b, v18.16b , v26.16b // Interpolation step for qpel calculation
    urhadd    v27.16b, v27.16b , v19.16b // Interpolation step for qpel calculation

    st1       {v26.s}[0], [x1], x3      // store row 4
    st1       {v27.s}[0], [x1], x3      // store row 5
    uaddl     v14.8h, v0.8b, v1.8b      // row 6: centre pair  r8 + r9
    uaddl     v16.8h, v2.8b, v7.8b      // row 6: inner pair   r10 + r7
    uaddl     v18.8h, v3.8b, v6.8b      // row 6: outer pair   r11 + r6 (accumulator)
    mla       v18.8h, v14.8h , v22.8h   // row 6: += centre * v22
    ld1       {v4.s}[0], [x0], x2       // r12
    mls       v18.8h, v16.8h , v24.8h   // row 6: -= inner * v24
    uaddl     v8.8h, v2.8b, v1.8b       // row 7: centre pair  r10 + r9
    uaddl     v10.8h, v3.8b, v0.8b      // row 7: inner pair   r11 + r8
    uaddl     v12.8h, v4.8b, v7.8b      // row 7: outer pair   r12 + r7 (accumulator)
    sqrshrun  v26.8b, v18.8h, #5        // row 6: rounding shift right by 5, saturate to u8
    ld1       {v18.s}[0], [x7], x2      // Load value for interpolation - row 6
    ld1       {v19.s}[0], [x7], x2      // Load value for interpolation - row 7
    mla       v12.8h, v8.8h , v22.8h    // row 7: += centre * v22
    // NOTE(review): v5 is not read again before ret on this path, so this
    // load looks unnecessary here - left in place; confirm before removing.
    ld1       {v5.s}[0], [x0], x2       // r13
    mls       v12.8h, v10.8h , v24.8h   // row 7: -= inner * v24
    sqrshrun  v27.8b, v12.8h, #5        // row 7: rounding shift right by 5, saturate to u8
    urhadd    v26.16b, v18.16b , v26.16b // Interpolation step for qpel calculation
    urhadd    v27.16b, v19.16b , v27.16b // Interpolation step for qpel calculation

    st1       {v26.s}[0], [x1], x3      // store row 6
    st1       {v27.s}[0], [x1], x3      // store row 7


end_func:
    // Common exit for both widths: restore x19/x20 and release their
    // 16-byte stack slot, then run the pop_v_regs macro.
    // NOTE(review): pop_v_regs is defined outside this chunk; presumably it
    // restores the callee-saved SIMD registers (this code writes v8-v14) -
    // confirm against the macro and the matching prologue.
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret

5148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 516