///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///**
//*******************************************************************************
//*
//* //brief
//*     inter prediction luma function for copy with 16-bit output
//*
//* //par description:
//*    copies the array of width 'wd' and height 'ht' from the location pointed
//*    by 'src' to the location pointed by 'dst'; each 8-bit source sample is
//*    widened to 16 bits and shifted left by 6 before it is stored
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[out] pi2_dst
//*  word16 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] dst_strd
//*  integer destination stride
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*  none
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_inter_pred_luma_copy_w16out (
//                            uword8 *pu1_src,
//                            word16 *pi2_dst,
//                            word32 src_strd,
//                            word32 dst_strd,
//                            word8 *pi1_coeff,
//                            word32 ht,
//                            word32 wd )

//**************variables vs registers*****************************************
//    x0  => *pu1_src  (advances as the source is consumed)
//    x1  => *pi2_dst  (advances as the destination is produced)
//    x2  => src_strd, sign-extended from w2; added directly to the byte pointer
//    x3  => dst_strd, sign-extended from w3; doubled before being added to the
//           destination pointer, i.e. it is counted in 16-bit elements
//    x4  => pi1_coeff on entry (never read); reused as the column counter
//    x7  => ht, sign-extended from w5 (8-wide path: number of 8x4 blocks left)
//    x12 => wd, sign-extended from w6
//
// Every sample is zero-extended from 8 to 16 bits and shifted left by 6.
// Rows are processed four at a time in both paths.

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_luma_copy_w16out_av8

.type ihevc_inter_pred_luma_copy_w16out_av8, %function

ihevc_inter_pred_luma_copy_w16out_av8:

    stp         x19, x20,[sp,#-16]!         //x20 is used as scratch below

    // The four word32 arguments arrive in the low halves of x2/x3/x5/x6.
    // AAPCS64 leaves the upper 32 bits unspecified, so extend them before
    // they take part in 64-bit address arithmetic or signed comparisons.
    sxtw        x2,w2                       //src_strd
    sxtw        x3,w3                       //dst_strd
    sxtw        x7,w5                       //ht
    sxtw        x12,w6                      //wd

    cmp         x7,#0                       //nothing to do when ht <= 0
    ble         end_loops
    cmp         x12,#0                      //nothing to do when wd <= 0
    ble         end_loops
    tst         x12,#7                      //wd a multiple of 8?
    beq         core_loop_wd_8

    // ---- 4-column path (wd not a multiple of 8) ----
    sub         x11,x12,#4                  //wd - 4, used to rewind to the row start
    lsl         x6, x3,#1                   //destination stride in bytes

outer_loop_wd_4:
    mov         x4,x12                      //column counter = wd

inner_loop_wd_4:
    // NOTE(review): each ld1 fetches 8 source bytes although only the first
    // four are stored (d[0] of the widened vector) and x0 advances by 4.
    ld1         {v0.8b},[x0]                //row 0
    add         x5,x0,x2                    //x5 -> row 1 of the source
    uxtl        v0.8h, v0.8b                //widen u8 -> u16
    add         x10,x1,x6                   //x10 -> row 1 of the destination
    subs        x4,x4,#4                    //wd - 4 (flags consumed by bgt below)
    // 64-bit lane shift: every halfword holds a value <= 255, so shifting
    // by 6 cannot carry bits into the neighbouring halfword.
    shl         v0.2d, v0.2d,#6
    ld1         {v22.8b},[x5],x2            //row 1
    add         x0,x0,#4                    //pu1_src += 4
    st1         {v0.d}[0],[x1]              //store 4 x 16-bit, row 0
    add         x1,x1,#8                    //pi2_dst += 4 elements
    uxtl        v22.8h, v22.8b
    ld1         {v24.8b},[x5],x2            //row 2
    shl         v22.2d, v22.2d,#6
    uxtl        v24.8h, v24.8b
    st1         {v22.d}[0],[x10],x6         //row 1
    shl         v24.2d, v24.2d,#6
    ld1         {v26.8b},[x5],x2            //row 3
    st1         {v24.d}[0],[x10],x6         //row 2
    uxtl        v26.8h, v26.8b
    shl         v26.2d, v26.2d,#6
    st1         {v26.d}[0],[x10],x6         //row 3
    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        x7,x7,#4                    //ht - 4
    sub         x0,x5,x11                   //source: start of the next 4 rows
    sub         x1,x10,x11,lsl #1           //destination: start of the next 4 rows
    bgt         outer_loop_wd_4

end_loops:
    ldp         x19, x20,[sp], #16
    ret


    // ---- 8-column path (wd a multiple of 8), software pipelined ----
core_loop_wd_8:
    lsl         x5, x3,#1                   //destination stride in bytes
    sub         x20,x12,x3, lsl #2
    neg         x11, x20                    //x11 = (dst_strd * 4) - wd
    sub         x20,x12,x2,lsl #2
    neg         x8, x20                     //x8  = (src_strd * 4) - wd
    lsr         x4, x12, #3                 //wd / 8
    mul         x7, x7, x4                  //x7 = ht * (wd / 8)
    mov         x4,x12                      //column counter = wd
    sub         x7,x7,#4                    //one 8x4 block is handled by the epilog

prolog:
    add         x6,x0,x2                    //x6 -> row 1 of the source
    add         x10,x1,x5                   //x10 -> row 1 of the destination
    ld1         {v1.8b},[x0],#8             //rows 0..3 of the first block
    ld1         {v3.8b},[x6],x2
    ld1         {v5.8b},[x6],x2
    ld1         {v7.8b},[x6],x2
    uxtl        v16.8h, v1.8b               //widen u8 -> u16
    uxtl        v18.8h, v3.8b
    uxtl        v20.8h, v5.8b
    uxtl        v22.8h, v7.8b
    subs        x4,x4,#8                    //wd decrements by 8; flags drive the csel's
    shl         v0.8h, v16.8h,#6
    shl         v2.8h, v18.8h,#6
    shl         v4.8h, v20.8h,#6
    shl         v6.8h, v22.8h,#6
    add         x20,x0,x8
    csel        x0, x20, x0,le              //row finished: jump down 4 source rows
    add         x6,x0,x2
    // NOTE(review): the next block is loaded before the remaining-block
    // count (x7) is tested, so for the last block this reads past the data
    // that is actually converted.
    ld1         {v1.8b},[x0],#8
    ld1         {v3.8b},[x6],x2
    ld1         {v5.8b},[x6],x2
    ld1         {v7.8b},[x6],x2

    st1         {v0.8h},[x1],#16            //store row 0 of the first block
    add         x20,x1,x11,lsl #1
    csel        x1, x20, x1,le              //row finished: jump down 4 destination rows
    csel        x4, x12, x4,le              //row finished: reload column counter

    subs        x7,x7,#4

    blt         epilog_end                  //only one block in total
    beq         epilog                      //exactly two blocks in total

outer_loop_wd_8:
    // Store rows 1..3 of the previous block while converting the block
    // that was loaded in the previous iteration and loading the next one.
    st1         {v2.8h},[x10],x5
    uxtl        v16.8h, v1.8b

    st1         {v4.8h},[x10],x5
    uxtl        v18.8h, v3.8b

    st1         {v6.8h},[x10],x5
    uxtl        v20.8h, v5.8b

    uxtl        v22.8h, v7.8b

    subs        x4,x4,#8                    //wd decrements by 8
    add         x20,x0,x8
    csel        x0, x20, x0,le

    add         x6,x0,x2

    ld1         {v1.8b},[x0],#8
    shl         v0.8h, v16.8h,#6

    ld1         {v3.8b},[x6],x2
    shl         v2.8h, v18.8h,#6

    ld1         {v5.8b},[x6],x2
    shl         v4.8h, v20.8h,#6

    ld1         {v7.8b},[x6],x2
    add         x10,x1,x5

    shl         v6.8h, v22.8h,#6

    st1         {v0.8h},[x1],#16

    add         x20,x1,x11,lsl #1
    csel        x1, x20, x1,le
    csel        x4, x12, x4,le

    subs        x7,x7,#4
    bgt         outer_loop_wd_8

epilog:
    // Drain the pipeline: finish the previous block, then convert and
    // store the block that is already in v1/v3/v5/v7.
    st1         {v2.8h},[x10],x5
    uxtl        v16.8h, v1.8b

    st1         {v4.8h},[x10],x5
    uxtl        v18.8h, v3.8b

    st1         {v6.8h},[x10],x5
    uxtl        v20.8h, v5.8b

    uxtl        v22.8h, v7.8b

    shl         v0.8h, v16.8h,#6
    shl         v2.8h, v18.8h,#6
    shl         v4.8h, v20.8h,#6
    add         x10,x1,x5
    shl         v6.8h, v22.8h,#6

    st1         {v0.8h},[x1],#16
epilog_end:
    st1         {v2.8h},[x10],x5            //rows 1..3 of the final block
    st1         {v4.8h},[x10],x5
    st1         {v6.8h},[x10],x5

    ldp         x19, x20,[sp], #16
    ret