///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//*
//* @brief
//*     inter-prediction luma function for copy
//*
//* @par description:
//*   copies the array of width 'wd' and height 'ht' from the location pointed
//*   to by 'pu1_src' to the location pointed to by 'pu1_dst'
//*
//* @param[in] pu1_src
//*  uword8 pointer to the source
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] pi1_coeff
//*  word8 pointer to the filter coefficients (not read by this copy routine)
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @returns
//*  none
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_luma_copy (
//                            uword8 *pu1_src,
//                            uword8 *pu1_dst,
//                            word32 src_strd,
//                            word32 dst_strd,
//                            word8 *pi1_coeff,
//                            word32 ht,
//                            word32 wd   )

//**************variables vs registers*****************************************
//    x0  => *pu1_src
//    x1  => *pu1_dst
//    x2  =>  src_strd
//    x3  =>  dst_strd
//    x4  => *pi1_coeff (never read)
//    x11 =>  ht (taken from argument register x5)
//    x16 =>  wd (taken from argument register x6)

// Everything below is emitted into the code section, starting on a
// 16-byte (2^4) boundary.
.text
.p2align 4

// Project macro file; presumably shared NEON/stack helper macros -- none of
// them are invoked in the routine visible in this file.
.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_luma_copy_av8

.type ihevc_inter_pred_luma_copy_av8, %function

//-----------------------------------------------------------------------------
// void ihevc_inter_pred_luma_copy_av8(uword8 *pu1_src,   // x0
//                                     uword8 *pu1_dst,   // x1
//                                     word32  src_strd,  // w2
//                                     word32  dst_strd,  // w3
//                                     word8  *pi1_coeff, // x4 (never read)
//                                     word32  ht,        // w5
//                                     word32  wd)        // w6
//
// Copies a wd x ht block of bytes from pu1_src to pu1_dst.
//
// Strategy: the block is walked in groups of four rows.  Within a group the
// columns are copied 16, 8 or 4 bytes at a time, chosen once from wd:
//     wd % 16 == 0 -> 16-byte column step
//     wd %  8 == 0 ->  8-byte column step
//     otherwise    ->  4-byte column step
//
// Consequences visible in the code:
//   * rows are always handled four at a time, so a ht that is not a multiple
//     of 4 is effectively rounded up to the next multiple of 4;
//   * the row-advance arithmetic (x15) only lands on the next row group when
//     wd is a multiple of the column step.
//     NOTE(review): callers presumably guarantee wd % 4 == 0 -- confirm.
//   * ht <= 0 or wd <= 0 returns without touching memory.
//
// Leaf routine, no stack usage.
// Clobbers: x0-x3, x8-x11, x15, x16, v0-v3, NZCV flags.
//-----------------------------------------------------------------------------
ihevc_inter_pred_luma_copy_av8:
    // The four word32 arguments arrive in W registers; AAPCS64 does not define
    // bits [63:32], so sign-extend before using them in 64-bit arithmetic.
    sxtw        x2,w2                       //src_strd
    sxtw        x3,w3                       //dst_strd
    sxtw        x11,w5                      //x11 = ht
    sxtw        x16,w6                      //x16 = wd

    cmp         x11,#0                      //nothing to do when ht <= 0
    ble         .Lend_loops
    cmp         x16,#0                      //nothing to do when wd <= 0
    ble         .Lend_loops

    tst         x16,#15                     //wd multiple of 16?
    beq         .Lcore_loop_wd_16
    tst         x16,#7                      //wd multiple of 8?
    beq         .Lcore_loop_wd_8

    //-------------------------------------------------------------------------
    // 4-byte column step
    //-------------------------------------------------------------------------
    sub         x15,x16,#4                  //x15 = wd - 4: offset of the last column

.Louter_loop_wd_4:
    mov         x8,x16                      //x8 = bytes left in this row group

.Linner_loop_wd_4:
    ld1         {v0.s}[0],[x0]              //row 0: load 4 bytes
    add         x9,x0,x2                    //x9  = src, row 1 of this column
    add         x10,x1,x3                   //x10 = dst, row 1 of this column
    st1         {v0.s}[0],[x1]              //row 0: store
    ld1         {v0.s}[0],[x9],x2           //row 1: load, step to row 2
    add         x0,x0,#4                    //pu1_src += 4 (next column)
    st1         {v0.s}[0],[x10],x3          //row 1: store, step to row 2
    ld1         {v0.s}[0],[x9],x2           //row 2: load, step to row 3
    subs        x8,x8,#4                    //wd -= 4; flags feed the bgt below
    st1         {v0.s}[0],[x10],x3          //row 2: store, step to row 3
    ld1         {v0.s}[0],[x9],x2           //row 3: load, x9 -> row 4
    add         x1,x1,#4                    //pu1_dst += 4 (next column)
    st1         {v0.s}[0],[x10],x3          //row 3: store, x10 -> row 4
    bgt         .Linner_loop_wd_4

    // x9/x10 now point at row 4 of the LAST column; subtracting the last
    // column's offset yields column 0 of the next row group.
    subs        x11,x11,#4                  //ht -= 4
    sub         x0,x9,x15                   //pu1_src = start of next row group
    sub         x1,x10,x15                  //pu1_dst = start of next row group
    bgt         .Louter_loop_wd_4

.Lend_loops:
    ret

    //-------------------------------------------------------------------------
    // 8-byte column step
    //-------------------------------------------------------------------------
.Lcore_loop_wd_8:
    sub         x15,x16,#8                  //x15 = wd - 8: offset of the last column

.Louter_loop_wd_8:
    mov         x8,x16                      //x8 = bytes left in this row group

.Linner_loop_wd_8:
    add         x9,x0,x2                    //x9  = src, row 1 of this column
    ld1         {v0.8b},[x0],#8             //row 0: load, pu1_src += 8
    add         x10,x1,x3                   //x10 = dst, row 1 of this column
    st1         {v0.8b},[x1],#8             //row 0: store, pu1_dst += 8
    ld1         {v1.8b},[x9],x2             //row 1
    st1         {v1.8b},[x10],x3
    subs        x8,x8,#8                    //wd -= 8; flags feed the bgt below
    ld1         {v2.8b},[x9],x2             //row 2
    st1         {v2.8b},[x10],x3
    ld1         {v3.8b},[x9],x2             //row 3, x9 -> row 4
    st1         {v3.8b},[x10],x3            //       x10 -> row 4
    bgt         .Linner_loop_wd_8

    subs        x11,x11,#4                  //ht -= 4
    sub         x0,x9,x15                   //pu1_src = start of next row group
    sub         x1,x10,x15                  //pu1_dst = start of next row group
    bgt         .Louter_loop_wd_8

    ret

    //-------------------------------------------------------------------------
    // 16-byte column step
    //-------------------------------------------------------------------------
.Lcore_loop_wd_16:
    sub         x15,x16,#16                 //x15 = wd - 16: offset of the last column

.Louter_loop_wd_16:
    mov         x8,x16                      //x8 = bytes left in this row group

.Linner_loop_wd_16:
    add         x9,x0,x2                    //x9  = src, row 1 of this column
    ld1         {v0.16b},[x0],#16           //row 0: load, pu1_src += 16
    add         x10,x1,x3                   //x10 = dst, row 1 of this column
    st1         {v0.16b},[x1],#16           //row 0: store, pu1_dst += 16
    ld1         {v1.16b},[x9],x2            //row 1
    st1         {v1.16b},[x10],x3
    subs        x8,x8,#16                   //wd -= 16; flags feed the bgt below
    ld1         {v2.16b},[x9],x2            //row 2
    st1         {v2.16b},[x10],x3
    ld1         {v3.16b},[x9],x2            //row 3, x9 -> row 4
    st1         {v3.16b},[x10],x3           //       x10 -> row 4
    bgt         .Linner_loop_wd_16

    subs        x11,x11,#4                  //ht -= 4
    sub         x0,x9,x15                   //pu1_src = start of next row group
    sub         x1,x10,x15                  //pu1_dst = start of next row group
    bgt         .Louter_loop_wd_16

    ret

.size ihevc_inter_pred_luma_copy_av8, .-ihevc_inter_pred_luma_copy_av8