///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//*
//* @brief
//*     inter prediction luma copy function with 16-bit output
//*
//* @par description:
//*   reads the array of width 'wd' and height 'ht' from the location pointed
//*   to by 'pu1_src', widens every 8-bit sample to 16 bits, shifts it left by
//*   6 and writes the result to the location pointed to by 'pi2_dst'
//*
//* @param[in] pu1_src
//*  uword8 pointer to the source
//*
//* @param[out] pi2_dst
//*  word16 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride (in bytes)
//*
//* @param[in] dst_strd
//*  integer destination stride (in 16-bit elements; the code doubles it to
//*  obtain the byte stride)
//*
//* @param[in] pi1_coeff
//*  word8 pointer to the filter coefficients (not read by this routine)
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @returns
//*  none
//*
//* @remarks
//*  rows are processed four at a time; columns are processed eight at a time
//*  when wd is a multiple of 8 and four at a time otherwise, so ht and wd are
//*  presumably expected to be multiples of 4 (not checked by the code)
//*
//*******************************************************************************
//*/

//void ihevc_inter_pred_luma_copy_w16out (
//                                uword8 *pu1_src,
//                                word16 *pi2_dst,
//                                word32 src_strd,
//                                word32 dst_strd,
//                                word8 *pi1_coeff,
//                                word32 ht,
//                                word32 wd   )

//**************variables vs registers*****************************************
//    x0 => *pu1_src
//    x1 => *pi2_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//    x7 =>  ht   (arrives in x5, copied to x7 on entry)
//    x12 => wd   (arrives in x6, copied to x12 on entry)

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_luma_copy_w16out_av8

.type ihevc_inter_pred_luma_copy_w16out_av8, %function

//------------------------------------------------------------------------------
// ihevc_inter_pred_luma_copy_w16out_av8
//
// Widens a wd x ht block of 8-bit samples to 16 bits, shifts every sample left
// by 6 and stores the result to a 16-bit destination.
//
// On entry:
//    x0 = pu1_src    source pointer (8-bit samples)
//    x1 = pi2_dst    destination pointer (16-bit samples)
//    w2 = src_strd   source stride in bytes
//    w3 = dst_strd   destination stride in 16-bit elements
//    x4 = pi1_coeff  not read by this routine
//    w5 = ht         number of rows
//    w6 = wd         number of columns
//
// Returns immediately when ht <= 0 or wd <= 0.
//
// Four rows are produced per pass. When wd is a multiple of 8 the block is
// walked as 8x4 tiles (core_loop_wd_8), otherwise as 4x4 tiles
// (outer_loop_wd_4). ht and wd are not checked for being multiples of 4;
// presumably the caller guarantees that.
//
// NOTE(review): both paths read more source bytes than they consume (the 4-wide
// path loads 8 bytes per row; the 8-wide path pre-loads one tile beyond the
// last one it stores). Confirm the source buffer is padded accordingly.
//
// Scratch registers: x4-x8, x10-x12, x20, v0-v7, v16, v18, v20, v22, v24, v26.
//------------------------------------------------------------------------------
ihevc_inter_pred_luma_copy_w16out_av8:

    stp         x19, x20,[sp,#-16]!         //x20 is used as scratch below; x19 is saved only to fill the pair

    // The strides, ht and wd are 32-bit ints in the C prototype. Widen them
    // before they take part in 64-bit address arithmetic, because the upper
    // halves of the argument registers are not guaranteed by the ABI.
    sxtw        x2,w2                       //src_strd
    sxtw        x3,w3                       //dst_strd
    sxtw        x7,w5                       //ht
    sxtw        x12,w6                      //wd

    cmp         x7,#0                       //nothing to do when ht <= 0
    ble         end_loops
    cmp         x12,#0                      //nothing to do when wd <= 0
    ble         end_loops
    tst         x12,#7                      //wd a multiple of 8 -> use the 8-wide path
    beq         core_loop_wd_8
    sub         x11,x12,#4                  //x11 = wd - 4, used to rewind to the first column
    lsl         x6, x3,#1                   //x6 = dst_strd in bytes
    adds        x6, x6,#0                   //leaves x6 unchanged; the flags it sets are overwritten by the next subs

outer_loop_wd_4:
    subs        x4,x12,#0                   //x4 = columns left in this group of 4 rows
    ble         end_inner_loop_wd_4

inner_loop_wd_4:
    ld1         {v0.8b},[x0]                //row 0: loads 8 bytes, only the first 4 are stored
    add         x5,x0,x2                    //x5 = source address of row 1
    uxtl        v0.8h, v0.8b                //widen u8 -> u16
    add         x10,x1,x6                   //x10 = destination address of row 1
    subs        x4,x4,#4                    //wd - 4 (flags consumed by the bgt at the end of the loop)
    shl         v0.2d, v0.2d,#6             //<< 6; samples are at most 14 bits after the shift, so 16-bit lanes do not spill
    ld1         {v22.8b},[x5],x2            //row 1
    add         x0,x0,#4                    //pu1_src += 4
    st1         {v0.d}[0],[x1]              //store 4 x 16-bit results of row 0
    add         x1,x1,#8                    //pi2_dst += 4 elements
    uxtl        v22.8h, v22.8b              //widen u8 -> u16
    ld1         {v24.8b},[x5],x2            //row 2
    shl         v22.2d, v22.2d,#6           //<< 6
    uxtl        v24.8h, v24.8b              //widen u8 -> u16
    st1         {v22.d}[0],[x10],x6         //store row 1, step to row 2
    shl         v24.2d, v24.2d,#6           //<< 6
    ld1         {v26.8b},[x5],x2            //row 3
    st1         {v24.d}[0],[x10],x6         //store row 2, step to row 3
    uxtl        v26.8h, v26.8b              //widen u8 -> u16
    shl         v26.2d, v26.2d,#6           //<< 6
    st1         {v26.d}[0],[x10],x6         //store row 3, x10 now points 4 rows below this column
    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        x7,x7,#4                    //ht - 4
    sub         x0,x5,x11                   //source: first column, 4 rows down
    sub         x1,x10,x11,lsl #1           //destination: first column, 4 rows down
    bgt         outer_loop_wd_4

end_loops:
    ldp         x19, x20,[sp], #16

    ret


core_loop_wd_8:
    lsl         x5, x3,#1                   //x5 = dst_strd in bytes
    adds        x5, x5,#0                   //leaves x5 unchanged; the flags it sets are not read before the next subs
    sub         x20,x12,x3, lsl #2          //x20 = wd - (dst_strd * 4)
    neg         x11, x20                    //x11 = (dst_strd * 4) - wd, in elements
    sub         x20,x12,x2,lsl #2           //x20 = wd - (src_strd * 4)
    neg         x8, x20                     //x8 = (src_strd * 4) - wd, in bytes
    lsr         x4, x12, #3                 //wd / 8
    mul         x7, x7, x4                  //x7 = ht * (wd / 8), i.e. 4 per 8x4 tile
    sub         x4,x12,#0                   //x4 = columns left in the current group of 4 rows
    sub         x7,x7,#4                    //one tile is handled by prolog/epilog

prolog:
    add         x6,x0,x2                    //x6 = source address of row 1
    add         x10,x1,x5                   //x10 = destination address of row 1
    ld1         {v1.8b},[x0],#8             //row 0 of the first tile
    ld1         {v3.8b},[x6],x2             //row 1
    ld1         {v5.8b},[x6],x2             //row 2
    ld1         {v7.8b},[x6],x2             //row 3
    uxtl        v16.8h, v1.8b               //widen u8 -> u16
    uxtl        v18.8h, v3.8b
    uxtl        v20.8h, v5.8b
    uxtl        v22.8h, v7.8b
    subs        x4,x4,#8                    //wd decrements by 8 (flags feed the three csel below)
    shl         v0.8h, v16.8h,#6            //<< 6
    shl         v2.8h, v18.8h,#6
    shl         v4.8h, v20.8h,#6
    shl         v6.8h, v22.8h,#6
    add         x20,x0,x8
    csel        x0, x20, x0,le              //row of tiles finished: source moves to first column, 4 rows down
    add         x6,x0,x2                    //x6 = source address of row 1 of the next tile
    ld1         {v1.8b},[x0],#8             //pre-load the next tile (rows 0-3)
    ld1         {v3.8b},[x6],x2
    ld1         {v5.8b},[x6],x2
    ld1         {v7.8b},[x6],x2

    st1         {v0.8h},[x1],#16            //store row 0 of the first tile; rows 1-3 stay pending in v2/v4/v6
    add         x20,x1,x11,lsl #1
    csel        x1, x20, x1,le              //row of tiles finished: destination moves to first column, 4 rows down
    sub         x20,x12,#0
    csel        x4, x20, x4,le              //row of tiles finished: reload the column counter with wd

    subs        x7,x7,#4                    //one tile consumed

    blt         epilog_end                  //only one tile in total: just flush its rows 1-3
    beq         epilog                      //exactly two tiles: the pre-loaded one is the last



outer_loop_wd_8:

    st1         {v2.8h},[x10],x5            //flush row 1 of the previous tile
    uxtl        v16.8h, v1.8b               //widen row 0 of the pre-loaded tile

    st1         {v4.8h},[x10],x5            //flush row 2 of the previous tile
    uxtl        v18.8h, v3.8b               //widen row 1

    st1         {v6.8h},[x10],x5            //flush row 3 of the previous tile
    uxtl        v20.8h, v5.8b               //widen row 2

    uxtl        v22.8h, v7.8b               //widen row 3

    subs        x4,x4,#8                    //wd decrements by 8 (flags feed the three csel below)
    add         x20,x0,x8
    csel        x0, x20, x0,le              //row of tiles finished: source moves to first column, 4 rows down

    add         x6,x0,x2                    //x6 = source address of row 1 of the next tile

    ld1         {v1.8b},[x0],#8             //pre-load row 0 of the next tile
    shl         v0.8h, v16.8h,#6            //<< 6

    ld1         {v3.8b},[x6],x2             //pre-load row 1
    shl         v2.8h, v18.8h,#6

    ld1         {v5.8b},[x6],x2             //pre-load row 2
    shl         v4.8h, v20.8h,#6

    ld1         {v7.8b},[x6],x2             //pre-load row 3
    add         x10,x1,x5                   //x10 = destination address of row 1 of the current tile

    shl         v6.8h, v22.8h,#6

    st1         {v0.8h},[x1],#16            //store row 0 of the current tile

    add         x20,x1,x11,lsl #1
    csel        x1, x20, x1,le              //row of tiles finished: destination moves to first column, 4 rows down
    sub         x20,x12,#0
    csel        x4, x20, x4,le              //row of tiles finished: reload the column counter with wd

    subs        x7,x7,#4                    //one tile consumed
    bgt         outer_loop_wd_8

epilog:
    st1         {v2.8h},[x10],x5            //flush row 1 of the previous tile
    uxtl        v16.8h, v1.8b               //widen row 0 of the last tile

    st1         {v4.8h},[x10],x5            //flush row 2 of the previous tile
    uxtl        v18.8h, v3.8b               //widen row 1

    st1         {v6.8h},[x10],x5            //flush row 3 of the previous tile
    uxtl        v20.8h, v5.8b               //widen row 2

    uxtl        v22.8h, v7.8b               //widen row 3

    shl         v0.8h, v16.8h,#6            //<< 6
    shl         v2.8h, v18.8h,#6
    shl         v4.8h, v20.8h,#6
    add         x10,x1,x5                   //x10 = destination address of row 1 of the last tile
    shl         v6.8h, v22.8h,#6

    st1         {v0.8h},[x1],#16            //store row 0 of the last tile
epilog_end:
    st1         {v2.8h},[x10],x5            //store rows 1-3 of the last tile
    st1         {v4.8h},[x10],x5
    st1         {v6.8h},[x10],x5


    ldp         x19, x20,[sp], #16

    ret