@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_inter_pred_chroma_copy_w16out_neon.s
@*
@* @brief
@*  contains the function definition for the chroma inter prediction copy with
@*  16-bit output. the function is coded in arm neon assembly using gnu
@*  assembler syntax
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*  - ihevc_inter_pred_chroma_copy_w16out_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*   chroma inter prediction filter for copy with 16-bit output
@*
@* @par description:
@*    reads 'ht' rows of 2 * 'wd' 8-bit samples from the location pointed to
@*    by 'pu1_src', zero-extends every sample to 16 bits, shifts it left by 6
@*    and stores the result to the location pointed to by 'pi2_dst'
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride, in bytes
@*
@* @param[in] dst_strd
@*  integer destination stride, in 16-bit elements
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients (not read by this function)
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array; each row holds 2 * wd samples
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src,
@                                           word16 *pi2_dst,
@                                           word32 src_strd,
@                                           word32 dst_strd,
@                                           word8 *pi1_coeff,
@                                           word32 ht,
@                                           word32 wd)
@**************variables vs registers*****************************************
@r0 => *pu1_src
@r1 => *pi2_dst
@r2 =>  src_strd
@r3 =>  dst_strd
@the remaining arguments are passed on the stack and are addressed relative
@to sp once the function has saved its registers:
@[sp, #coeff_offset] => *pi1_coeff (never loaded)
@[sp, #ht_offset]    =>  ht        (loaded into r7)
@[sp, #wd_offset]    =>  wd        (loaded into r12)

@ Offsets of the stack-passed arguments, measured from sp after the prologue
@ has pushed r4-r12,r14 (10 * 4 = 40 bytes) and d8-d15 (8 * 8 = 64 bytes).
.equ    coeff_offset,   104                 @ pi1_coeff (never read here)
.equ    ht_offset,      108                 @ ht
.equ    wd_offset,      112                 @ wd


.text
.align 4

.globl ihevc_inter_pred_chroma_copy_w16out_a9q

.type ihevc_inter_pred_chroma_copy_w16out_a9q, %function

@-------------------------------------------------------------------------------
@ ihevc_inter_pred_chroma_copy_w16out_a9q
@
@ For each of 'ht' rows, reads 2 * wd unsigned 8-bit samples from pu1_src,
@ zero-extends them to 16 bits, shifts them left by 6 and stores them to
@ pi2_dst.
@
@ In:    r0 = pu1_src, r1 = pi2_dst, r2 = src_strd (bytes),
@        r3 = dst_strd (16-bit elements),
@        [sp, #0] = pi1_coeff (unused), [sp, #4] = ht, [sp, #8] = wd
@        (stack offsets as seen on entry)
@ Out:   nothing is returned in a register
@ Saved: r4-r12, d8-d15 are pushed on entry and restored on exit
@
@ Register use after set-up:
@   r12 = 2 * wd (samples per row)       r7  = ht, later the block counter
@   r9  = ht rounded down to a multiple of 4
@   r8  = ht & 3 (4-wide path) / source rewind amount (8-wide path)
@
@ Path selection:
@   * wd <= 0 or ht <= 0               -> return without touching memory
@   * (ht & 2) and ht >= 4, or 2 * wd
@     not a multiple of 8              -> 4-samples-wide path, which handles
@                                         groups of 4 rows and then a 2-row tail
@   * otherwise                        -> 8-samples-wide path (software
@                                         pipelined 8x4 blocks, or a 2-row loop
@                                         when ht < 4)
@
@ Notes:
@   * A remainder of ht & 3 rows is always processed as two rows on the
@     4-wide path and in the 2-row loop of the 8-wide path. On the 8-wide
@     path with ht >= 4, a remainder of one row (ht & 3 == 1) is not written
@     at all. Heights are presumably always even - confirm against the callers.
@   * The 4-wide path loads 8 bytes per step but only uses the low 4, so it
@     reads up to 4 bytes beyond the last group of each row.
@   * When the 8-wide pipelined path has only a single 8x4 block to process,
@     its prologue still pre-loads 8 bytes from each of the following four
@     source rows; that data is never used.
@-------------------------------------------------------------------------------
ihevc_inter_pred_chroma_copy_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}          @save callee-saved core registers and lr
    vpush       {d8 - d15}                  @save callee-saved neon registers

    ldr         r12,[sp,#wd_offset]         @r12 = wd
    ldr         r7,[sp,#ht_offset]          @r7  = ht
    cmp         r12,#0
    ble         end_loops                   @empty width: nothing to read or write
    cmp         r7,#0
    ble         end_loops                   @empty height: nothing to read or write

    lsl         r12,r12,#1                  @r12 = 2*wd samples per row
    and         r8,r7,#3                    @r8  = rows left over after groups of 4
    sub         r9,r7,r8                    @r9  = ht rounded down to a multiple of 4

    @a 2-row tail after at least one group of 4 rows is only handled by the
    @4-wide path, so send every such height there
    tst         r7,#2
    cmpne       r9,#0
    bne         core_loop_wd_4
    tst         r12,#7                      @is 2*wd a multiple of 8 ?
    beq         core_loop_wd_8

@-------------------------- 4 samples per step --------------------------------
core_loop_wd_4:
    sub         r11,r12,#4                  @r11 = offset of the last group in a row
    lsl         r6,r3,#1                    @r6  = dst row pitch in bytes
    cmp         r9,#0
    beq         outer_loop_wd_4_ht_2        @fewer than 4 rows: tail only

outer_loop_wd_4:
    mov         r4,r12                      @r4 = samples left in this row (> 0)

inner_loop_wd_4:
    vld1.8      {d0},[r0]                   @row 0: load 8 bytes, low 4 are used
    add         r5,r0,r2                    @r5 -> same column, row 1
    vmovl.u8    q0,d0                       @widen to 16 bits
    add         r10,r1,r6                   @r10 -> same column, dst row 1
    subs        r4,r4,#4                    @flags consumed by the bgt below
    vshl.i16    q0,q0,#6                    @sample << 6
    vld1.8      {d22},[r5],r2               @row 1
    add         r0,r0,#4                    @pu1_src += 4
    vst1.64     {d0},[r1]                   @store 4 results of row 0
    add         r1,r1,#8                    @pi2_dst += 4 elements
    vmovl.u8    q11,d22
    vld1.8      {d24},[r5],r2               @row 2
    vshl.i16    q11,q11,#6
    vmovl.u8    q12,d24
    vst1.64     {d22},[r10],r6              @store 4 results of row 1
    vshl.i16    q12,q12,#6
    vld1.8      {d26},[r5],r2               @row 3
    vst1.64     {d24},[r10],r6              @store 4 results of row 2
    vmovl.u8    q13,d26
    vshl.i16    q13,q13,#6
    vst1.64     {d26},[r10],r6              @store 4 results of row 3
    bgt         inner_loop_wd_4

    subs        r9,r9,#4                    @4 more rows done
    sub         r0,r5,r11                   @r5 is 4 rows below the last group:
                                            @rewind to column 0 of the next group
    sub         r1,r10,r11,lsl #1           @same for dst (2 bytes per sample)
    bgt         outer_loop_wd_4
    cmp         r8,#0
    ble         end_loops                   @no leftover rows

    @fall through to the 2-row tail

outer_loop_wd_4_ht_2:
    mov         r4,r12                      @r4 = samples left in this row (> 0)

inner_loop_wd_4_ht_2:
    vld1.8      {d0},[r0]                   @row 0: load 8 bytes, low 4 are used
    add         r5,r0,r2                    @r5 -> same column, row 1
    vmovl.u8    q0,d0
    add         r10,r1,r6                   @r10 -> same column, dst row 1
    subs        r4,r4,#4                    @flags consumed by the bgt below
    vshl.i16    q0,q0,#6
    vld1.8      {d22},[r5]                  @row 1 (no further row is touched)
    add         r0,r0,#4                    @pu1_src += 4
    vst1.64     {d0},[r1]                   @store 4 results of row 0
    add         r1,r1,#8                    @pi2_dst += 4 elements
    vmovl.u8    q11,d22
    vshl.i16    q11,q11,#6
    vst1.64     {d22},[r10]                 @store 4 results of row 1
    bgt         inner_loop_wd_4_ht_2
    b           end_loops

@-------------------------- 8 samples per step --------------------------------
core_loop_wd_8:
    lsl         r5,r3,#1                    @r5  = dst row pitch in bytes
    rsb         r11,r12,r3, lsl #2          @r11 = 4*dst_strd - 2*wd (elements)
    rsb         r8,r12,r2,lsl #2            @r8  = 4*src_strd - 2*wd (bytes)
    mov         r4,r12, lsr #3              @8-sample groups per row
    mul         r7,r4,r9                    @r7  = 4 * number of 8x4 blocks
    mov         r4,r12                      @r4  = samples left in this row
    sub         r7,r7,#4                    @one block is retired by the epilog
    cmp         r9,#0
    beq         core_loop_wd_8_ht_2         @fewer than 4 rows

prolog:
    add         r6,r0,r2                    @r6 -> row 1 of the block
    add         r10,r1,r5                   @r10 -> dst row 1 of the block
    vld1.8      {d8},[r0]!                  @load rows 0..3 of the first block
    vld1.8      {d10},[r6],r2
    vld1.8      {d12},[r6],r2
    vld1.8      {d14},[r6],r2
    vmovl.u8    q8,d8                       @widen to 16 bits
    vmovl.u8    q9,d10
    vmovl.u8    q10,d12
    vmovl.u8    q11,d14
    subs        r4,r4,#8                    @flags drive the three *le below
    vshl.i16    q0,q8,#6                    @sample << 6
    vshl.i16    q1,q9,#6
    vshl.i16    q2,q10,#6
    vshl.i16    q3,q11,#6
    addle       r0,r0,r8                    @row finished: step src down 4 rows
    add         r6,r0,r2
    vld1.8      {d8},[r0]!                  @pre-load the next block
    vld1.8      {d10},[r6],r2
    vld1.8      {d12},[r6],r2
    vld1.8      {d14},[r6],r2

    vst1.16     {d0,d1},[r1]!               @store row 0 of the first block
    addle       r1,r1,r11,lsl #1            @row finished: step dst down 4 rows
    movle       r4,r12                      @row finished: reload the sample count

    subs        r7,r7,#4

    blt         epilog_end                  @single block: only its rows 1..3 remain
    beq         epilog                      @two blocks: skip the steady-state loop

outer_loop_wd_8:
    vst1.16     {d2,d3},[r10],r5            @store rows 1..3 of the previous block
    vmovl.u8    q8,d8                       @while widening the pre-loaded one

    vst1.16     {d4,d5},[r10],r5
    vmovl.u8    q9,d10

    vst1.16     {d6,d7},[r10],r5
    vmovl.u8    q10,d12

    vmovl.u8    q11,d14

    subs        r4,r4,#8                    @flags drive the three *le below
    addle       r0,r0,r8                    @row finished: step src down 4 rows

    add         r6,r0,r2

    vld1.8      {d8},[r0]!                  @pre-load the next block
    vshl.i16    q0,q8,#6

    vld1.8      {d10},[r6],r2
    vshl.i16    q1,q9,#6

    vld1.8      {d12},[r6],r2
    vshl.i16    q2,q10,#6

    vld1.8      {d14},[r6],r2
    add         r10,r1,r5                   @r10 -> dst row 1 of the current block

    vshl.i16    q3,q11,#6

    vst1.16     {d0,d1},[r1]!               @store row 0 of the current block

    addle       r1,r1,r11,lsl #1            @row finished: step dst down 4 rows
    movle       r4,r12                      @row finished: reload the sample count

    subs        r7,r7,#4
    bgt         outer_loop_wd_8

epilog:
    vst1.16     {d2,d3},[r10],r5            @store rows 1..3 of the previous block
    vmovl.u8    q8,d8                       @widen the last pre-loaded block

    vst1.16     {d4,d5},[r10],r5
    vmovl.u8    q9,d10

    vst1.16     {d6,d7},[r10],r5
    vmovl.u8    q10,d12

    vmovl.u8    q11,d14

    vshl.i16    q0,q8,#6
    vshl.i16    q1,q9,#6
    vshl.i16    q2,q10,#6
    add         r10,r1,r5                   @r10 -> dst row 1 of the last block
    vshl.i16    q3,q11,#6

    vst1.16     {d0,d1},[r1]!               @store row 0 of the last block
epilog_end:
    vst1.16     {d2,d3},[r10],r5            @store rows 1..3 of the last block
    vst1.16     {d4,d5},[r10],r5
    vst1.16     {d6,d7},[r10],r5
    b           end_loops

core_loop_wd_8_ht_2:
    add         r6,r0,r2                    @r6 -> row 1
    add         r10,r1,r5                   @r10 -> dst row 1
    vld1.8      {d8},[r0]!                  @row 0, 8 samples
    vld1.8      {d10},[r6]                  @row 1, 8 samples
    vmovl.u8    q8,d8
    vmovl.u8    q9,d10
    subs        r12,r12,#8                  @flags consumed by the bgt below
    vshl.i16    q0,q8,#6
    vshl.i16    q1,q9,#6
    vst1.16     {d0,d1},[r1]!               @store row 0
    vst1.16     {d2,d3},[r10]               @store row 1
    bgt         core_loop_wd_8_ht_2

    @fall through to the common exit

end_loops:
    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @restore registers and return