10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/*****************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*****************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/**
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @file
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  ihevc_weighted_pred_bi_default.s
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @brief
240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  contains function definitions for weighted prediction used in inter
250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* prediction
260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @author
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  parthiban v
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @par list of functions:
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  - ihevc_weighted_pred_bi_default()
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @remarks
340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  none
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*/
380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/**
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @brief
420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  does default bi-weighted prediction on the arrays pointed to by pi2_src1
430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* and pi2_src2 and stores the result at the location pointed to by pu1_dst.
440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* assumptions : the function is optimized considering the fact that width
450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* and height are multiples of 2.
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @par description:
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  dst = ( (src1 + lvl_shift1) +  (src2 + lvl_shift2) +  (1 << (shift - 1)) )
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* >> shift,  where shift = 15 - bitdepth
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] pi2_src1
520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  pointer to source 1
530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] pi2_src2
550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  pointer to source 2
560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[out] pu1_dst
580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  pointer to destination
590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] src_strd1
610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  source stride 1
620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] src_strd2
640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  source stride 2
650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] dst_strd
670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  destination stride
680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] lvl_shift1
700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  added before shift and offset
710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] lvl_shift2
730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  added before shift and offset
740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] ht
760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  height of the source
770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] wd
790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  width of the source
800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @returns
820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @remarks
840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  none
850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*/
880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@void ihevc_weighted_pred_bi_default(word16 *pi2_src1,
890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                                    word16 *pi2_src2,
900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                                    uword8 *pu1_dst,
910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                                    word32 src_strd1,
920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                                    word32 src_strd2,
930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                                    word32 dst_strd,
940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                                    word32 lvl_shift1,
950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                                    word32 lvl_shift2,
960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                                    word32 ht,
970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                                    word32 wd)
980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@**************variables vs registers*****************************************
1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r0 => *pi2_src1
1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r1 => *pi2_src2
1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r2 => *pu1_dst
1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r3 =>  src_strd1
1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r4 =>  src_strd2
1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r5 =>  dst_strd
1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r6 =>  lvl_shift1
1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r7 =>  lvl_shift2
1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r8 =>  ht
1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r9 =>  wd
1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.text
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.align 4
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.globl ihevc_weighted_pred_bi_default_a9q
1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.type ihevc_weighted_pred_bi_default_a9q, %function
1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarihevc_weighted_pred_bi_default_a9q:
1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    @ NOTE(review): d8/d9 and q6/q7 are written further down, but only core registers are saved here - confirm against the procedure call standard (d8-d15 are normally callee-saved)
1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    stmfd       sp!, {r4-r12, r14}          @push r4-r12 and r14 (10 words = 40 bytes); the stacked arguments are then read from [sp,#40] onwards
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         r4,[sp,#40]                 @load src_strd2
1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lsl         r3,r3,#1                    @r3 = 2*src_strd1: row pitch of source 1 in bytes (16-bit elements)
1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         r5,[sp,#44]                 @load dst_strd
1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         r6,[sp,#48]                 @load lvl_shift1
1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lsl         r4,r4,#1                    @r4 = 2*src_strd2: row pitch of source 2 in bytes (16-bit elements)
1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         r7,[sp,#52]                 @load lvl_shift2
1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         r8,[sp,#56]                 @load ht
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         r9,[sp,#60]                 @load wd
1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.16     q2,r6                       @q2 = lvl_shift1 replicated into all eight 16-bit lanes
1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.16     q3,r7                       @q3 = lvl_shift2 replicated into all eight 16-bit lanes
1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmov.i16    q0,#0x40                    @rounding term 1 << (shift - 1) = 0x40, i.e. shift = 7
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vadd.i16    q2,q3                       @q2 = lvl_shift1 + lvl_shift2
1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vadd.s16    q0,q0,q2                    @q0 = rounding term + both level shifts; the loops add q0/d0 once to each src1 + src2 sum
1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   vmvn.i32    q1,#0x6                         @vmovq_n_s32(tmp_shift)
1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lsl         r6,r9,#1                    @r6 = 2*wd: bytes of 16-bit source consumed per row
1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rsb         r7,r6,r3,lsl #2             @r7 = (4 rows of source 1 in bytes) - 2*wd: rewind the columns and step down 4 rows
1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rsb         r10,r6,r4,lsl #2            @r10 = (4 rows of source 2 in bytes) - 2*wd
1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    @asr            r6,#1
1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    @rsb            r6,r6,r5,lsl #2             @4*dst_strd - wd
1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmp         r8,#0                       @check ht == 0
1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         end_loops                   @nothing to do when ht is 0
1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarchroma_decision:
1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    orr         r14,r8,r9                   @r14 = ht | wd, used to recognise the small block sizes in one compare
1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmp         r14,#10                     @(ht | wd) == 10, e.g. the pair 8 and 2
1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         outer_loop_chroma_8x2       @handled 8 samples wide, 2 rows at a time
1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmp         r14,#6                      @(ht | wd) == 6, e.g. the pair 4 and 2
1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         outer_loop_chroma_4x2       @handled 4 samples wide, 2 rows at a time
1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarluma_decision:
1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmp         r9,#24                      @wd == 24 is tested first so it does not take the wd >= 16 branch below
1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         outer_loop_8                @24 is a multiple of 8 but not of 16: use the 8-wide loop
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmp         r9,#16
1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bge         outer_loop_16               @wd >= 16 (and not 24): branch to outer_loop_16
1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmp         r9,#12                      @wd == 12 is tested first so it does not take the wd >= 8 branch below
1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         outer_loop_4                @12 is a multiple of 4 but not of 8: use the 4-wide loop
1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmp         r9,#8
1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bge         outer_loop_8                @wd >= 8 (and not 12): 8-wide loop; smaller widths fall through to outer_loop_4
1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarouter_loop_4:
1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmp         r9,#0                       @check wd == 0
1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         end_loops                   @nothing to do when wd is 0
1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarcore_loop_4:
1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r11,r0,r3                   @r11 = pi2_src1 + one row (r3 holds the source 1 row pitch in bytes): row 1 pointer
1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r12,r1,r4                   @r12 = pi2_src2 + one row (r4 holds the source 2 row pitch in bytes): row 1 pointer
1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {d6},[r0]!                  @row 0: load 4 samples of source 1, post-increment pi2_src1
1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r14,r2,r5                   @r14 = pu1_dst + dst_strd: destination of row 1
1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {d7},[r1]!                  @row 0: load 4 samples of source 2, post-increment pi2_src2
1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {d8},[r11],r3               @row 1: load 4 samples of source 1, then step r11 down one row
1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   d18,d6,d7                   @row 0: src1 + src2 (saturating)
1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   d18,d18,d0                  @row 0: add rounding term and level shifts (d0)
1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {d9},[r12],r4               @row 1: load 4 samples of source 2, then step r12 down one row
1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   d20,d8,d9                   @row 1: src1 + src2 (saturating)
1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   d19,d20,d0                  @row 1: add rounding term and level shifts (d0)
1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d20,q9,#7                   @rows 0 and 1 (d18:d19): shift right by 7, narrow to unsigned 8-bit with saturation
1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {d22},[r11],r3              @row 2: load 4 samples of source 1
1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {d23},[r12],r4              @row 2: load 4 samples of source 2
1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   d30,d22,d23                 @row 2: src1 + src2 (saturating)
1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   d30,d30,d0                  @row 2: add rounding term and level shifts (d0)
1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {d24},[r11],r3              @row 3: load 4 samples of source 1
1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {d25},[r12],r4              @row 3: load 4 samples of source 2
1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   d18,d24,d25                 @row 3: src1 + src2 (saturating); d18 is reused as scratch
1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   d31,d18,d0                  @row 3: add rounding term and level shifts (d0)
1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {d20[0]},[r2]!              @store 4 bytes of row 0, post-increment pu1_dst
1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {d20[1]},[r14],r5           @store 4 bytes of row 1, then step r14 down one destination row
2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d30,q15,#7                  @rows 2 and 3 (d30:d31): shift right by 7, narrow to unsigned 8-bit with saturation
2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {d30[0]},[r14],r5           @store 4 bytes of row 2
2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r9,r9,#4                    @4 columns done: decrement wd by 4 and set flags
2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {d30[1]},[r14],r5           @store 4 bytes of row 3
2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bgt         core_loop_4                 @more columns left in this group of 4 rows
2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarend_core_loop_4:
2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r8,r8,#4                    @4 rows done: decrement ht by 4; these flags feed the bgt below
2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r0,r0,r7                    @pi2_src1 += r7: back to column 0 and down 4 rows (byte offset)
2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    asr         r9,r6,#1                    @restore wd from r6 (r6 = 2*wd)
2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r1,r1,r10                   @pi2_src2 += r10: back to column 0 and down 4 rows (byte offset)
2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rsb         r14,r9,r5,lsl #2            @r14 = 4*dst_strd - wd
2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r2,r2,r14
2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            @pu1_dst += 4*dst_strd - wd: back to column 0 and down 4 rows
2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bgt         core_loop_4                 @repeat while ht > 0 (none of the instructions since the subs on r8 updates the flags)
2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    b           end_loops
2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@ chroma-only path, reached when (ht | wd) == 6; each pass handles a tile 4 samples wide and 2 rows high
2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarouter_loop_chroma_4x2:
2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmp         r9,#0                       @check wd == 0
2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         end_loops                   @nothing to do when wd is 0
2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rsb         r7,r6,r3,lsl #1             @r7 = (2 rows of source 1 in bytes) - 2*wd; replaces the 4-row step set up at function entry
2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rsb         r10,r6,r4,lsl #1            @r10 = (2 rows of source 2 in bytes) - 2*wd
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarcore_loop_chroma_4x2:
2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r11,r0,r3                   @r11 = pi2_src1 + one row (r3 holds the source 1 row pitch in bytes): row 1 pointer
2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r12,r1,r4                   @r12 = pi2_src2 + one row (r4 holds the source 2 row pitch in bytes): row 1 pointer
2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {d6},[r0]!                  @row 0: load 4 samples of source 1, post-increment pi2_src1
2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r14,r2,r5                   @r14 = pu1_dst + dst_strd: destination of row 1
2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {d7},[r1]!                  @row 0: load 4 samples of source 2, post-increment pi2_src2
2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {d8},[r11],r3               @row 1: load 4 samples of source 1
2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   d18,d6,d7                   @row 0: src1 + src2 (saturating)
2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   d18,d18,d0                  @row 0: add rounding term and level shifts (d0)
2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {d9},[r12],r4               @row 1: load 4 samples of source 2
2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   d20,d8,d9                   @row 1: src1 + src2 (saturating)
2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   d19,d20,d0                  @row 1: add rounding term and level shifts (d0)
2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d20,q9,#7                   @rows 0 and 1 (d18:d19): shift right by 7, narrow to unsigned 8-bit with saturation
2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {d20[0]},[r2]!              @store 4 bytes of row 0, post-increment pu1_dst
2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {d20[1]},[r14],r5           @store 4 bytes of row 1
2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r9,r9,#4                    @4 columns done: decrement wd by 4 and set flags
2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bgt         core_loop_chroma_4x2        @more columns left in this pair of rows
2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarend_core_loop_chorma_4x2:
2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r8,r8,#2                    @2 rows done: decrement ht by 2; these flags feed the bgt below
2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r0,r0,r7                    @pi2_src1 += r7: back to column 0 and down 2 rows (byte offset)
2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    asr         r9,r6,#1                    @restore wd from r6 (r6 = 2*wd)
2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r1,r1,r10                   @pi2_src2 += r10: back to column 0 and down 2 rows (byte offset)
2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rsb         r14,r9,r5,lsl #1            @r14 = 2*dst_strd - wd
2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r2,r2,r14
2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            @pu1_dst += 2*dst_strd - wd: back to column 0 and down 2 rows
2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bgt         core_loop_chroma_4x2        @repeat while ht > 0 (none of the instructions since the subs on r8 updates the flags)
2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    b           end_loops
2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarouter_loop_8:
2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmp         r9,#0                       @check wd == 0
2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         end_loops                   @nothing to do when wd is 0
2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r11,r0,r3                   @r11 = pi2_src1 + one row (r3 holds the source 1 row pitch in bytes): row 1 pointer for the first pass
2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r12,r1,r4                   @r12 = pi2_src2 + one row (r4 holds the source 2 row pitch in bytes): row 1 pointer for the first pass
2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarcore_loop_8:
2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q12},[r0]!                 @row 0: load 8 samples of source 1, post-increment pi2_src1
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r14,r2,r5                   @r14 = pu1_dst + dst_strd: destination of row 1
2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q13},[r1]!                 @row 0: load 8 samples of source 2, post-increment pi2_src2
2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q12,q12,q13                 @row 0: src1 + src2 (saturating)
2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q14},[r11],r3              @row 1: load 8 samples of source 1, then step r11 down one row
2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q12,q12,q0                  @row 0: add rounding term and level shifts (q0)
2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q15},[r12],r4              @row 1: load 8 samples of source 2, then step r12 down one row
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q8},[r11],r3               @row 2: load 8 samples of source 1
2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q11,q14,q15                 @row 1: src1 + src2 (saturating)
2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q9},[r12],r4               @row 2: load 8 samples of source 2
2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q11,q11,q0                  @row 1: add rounding term and level shifts (q0)
2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d20,q12,#7                  @row 0: shift right by 7, narrow to unsigned 8-bit with saturation
2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q6},[r11],r3               @row 3: load 8 samples of source 1
2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q15,q8,q9                   @row 2: src1 + src2 (saturating)
2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d21,q11,#7                  @row 1: shift right by 7, narrow to unsigned 8-bit with saturation
2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q7},[r12],r4               @row 3: load 8 samples of source 2
2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q15,q15,q0                  @row 2: add rounding term and level shifts (q0)
2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {d20},[r2]!                 @store 8 bytes of row 0, post-increment pu1_dst
2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q4,q6,q7                    @row 3: src1 + src2 (saturating)
2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {d21},[r14],r5              @store 8 bytes of row 1, then step r14 down one destination row
2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q4,q4,q0                    @row 3: add rounding term and level shifts (q0)
2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d30,q15,#7                  @row 2: shift right by 7, narrow to unsigned 8-bit with saturation
2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d31,q4,#7                   @row 3: shift right by 7, narrow to unsigned 8-bit with saturation
2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r11,r0,r3                   @re-derive the source 1 row 1 pointer for the next 8 columns (r0 has already advanced)
2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r12,r1,r4                   @re-derive the source 2 row 1 pointer for the next 8 columns (r1 has already advanced)
2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {d30},[r14],r5              @store 8 bytes of row 2
2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r9,r9,#8                    @8 columns done: decrement wd by 8 and set flags
2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {d31},[r14],r5              @store 8 bytes of row 3
2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bgt         core_loop_8                 @more columns left in this group of 4 rows
2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarend_core_loop_8:
3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r8,r8,#4                    @4 rows done: decrement ht by 4; these flags feed the bgt below
3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r0,r0,r7                    @pi2_src1 += r7: back to column 0 and down 4 rows (byte offset)
3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    asr         r9,r6,#1                    @restore wd from r6 (r6 = 2*wd)
3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r1,r1,r10                   @pi2_src2 += r10: back to column 0 and down 4 rows (byte offset)
3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rsb         r14,r9,r5,lsl #2            @r14 = 4*dst_strd - wd
3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r2,r2,r14                   @pu1_dst += 4*dst_strd - wd: back to column 0 and down 4 rows
3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r11,r0,r3                   @row 1 pointer of source 1 for the next group of rows
3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r12,r1,r4                   @row 1 pointer of source 2 for the next group of rows
3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bgt         core_loop_8                 @repeat while ht > 0 (none of the instructions since the subs on r8 updates the flags)
3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    b           end_loops
3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@ Two-row path: each inner iteration reads 8 halfwords from two consecutive rows of both sources, forms sat(src1 + src2) + q0, shifts right by 7, narrows to unsigned 8 bit and stores 8 bytes per row.
3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@ Register roles as given by the existing comments: r0/r1 = pi2_src1/pi2_src2, r2 = pu1_dst, r3/r4 = byte offsets between source rows, r5 = dst_strd, r8 = ht, r9 = wd counter, q0 = tmp_lvl_shift_t.
3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@ this is only for chroma module with input 4x2
3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarouter_loop_chroma_8x2:
3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmp         r9,#0                       @check wd == 0
3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         end_loops                   @if equal, then end the function
3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + r3: second row of source 1 (r3 is used as a byte offset)
3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + r4: second row of source 2 (r4 is used as a byte offset)
3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rsb         r7,r6,r3,lsl #1             @r7 = 2*r3 - r6: added to pi2_src1 after each row pair
3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rsb         r10,r6,r4,lsl #1            @r10 = 2*r4 - r6: added to pi2_src2 after each row pair
3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarcore_loop_chroma_8x2:
3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q12},[r0]!                 @load 8 halfwords of pi2_src1 (first row); r0 advances by 16 bytes
3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r14,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q13},[r1]!                 @load 8 halfwords of pi2_src2 (first row); r1 advances by 16 bytes
3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q12,q12,q13                 @first row: saturating 16-bit src1 + src2
3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q14},[r11],r3              @load pi2_src1 second row; r11 += r3
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q12,q12,q0                  @first row: saturating add of q0 (tmp_lvl_shift_t per the original comments)
3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q15},[r12],r4              @load pi2_src2 second row; r12 += r4
3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q8},[r11],r3               @NOTE(review): q8 is never read in this loop and r11 is recomputed below, so this load looks dead; it reads one r3 stride past the second row - confirm it cannot run past the source buffer
3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q11,q14,q15                 @second row: saturating 16-bit src1 + src2
3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q11,q11,q0                  @second row: saturating add of q0
3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d20,q12,#7                  @first row: shift right by 7, saturate and narrow to unsigned 8 bit
3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d21,q11,#7                  @second row: shift right by 7, saturate and narrow to unsigned 8 bit
3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {d20},[r2]!                 @store 8 bytes of the first row; pu1_dst advances by 8
3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {d21},[r14],r5              @store 8 bytes of the second row through pu1_dst_tmp
3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r11,r0,r3                   @re-derive pi2_src_tmp1 from the advanced pi2_src1 for the next 8 columns
3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r12,r1,r4                   @re-derive pi2_src_tmp2 from the advanced pi2_src2 for the next 8 columns
3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                            @(this 2-row loop has no iii/iv iteration; nothing further is computed here)
3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r9,r9,#8                    @decrement wd by 8 and check for 0
3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bgt         core_loop_chroma_8x2        @if greater than 0 repeat the core loop again
3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarend_core_loop_chroma_8x2:
3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r8,r8,#2                    @decrement the ht by 2; these flags feed the bgt below
3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r0,r0,r7                    @pi2_src1 += 2*r3 - r6: move to the start of the next row pair
3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    asr         r9,r6,#1                    @reload the wd counter: r9 = r6 >> 1
3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r1,r1,r10                   @pi2_src2 += 2*r4 - r6: move to the start of the next row pair
3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rsb         r14,r9,r5,lsl #1            @2*dst_strd - wd
3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r2,r2,r14                   @pu1_dst += 2*dst_strd - wd: start of the next output row pair
3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + r3 for the new row pair
3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + r4 for the new row pair
3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bgt         core_loop_chroma_8x2        @repeat while ht > 0 (flags come from the subs on r8; the instructions in between do not set flags)
3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    b           end_loops                   @all rows done
3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@ 16-wide path: software-pipelined loop that produces one 16-column x 2-row block of output bytes per iteration.
3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@ prolog_16 loads and sums the first block and starts loading the next; core_loop_16 stores the finished block while summing and loading the following ones; epilog_16 finishes and stores the last block.
3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@ NOTE(review): pi2_src2 (r1) is stepped with r11, r12 and r7, which are all derived from r3 only, so this path appears to assume both source strides are equal - confirm.
3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarouter_loop_16:
3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmp         r9,#0                       @check wd == 0
3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         end_loops                   @if equal, then end the function
3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r11,r0,r3                   @NOTE(review): r11 is overwritten by the sub below before it is read, so this value is unused
3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r12,r1,r4                   @NOTE(review): r12 is overwritten by the sub below before it is read, so this value is unused
3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rsb         r7,r6,r3,lsl #1             @r7 = 2*r3 - r6: added to both source pointers when a row pair is finished
3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         r14,#16                     @16 = bytes covered by one q-register load/store
3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         r10,r14,r5                  @r10 = 16 - dst_strd: from the second output row back to the first row, 16 bytes further right
3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         r11,r3,r14                  @r11 = r3 - 16: after the two first-row loads, step to the same columns of the second row
3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         r12,r14,r3                  @r12 = 16 - r3: after the two second-row loads, step back to the first row, next block
3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rsb         r14,r9,r5,lsl #1            @2*dst_strd - wd
3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarprolog_16:
3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q1},[r0]!                  @pi2_src1 first row, halfwords 0-7; r0 += 16
3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q2},[r1]!                  @pi2_src2 first row, halfwords 0-7; r1 += 16
3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q5},[r0],r11               @pi2_src1 first row, halfwords 8-15; r0 += r3 - 16 (to the second row)
3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q6},[r1],r11               @pi2_src2 first row, halfwords 8-15; r1 += r3 - 16 (to the second row)
3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q3},[r0]!                  @pi2_src1 second row, halfwords 0-7
3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r9,r9,#16                   @wd counter -= 16; Z set when this block ends the row pair
3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q4},[r1]!                  @pi2_src2 second row, halfwords 0-7
3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subeq       r8,r8,#2                    @row pair ended: ht -= 2 (flags are left unchanged)
3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q11,q1,q2                   @first row, halfwords 0-7: saturating src1 + src2
3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q7},[r0],r12               @pi2_src1 second row, halfwords 8-15; r0 += 16 - r3 (back to the first row, next block)
3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q14,q5,q6                   @first row, halfwords 8-15: saturating src1 + src2
3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q8},[r1],r12               @pi2_src2 second row, halfwords 8-15; r1 += 16 - r3
3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    addeq       r0,r0,r7                    @row pair ended: pi2_src1 += 2*r3 - r6
3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    addeq       r1,r1,r7                    @row pair ended: pi2_src2 += 2*r3 - r6
4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q12,q3,q4                   @second row, halfwords 0-7: saturating src1 + src2
4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q1},[r0]!                  @preload pi2_src1 for the next block; NOTE(review): issued before the r8 == 0 test below, so it also executes when no block follows - confirm the read is safe
4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q13,q7,q8                   @second row, halfwords 8-15: saturating src1 + src2
4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@ if the input is chroma with 8x2 block size
4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmp         r8,#0                       @no rows left after the first block?
4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         epilog_16                   @then go straight to the final add/narrow/store
4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q2},[r1]!                  @load and increment the pi2_src2 (next block, first row, halfwords 0-7)
4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q11,q11,q0                  @first block: saturating add of q0, first row halfwords 0-7
4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q5},[r0],r11               @load pi2_src1 first row, halfwords 8-15; step to the second row
4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q14,q14,q0                  @first block: saturating add of q0, first row halfwords 8-15
4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q6},[r1],r11               @load pi2_src2 first row, halfwords 8-15; step to the second row
4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q12,q12,q0                  @first block: saturating add of q0, second row halfwords 0-7
4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q3},[r0]!                  @load and increment the pi2_src1 ii iteration
4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q15,q13,q0                  @second row halfwords 8-15 go to q15 because d26/d27 (= q13) receive the narrowed second row below
4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d20,q11,#7                  @first row bytes 0-7: shift right by 7, saturate to unsigned 8 bit
4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q4},[r1]!                  @load and increment the pi2_src2 ii iteration
4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d21,q14,#7                  @first row bytes 8-15; q10 = d20:d21 now holds the whole first output row
4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q7},[r0],r12               @load pi2_src1 second row, halfwords 8-15; step back to the first row
4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d26,q12,#7                  @second row bytes 0-7
4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q8},[r1],r12               @load pi2_src2 second row, halfwords 8-15; step back to the first row
4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d27,q15,#7                  @second row bytes 8-15; q13 = d26:d27 now holds the whole second output row
4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarcore_loop_16:
4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmp         r9,#0                       @did the previously counted block end its row pair?
4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q11,q1,q2                   @next block, first row halfwords 0-7: saturating src1 + src2
4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    asreq       r9,r6,#1                    @if so, reload the wd counter: r9 = r6 >> 1
4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {q10},[r2],r5               @store 16 bytes of the finished first row; pu1_dst += dst_strd
4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q14,q5,q6                   @next block, first row halfwords 8-15
4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {q13},[r2],r10              @store 16 bytes of the finished second row; pu1_dst += 16 - dst_strd
4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    addeq       r2,r2,r14                   @row pair ended: pu1_dst += 2*dst_strd - wd
4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q12,q3,q4                   @next block, second row halfwords 0-7
4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r9,r9,#16                   @count the block just summed; Z set when it ends the row pair
4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    addeq       r0,r0,r7                    @row pair ended: pi2_src1 += 2*r3 - r6
4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q13,q7,q8                   @next block, second row halfwords 8-15 (q13 was stored above, so it can be reused)
4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    addeq       r1,r1,r7                    @row pair ended: pi2_src2 += 2*r3 - r6
4400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subeqs      r8,r8,#2                    @row pair ended: decrement the ht by 2 and update the flags
4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         epilog_16                   @taken only when a row pair just ended and ht reached 0
4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q11,q11,q0                  @saturating add of q0, first row halfwords 0-7
4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q1},[r0]!                  @load and increment the pi2_src1
4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q14,q14,q0                  @saturating add of q0, first row halfwords 8-15
4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q2},[r1]!                  @load and increment the pi2_src2
4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q12,q12,q0                  @saturating add of q0, second row halfwords 0-7
4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q5},[r0],r11               @load pi2_src1 first row, halfwords 8-15; step to the second row
4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q15,q13,q0                  @saturating add of q0, second row halfwords 8-15 (kept in q15; q13 receives the narrowed row)
4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q6},[r1],r11               @load pi2_src2 first row, halfwords 8-15; step to the second row
4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d20,q11,#7                  @first row bytes 0-7: shift right by 7, saturate to unsigned 8 bit
4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q3},[r0]!                  @load and increment the pi2_src1 ii iteration
4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d21,q14,#7                  @first row bytes 8-15
4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q4},[r1]!                  @load and increment the pi2_src2 ii iteration
4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d26,q12,#7                  @second row bytes 0-7
4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q7},[r0],r12               @load pi2_src1 second row, halfwords 8-15; step back to the first row
4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d27,q15,#7                  @second row bytes 8-15
4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.s16    {q8},[r1],r12               @load pi2_src2 second row, halfwords 8-15; step back to the first row
4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    b           core_loop_16                @next block; the loop is left only through the beq epilog_16 above
4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarepilog_16:
4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q11,q11,q0                  @last block: saturating add of q0 to the four pending sums
4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q14,q14,q0
4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q12,q12,q0
4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqadd.s16   q15,q13,q0                  @result kept in q15 because d26/d27 (= q13) are written below
4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d20,q11,#7                  @shift right by 7, saturate and narrow to unsigned 8 bit: first row into q10
4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d21,q14,#7
4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d26,q12,#7                  @second row into q13
4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqshrun.s16 d27,q15,#7
4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {q10},[r2],r5               @store the last 16 bytes of the first row; pu1_dst += dst_strd
4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {q13},[r2]                  @store the last 16 bytes of the second row (no pointer update needed)
4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarend_core_loop_16:
4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarend_loops:
4900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldmfd       sp!,{r4-r12,r15}            @pop r4-r12 and load pc from the stack, which leaves the function (presumably pairs with a push of r4-r12 and lr at entry - outside this view)
4910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
495