10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/*****************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*****************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/**
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @file
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  ihevc_weighted_pred_bi.s
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @brief
240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  contains function definitions for weighted prediction used in inter
250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* prediction
260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @author
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  parthiban v
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @par list of functions:
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  - ihevc_weighted_pred_bi()
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @remarks
340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  none
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*/
380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/**
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @brief
@*  does bi-weighted prediction on the arrays pointed to by pi2_src1 and
@* pi2_src2 and stores the result at the location pointed to by pu1_dst.
@* assumptions : this implementation processes the block in 4x4 tiles, so
@* width and height must be multiples of 4.
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @par description:
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  dst = ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +  (off0 +
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* off1 + 1) << (shift - 1) ) >> shift
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] pi2_src1
520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  pointer to source 1
530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] pi2_src2
550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  pointer to source 2
560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[out] pu1_dst
580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  pointer to destination
590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] src_strd1
610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  source stride 1
620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] src_strd2
640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  source stride 2
650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] dst_strd
670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  destination stride
680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] wgt0
700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  weight to be multiplied to source 1
710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] off0
730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  offset 0
740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] wgt1
760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  weight to be multiplied to source 2
770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] off1
790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  offset 1
800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] shift
820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  (14 bit depth) + log2_weight_denominator
830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] lvl_shift1
850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  added before shift and offset
860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] lvl_shift2
880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  added before shift and offset
890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] ht
910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  height of the source
920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] wd
940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  width of the source
950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @returns
970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @remarks
990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  none
1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*/
1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@void ihevc_weighted_pred_bi(word16 *pi2_src1,
1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            word16 *pi2_src2,
1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            uword8 *pu1_dst,
1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            word32 src_strd1,
1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            word32 src_strd2,
1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            word32 dst_strd,
1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            word32 wgt0,
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            word32 off0,
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            word32 wgt1,
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            word32 off1,
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            word32 shift,
1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            word32 lvl_shift1,
1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            word32 lvl_shift2,
1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            word32 ht,
1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            word32 wd)
1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@**************variables vs registers*****************************************
1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r0 => *pi2_src1
1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r1 => *pi2_src2
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r2 => *pu1_dst
1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r3 =>  src_strd1
1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r4 =>  src_strd2
1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r5 =>  dst_strd
1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r6 =>  wgt0
1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r7 =>  off0
1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r8 =>  wgt1
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r9 =>  off1
1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r10 =>  shift
1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r11 =>  lvl_shift1
1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r12 =>  lvl_shift2
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r14 =>  ht
1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r7  =>  wd
1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
.text
.align 4

@------------------------------------------------------------------------------
@ ihevc_weighted_pred_bi_a9q
@
@ Syntax / target : GNU as, ARM (A32) + NEON, AAPCS calling convention.
@
@ Computes, for every pixel of a wd x ht block (processed as 4x4 tiles, so wd
@ and ht must be multiples of 4):
@
@   dst = sat_u8( ( src1*wgt0 + src2*wgt1
@                   + lvl_shift1*wgt0 + lvl_shift2*wgt1
@                   + ((off0 + off1 + 1) << (shift - 1)) ) >> shift )
@
@ Inputs:
@   r0      = pi2_src1  (16-bit samples)
@   r1      = pi2_src2  (16-bit samples)
@   r2      = pu1_dst   (8-bit samples)
@   r3      = src_strd1 (in 16-bit elements)
@   stack   = src_strd2, dst_strd, wgt0, off0, wgt1, off1, shift,
@             lvl_shift1, lvl_shift2, ht, wd   (in that order)
@
@ Register roles inside the loops:
@   r3 / r4 = source strides converted to bytes
@   r5      = dst_strd (bytes)
@   r6 / r8 = row pointers into src1 / src2 for rows 2..4 of the tile
@   r10     = row pointer into dst for rows 2..4 of the tile
@   r7      = columns remaining in the current band of 4 rows
@   r9      = 2*wd (bytes per source row actually consumed)
@   r14     = rows remaining
@   d7[0], d7[1] = wgt0, wgt1 (scalars for vmull)
@   q14     = -shift in every lane (vshl by a negative count shifts right)
@   q15     = combined level-shift / rounding term in every lane
@
@ Clobbers: r0-r3, q0-q3, q8-q10, q14, q15, flags.
@ r4-r12 and d8-d15 (q4-q7) are used but saved/restored here, as AAPCS
@ requires for callee-saved registers.
@------------------------------------------------------------------------------

@ Byte offsets of the stack-passed arguments relative to sp AFTER the prologue:
@ 10 core registers (40 bytes) + d8-d15 (64 bytes) = 104 bytes pushed.
.equ    src_strd2_offset,       104
.equ    dst_strd_offset,        108
.equ    wgt0_offset,            112
.equ    off0_offset,            116
.equ    wgt1_offset,            120
.equ    off1_offset,            124
.equ    shift_offset,           128
.equ    lvl_shift1_offset,      132
.equ    lvl_shift2_offset,      136
.equ    ht_offset,              140
.equ    wd_offset,              144

.globl ihevc_weighted_pred_bi_a9q

.type ihevc_weighted_pred_bi_a9q, %function

ihevc_weighted_pred_bi_a9q:

    stmfd       sp!, {r4-r12, r14}          @save callee-saved core registers and lr
    vpush       {d8 - d15}                  @q4-q7 are used below and are callee-saved under AAPCS

    ldr         r6,[sp,#wgt0_offset]        @load wgt0
    ldr         r11,[sp,#lvl_shift1_offset] @load lvl_shift1
    ldr         r12,[sp,#lvl_shift2_offset] @load lvl_shift2
    vmov.s16    d7[0],r6                    @d7[0] = wgt0, scalar operand for vmull
    mul         r4,r11,r6                   @lvl_shift1 * wgt0
    ldr         r8,[sp,#wgt1_offset]        @load wgt1
    ldr         r7,[sp,#off0_offset]        @load off0
    vmov.s16    d7[1],r8                    @d7[1] = wgt1, scalar operand for vmull
    mla         r4,r12,r8,r4                @(lvl_shift1 * wgt0) + (lvl_shift2 * wgt1)
    ldr         r9,[sp,#off1_offset]        @load off1
    add         r5,r7,r9                    @off0 + off1
    ldr         r10,[sp,#shift_offset]      @load shift
    add         r5,r5,#1                    @off0 + off1 + 1
    sub         r14,r10,#1                  @shift - 1
    ldr         r7,[sp,#wd_offset]          @load wd
    lsl         r5,r5,r14                   @((off0 + off1 + 1) << (shift - 1))
    vdup.u32    q14,r10                     @every lane = shift (negated below)
    add         r4,r4,r5                    @tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1))
    vdup.u32    q15,r4                      @vmovq_n_s32(tmp_lvl_shift)
    vneg.s32    q14,q14                     @every lane = -shift, so vshl performs a right shift
    ldr         r4,[sp,#src_strd2_offset]   @load src_strd2
    lsl         r9,r7,#1                    @r9 = 2*wd = source bytes consumed per row
    ldr         r5,[sp,#dst_strd_offset]    @load dst_strd
    lsl         r3,r3,#1                    @src_strd1 in bytes (16-bit samples)
    ldr         r14,[sp,#ht_offset]         @load ht
    lsl         r4,r4,#1                    @src_strd2 in bytes (16-bit samples)

    cmp         r14,#0                      @check ht == 0
    beq         end_loops                   @if equal, then end the function

outer_loop:
    cmp         r7,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function

core_loop:
    add         r6,r0,r3                    @pi2_src_tmp1 = pi2_src1 + src_strd1 bytes (row 2 of the tile)
    add         r8,r1,r4                    @pi2_src_tmp2 = pi2_src2 + src_strd2 bytes (row 2 of the tile)
    vld1.s16    {d0},[r0]!                  @load 4 samples of pi2_src1 row 1, advance to next tile
    add         r10,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd (row 2 of the tile)
    vld1.s16    {d1},[r1]!                  @load 4 samples of pi2_src2 row 1, advance to next tile
    vmull.s16   q2,d0,d7[0]                 @vmull_n_s16(pi2_src1_val1, (int16_t) wgt0)
    vld1.s16    {d2},[r6],r3                @load pi2_src_tmp1 row 2, step to row 3
    vmull.s16   q4,d1,d7[1]                 @vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
    vld1.s16    {d3},[r8],r4                @load pi2_src_tmp2 row 2, step to row 3
    vadd.s32    q2,q2,q4                    @vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)

    vld1.s16    {d0},[r6],r3                @load pi2_src_tmp1 row 3, step to row 4
    vmull.s16   q5,d2,d7[0]                 @vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration

    vld1.s16    {d1},[r8],r4                @load pi2_src_tmp2 row 3, step to row 4
    vadd.s32    q2,q2,q15                   @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
    vmull.s16   q7,d0,d7[0]                 @vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration

    vld1.s16    {d2},[r6],r3                @load pi2_src_tmp1 row 4
    vmull.s16   q6,d3,d7[1]                 @vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
    vshl.s32    q2,q2,q14                   @vshlq_s32(i4_tmp1_t1, tmp_shift_t)

    vld1.s16    {d3},[r8],r4                @load pi2_src_tmp2 row 4
    vadd.s32    q5,q5,q6                    @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration

    vqmovun.s32 d4,q2                       @saturating narrow s32 -> u16
    vmull.s16   q8,d1,d7[1]                 @vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration

    vadd.s32    q5,q5,q15                   @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
    vmov.s32    d5,d4                       @duplicate so q2 holds defined u16 data for the next narrow
    vadd.s32    q7,q7,q8                    @vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) iii iteration

    vshl.s32    q5,q5,q14                   @vshlq_s32(i4_tmp2_t1, tmp_shift_t) ii iteration
    vmull.s16   q9,d2,d7[0]                 @vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration
    vqmovn.u16  d4,q2                       @saturating narrow u16 -> u8
    vadd.s32    q7,q7,q15                   @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration

    vqmovun.s32 d10,q5                      @saturating narrow s32 -> u16, ii iteration
    vmull.s16   q10,d3,d7[1]                @vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration

    vshl.s32    q7,q7,q14                   @vshlq_s32(i4_tmp1_t1, tmp_shift_t) iii iteration
    vmov.s32    d11,d10                     @duplicate for the u16 -> u8 narrow, ii iteration

    vadd.s32    q9,q9,q10                   @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
    vqmovun.s32 d14,q7                      @saturating narrow s32 -> u16, iii iteration

    vadd.s32    q9,q9,q15                   @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
    vst1.s32    {d4[0]},[r2]!               @store 4 pixels of row 1, advance pu1_dst to next tile

    vqmovn.u16  d10,q5                      @saturating narrow u16 -> u8, ii iteration
    vshl.s32    q9,q9,q14                   @vshlq_s32(i4_tmp2_t1, tmp_shift_t) iv iteration
    vst1.s32    {d10[0]},[r10],r5           @store 4 pixels of row 2, step to row 3


    vmov.s32    d15,d14                     @duplicate for the u16 -> u8 narrow, iii iteration
    vqmovn.u16  d14,q7                      @saturating narrow u16 -> u8, iii iteration
    vqmovun.s32 d18,q9                      @saturating narrow s32 -> u16, iv iteration
    vmov.s32    d19,d18                     @duplicate for the u16 -> u8 narrow, iv iteration
    vst1.s32    {d14[0]},[r10],r5           @store 4 pixels of row 3, step to row 4
    vqmovn.u16  d18,q9                      @saturating narrow u16 -> u8, iv iteration
    subs        r7,r7,#4                    @4 columns done; sets flags for the bgt below (NEON stores leave them intact)
    vst1.s32    {d18[0]},[r10],r5           @store 4 pixels of row 4

    bgt         core_loop                   @more columns left in this band of 4 rows

end_core_loop:
    rsb         r11,r9,r3,lsl #2            @4*(src_strd1 bytes) - 2*wd : rewind columns, advance 4 rows
    subs        r14,r14,#4                  @4 rows done; flags consumed by the final bgt
    rsb         r12,r9,r4,lsl #2            @4*(src_strd2 bytes) - 2*wd
    add         r0,r0,r11                   @pi2_src1 -> start of the next band of 4 rows
    asr         r7,r9,#1                    @restore the column counter to wd
    add         r1,r1,r12                   @pi2_src2 -> start of the next band of 4 rows
    rsb         r10,r7,r5,lsl #2            @4*dst_strd - wd
    add         r2,r2,r10                   @pu1_dst -> start of the next band of 4 rows
    bgt         core_loop                   @rows remain: wd is non-zero here, so the outer_loop check can be skipped

end_loops:
    vpop        {d8 - d15}                  @restore callee-saved NEON registers
    ldmfd       sp!,{r4-r12,r15}            @restore core registers and return (pc <- saved lr)
2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
270