10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/*****************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*****************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/**
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @file
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  ihevc_weighted_pred_uni.s
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @brief
240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  contains function definitions for weighted prediction used in inter
250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* prediction
260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @author
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  parthiban v
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @par list of functions:
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  - ihevc_weighted_pred_uni()
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @remarks
340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  none
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*/
380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/**
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @brief
@*  does uni-weighted prediction on the array pointed by pi2_src and stores
@* it at the location pointed by pu1_dst. assumptions : the function
@* processes 4x4 blocks, so width and height must be multiples of 4.
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @par description:
@*  dst = clip_u8( ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift )
@*                 + off0 )
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] pi2_src
520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  pointer to the source
530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[out] pu1_dst
550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  pointer to the destination
560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] src_strd
580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  source stride
590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] dst_strd
610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  destination stride
620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] wgt0
640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  weight to be multiplied to the source
650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
@* @param[in] off0
@*  offset to be added after rounding and shifting
@*
720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] shift
730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  (14 bit depth) + log2_weight_denominator
740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] lvl_shift
760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  added before shift and offset
770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] ht
790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  height of the source
800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] wd
820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  width of the source
830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @returns
850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @remarks
870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  none
880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*/
910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@void ihevc_weighted_pred_uni(word16 *pi2_src,
930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                             uword8 *pu1_dst,
940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                             word32 src_strd,
950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                             word32 dst_strd,
960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                             word32 wgt0,
970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                             word32 off0,
980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                             word32 shift,
990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                             word32 lvl_shift,
1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                             word32 ht,
1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                             word32 wd)
1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@**************variables vs registers*****************************************
1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r0 => *pi2_src
1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r1 => *pu1_dst
1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r2 =>  src_strd
1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r3 =>  dst_strd
1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r4 =>  wgt0
1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r5 =>  off0
1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r6 =>  shift
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r7 =>  lvl_shift
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r8 =>   ht
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r9  =>  wd
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
.text
.align 4

@ ihevc_weighted_pred_uni_a9q
@
@ Uni-directional weighted prediction, processed in 4x4 tiles:
@   dst = sat_u8((src * wgt0 + lvl_shift * wgt0 + (off0 << shift)
@                 + (1 << (shift - 1))) >> shift)
@
@ In:    r0 = pi2_src (16-bit samples), r1 = pu1_dst (8-bit samples),
@        r2 = src_strd (in samples),    r3 = dst_strd (in bytes),
@        stack = wgt0, off0, shift, lvl_shift, ht, wd (in that order)
@ Out:   none (results are written to pu1_dst)
@ Clobb: r0-r3, q0-q3, q14, q15, flags; r4-r12 and d8-d13 are saved/restored
@ Notes: ht and wd are consumed 4 at a time, so both are expected to be
@        multiples of 4. The function returns immediately if ht <= 0 or wd <= 0.
@
@ Stack layout after the prologue (byte offsets from sp):
@   #0  .. #47   saved d8-d13          (6 doublewords)
@   #48 .. #87   saved r4-r12, r14     (10 words)
@   #88 ..       stack-passed arguments
.equ wgt0_offset,        88
.equ off0_offset,        92
.equ shift_offset,       96
.equ lvl_shift_offset,   100
.equ ht_offset,          104
.equ wd_offset,          108

.globl ihevc_weighted_pred_uni_a9q

.type ihevc_weighted_pred_uni_a9q, %function

ihevc_weighted_pred_uni_a9q:

    stmfd       sp!, {r4-r12, r14}          @save the core registers used below and the return address
    vpush       {d8-d13}                    @q4-q6 are used in the core loop; d8-d15 are callee-saved under the AAPCS

    ldr         r4,[sp,#wgt0_offset]        @r4 = wgt0
    ldr         r7,[sp,#lvl_shift_offset]   @r7 = lvl_shift
    mov         r11,#1
    ldr         r5,[sp,#off0_offset]        @r5 = off0
    mul         r10,r7,r4                   @r10 = lvl_shift * wgt0
    ldr         r6,[sp,#shift_offset]       @r6 = shift
    ldr         r8,[sp,#ht_offset]          @r8 = ht (rows remaining)
    add         r10,r10,r5,lsl r6           @r10 += off0 << shift (the offset is folded in before the shift)
    ldr         r9,[sp,#wd_offset]          @r9 = wd (columns remaining in the current strip)
    sub         r12,r6,#1                   @r12 = shift - 1
    vmov.s16    d0[0],r4                    @d0[0] = (int16) wgt0, scalar operand for vmull
    lsl         r2,r2,#1                    @r2 = src_strd in bytes (source samples are 16 bit)
    vdup.u32    q14,r6                      @q14 = shift in every lane
    add         r10,r10,r11,lsl r12         @r10 += 1 << (shift - 1); for shift == 0 the register-specified LSL amount is 255, which adds 0
    vdup.s32    q15,r10                     @q15 = combined bias (lvl_shift*wgt0 + (off0 << shift) + rounding)
    vneg.s32    q14,q14                     @q14 = -shift, so vshl.s32 below performs an arithmetic right shift
    lsl         r4,r9,#1                    @r4 = 2*wd = bytes of source consumed per row (wgt0 now lives in d0[0])

    cmp         r8,#0                       @nothing to do when ht <= 0
    ble         end_loops
    cmp         r9,#0                       @nothing to do when wd <= 0
    ble         end_loops

core_loop:                                  @one 4x4 tile per pass: rows i..iv, 4 samples each
    add         r5,r0,r2                    @r5 = source address of row ii (same column)
    add         r6,r1,r3                    @r6 = destination address of row ii (same column)
    vld1.s16    {d1},[r0]!                  @row i: load 4 samples, pi2_src += 8 bytes
    vld1.s16    {d2},[r5],r2                @row ii: load 4 samples, step to row iii
    vmull.s16   q2,d1,d0[0]                 @row i: src * wgt0 (32-bit products)

    vadd.i32    q2,q2,q15                   @row i: add the combined bias
    vld1.s16    {d8},[r5],r2                @row iii: load 4 samples, step to row iv

    vmull.s16   q3,d2,d0[0]                 @row ii: src * wgt0
    vld1.s16    {d9},[r5],r2                @row iv: load 4 samples

    vshl.s32    q2,q2,q14                   @row i: >> shift
    vadd.i32    q3,q3,q15                   @row ii: add the combined bias

    vmull.s16   q5,d8,d0[0]                 @row iii: src * wgt0
    vqmovun.s32 d4,q2                       @row i: saturate to unsigned 16 bit

    vadd.i32    q5,q5,q15                   @row iii: add the combined bias
    vmov.s32    d5,d4                       @row i: duplicate so q2 is fully defined for the narrowing below

    vshl.s32    q3,q3,q14                   @row ii: >> shift

    vmull.s16   q6,d9,d0[0]                 @row iv: src * wgt0
    vqmovn.u16  d4,q2                       @row i: saturate to unsigned 8 bit

    vshl.s32    q5,q5,q14                   @row iii: >> shift
    vqmovun.s32 d6,q3                       @row ii: saturate to unsigned 16 bit

    vadd.i32    q6,q6,q15                   @row iv: add the combined bias
    vmov.s32    d7,d6                       @row ii: duplicate so q3 is fully defined

    vqmovun.s32 d10,q5                      @row iii: saturate to unsigned 16 bit

    vshl.s32    q6,q6,q14                   @row iv: >> shift
    vst1.32     {d4[0]},[r1]!               @row i: store 4 bytes, pu1_dst += 4
    vmov.s32    d11,d10                     @row iii: duplicate so q5 is fully defined

    vqmovn.u16  d6,q3                       @row ii: saturate to unsigned 8 bit
    vst1.32     {d6[0]},[r6],r3             @row ii: store 4 bytes, step to row iii

    vqmovn.u16  d10,q5                      @row iii: saturate to unsigned 8 bit
    vqmovun.s32 d12,q6                      @row iv: saturate to unsigned 16 bit

    vmov.s32    d13,d12                     @row iv: duplicate so q6 is fully defined
    vst1.32     {d10[0]},[r6],r3            @row iii: store 4 bytes, step to row iv
    vqmovn.u16  d12,q6                      @row iv: saturate to unsigned 8 bit

    subs        r9,r9,#4                    @4 columns done
    vst1.32     {d12[0]},[r6],r3            @row iv: store 4 bytes (NEON store leaves the flags untouched)
    bgt         core_loop                   @more columns left in this 4-row strip

end_core_loop:
    rsb         r11,r4,r2,lsl #2            @r11 = 4*(src_strd in bytes) - 2*wd
    subs        r8,r8,#4                    @4 rows done
    add         r0,r0,r11                   @pi2_src: rewind the row and move down 4 rows
    asr         r9,r4,#1                    @r9 = wd again for the next strip
    rsb         r12,r9,r3,lsl #2            @r12 = 4*dst_strd - wd
    add         r1,r1,r12                   @pu1_dst: rewind the row and move down 4 rows
    bgt         core_loop                   @more rows left: process the next 4-row strip

end_loops:
    vpop        {d8-d13}                    @restore the callee-saved NEON registers
    ldmfd       sp!,{r4-r12,r15}            @restore the core registers and return (pc <- saved lr)
2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
220