10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/*****************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*****************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/**
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@******************************************************************************
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @file
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  ihevc_inter_pred_filters_luma_vert.s
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @brief
240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  contains function definitions for inter prediction  interpolation.
250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* functions are coded in neon assembly and can be compiled using
260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* rvct
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @author
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  parthiban v
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @par list of functions:
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  - ihevc_inter_pred_luma_vert()
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @remarks
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  none
380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*/
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/* include reconstruction */
440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/**
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @brief
510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*     interprediction luma filter for vertical input
520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @par description:
540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*    the elements pointed to by 'pu1_src' and writes to the location pointed to by
560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*    'pu1_dst'. the output is rounded, downshifted by 6 and clipped to 8 bits.
570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*    assumptions : the function is optimized assuming the width is a
580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*    multiple of 4 or 8 and the height is a multiple of 2.
590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] pu1_src
610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  uword8 pointer to the source
620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[out] pu1_dst
640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  uword8 pointer to the destination
650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] src_strd
670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  integer source stride
680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] dst_strd
700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  integer destination stride
710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] pi1_coeff
730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  word8 pointer to the filter coefficients
740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] ht
760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  integer height of the array
770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] wd
790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  integer width of the array
800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @returns
820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @remarks
840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  none
850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*/
880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@void ihevc_inter_pred_luma_vert (
900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            uword8 *pu1_src,
910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            uword8 *pu1_dst,
920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            word32 src_strd,
930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            word32 dst_strd,
940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            word8 *pi1_coeff,
950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            word32 ht,
960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                            word32 wd   )
970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@**************variables vs registers*****************************************
990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r0 => *pu1_src
1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r1 => *pu1_dst
1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r2 =>  src_strd
1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r6 =>  dst_strd
1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r12 => *pi1_coeff
1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r3 =>  ht (copied into r7)
1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r5 =>  wd
1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.text                                       @ assemble what follows into the text (code) section
1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.align 4                                    @ on ARM, GNU as treats the operand as a power of two: align to 2^4 = 16 bytes
108d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer.syntax unified                             @ select the unified ARM/Thumb assembler syntax
1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.globl ihevc_inter_pred_luma_vert_a9q       @ give the entry-point symbol global (external) visibility
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.type ihevc_inter_pred_luma_vert_a9q, %function    @ mark the symbol as a function in the object's symbol table
1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarihevc_inter_pred_luma_vert_a9q:
1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         r12,[sp,#40]                @load pi1_coeff
1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         r6,r3
1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         r5,[sp,#48]                 @load wd
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d0},[r12]                  @coeff = vld1_s8(pi1_coeff)
1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         r12,r2,r2,lsl #2            @src_ctrd & pi1_coeff
1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vabs.s8     d0,d0                       @vabs_s8(coeff)
1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r0,r0,r12                   @r0->pu1_src    r12->pi1_coeff
1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         r3,[sp,#44]                 @load ht
1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r7,r3,#0                    @r3->ht
1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    @ble        end_loops           @end loop jump
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u8     d22,d0[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmp         r5,#8
1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u8     d23,d0[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u8     d24,d0[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u8     d25,d0[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u8     d26,d0[4]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u8     d27,d0[5]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u8     d28,d0[6]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u8     d29,d0[7]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    blt         core_loop_wd_4              @core loop wd 4 jump
1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    str         r0, [sp, #-4]!
1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    str         r1, [sp, #-4]!
1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bic         r4,r5,#7                    @r5 ->wd
1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rsb         r9,r4,r6,lsl #2             @r6->dst_strd   r5  ->wd
1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rsb         r8,r4,r2,lsl #2             @r2->src_strd
1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         r3, r5, lsr #3              @divide by 8
1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mul         r7, r3                      @multiply height by width
1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         r7, #4                      @subtract by one for epilog
1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarprolog:
1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    and         r10, r0, #31
1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r4,r4,#8
1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    addle       r0,r0,r8
1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bicle       r4,r5,#7                    @r5 ->wd
1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pld         [r3]
1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pld         [r3, r2]
1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pld         [r3, r2, lsl #1]
1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r3, r3, r2
1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pld         [r3, r2, lsl #1]
1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@
1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d1},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q6,d3,d23
2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d2,d22
2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d4,d24
2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q6,d5,d25
2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q6,d6,d26
2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d7,d27
2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q6,d16,d28
2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d17,d29
2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r14,r1,r6
2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@
2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@
2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    addle       r1,r1,r9
2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q7,d4,d23
2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r7,r7,#4
2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d3,d22
2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d5,d24
2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q7,d6,d25
2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q7,d7,d26
2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d16,d27
2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q7,d17,d28
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d18,d29
2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@
2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqrshrun.s16 d12,q6,#6
2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    blt         epilog_end                  @jumps to epilog_end
2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         epilog                      @jumps to epilog
2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarkernel_8:
2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r4,r4,#8
2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    addle       r0,r0,r8
2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bicle       r4,r5,#7                    @r5 ->wd
2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d12},[r14],r6
2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   and         r11, r0, #31
2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqrshrun.s16 d14,q7,#6
2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d14},[r14],r6
2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r14,r1,#0
2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r1, r1, #8
2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    addle       r1,r1,r9
2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@
2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   cmp         r11, r10
2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q6,d3,d23
2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r10, r3, r2, lsl #3         @ 10*strd - 8+2
2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d2,d22
2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r10, r10, r2                @ 11*strd
2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d4,d24
2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q6,d5,d25
3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q6,d6,d26
3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d8},[r14],r6               @vst1_u8(pu1_dst,sto_res)@
3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pld         [r10]                       @11+ 0
3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d7,d27
3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pld         [r10, r2]                   @11+ 1*strd
3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q6,d16,d28
3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pld         [r10, r2, lsl #1]           @11+ 2*strd
3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d17,d29
3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r10, r10, r2                @12*strd
3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@
3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pld         [r10, r2, lsl #1]           @11+ 3*strd
3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q7,d4,d23
3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   mov         r10, r11
3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d3,d22
3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r7,r7,#4
3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d5,d24
3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q7,d6,d25
3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q7,d7,d26
3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d16,d27
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q7,d17,d28
3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d18,d29
3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqrshrun.s16 d12,q6,#6
3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@
3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bgt         kernel_8                    @jumps to kernel_8
3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarepilog:
3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d12},[r14],r6
3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqrshrun.s16 d14,q7,#6
3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d14},[r14],r6
3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@
3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q6,d3,d23
3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d2,d22
3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d4,d24
3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q6,d5,d25
3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q6,d6,d26
3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d7,d27
3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q6,d16,d28
3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d17,d29
3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r14,r1,r6
3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@
3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@
3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q7,d4,d23
3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d3,d22
3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d5,d24
3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q7,d6,d25
3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q7,d7,d26
3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d16,d27
3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q7,d17,d28
3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d18,d29
3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@
3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqrshrun.s16 d12,q6,#6
3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarepilog_end:
3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d12},[r14],r6
4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqrshrun.s16 d14,q7,#6
4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d14},[r14],r6
4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarend_loops:
4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tst         r5,#7
4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         r1, [sp], #4
4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         r0, [sp], #4
4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
410d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer    ldmfdeq     sp!,{r4-r12,r15}            @reload the registers from sp
4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         r5, #4
4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r0, r0, #8
4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r1, r1, #8
4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         r7, #16
4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    @
4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarcore_loop_wd_4:
4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rsb         r9,r5,r6,lsl #2             @r6->dst_strd   r5  ->wd
4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rsb         r8,r5,r2,lsl #2             @r2->src_strd
4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmov.i8     d4,#0
4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarouter_loop_wd_4:
4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r12,r5,#0
4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ble         end_inner_loop_wd_4         @outer loop jump
4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarinner_loop_wd_4:
4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r3,r0,r2
4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d4[1]},[r3],r2             @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r12,r12,#4
4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u32    d5,d4[1]                    @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d5[1]},[r3],r2             @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d4[0]},[r0]                @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)@
4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q0,d5,d23                   @mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)@
4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u32    d6,d5[1]                    @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r0,r0,#4
4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d6[1]},[r3],r2             @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q0,d4,d22                   @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)@
4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u32    d7,d6[1]                    @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d7[1]},[r3],r2             @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q0,d6,d24                   @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)@
4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q4,d7,d23
4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u32    d4,d7[1]                    @src_tmp1 = vdup_lane_u32(src_tmp4, 1)@
4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q1,d7,d25                   @mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)@
4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d4[1]},[r3],r2             @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d6,d22
4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q0,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)@
4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u32    d5,d4[1]                    @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d4,d24
4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d5[1]},[r3],r2             @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q1,d5,d27                   @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)@
4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u32    d6,d5[1]                    @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d5,d25
4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d6[1]},[r3],r2             @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q0,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)@
4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u32    d7,d6[1]                    @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d6,d26
4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d7[1]},[r3],r2             @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q1,d7,d29                   @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)@
4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u32    d4,d7[1]
4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vadd.i16    q0,q0,q1                    @mul_res1 = vaddq_u16(mul_res1, mul_res2)@
4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d7,d27
4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d4[1]},[r3],r2
4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d4,d28
4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u32    d5,d4[1]
4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqrshrun.s16 d0,q0,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@
4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d5[1]},[r3]
4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r3,r1,r6
4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {d0[0]},[r1]                @vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)@
4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d5,d29
4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {d0[1]},[r3],r6             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)@
4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vqrshrun.s16 d8,q4,#6
4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {d8[0]},[r3],r6
4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r1,r1,#4
4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {d8[1]},[r3]
4860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bgt         inner_loop_wd_4
4870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarend_inner_loop_wd_4:
4890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r7,r7,#4
4900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r1,r1,r9
4910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r0,r0,r8
4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bgt         outer_loop_wd_4
4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldmfd       sp!, {r4-r12, r15}          @reload the registers from sp
4950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/**
4990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
5000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
5010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @brief
5020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*     inter prediction luma filter for vertical 16-bit output
5030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
5040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @par description:
5050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
5060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*    the elements pointed to by 'pu1_src' and writes to the location pointed to
5070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*    by 'pi2_dst'. no downshifting or clipping is done and the output is used
5080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*    as an input for weighted prediction. assumptions: the function is
5090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*    optimized considering the fact that width is a multiple of 4 or 8 and
5100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*    height is a multiple of 2.
5110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
5120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] pu1_src
5130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  uword8 pointer to the source
5140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
5150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[out] pi2_dst
5160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  word16 pointer to the destination
5170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
5180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] src_strd
5190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  integer source stride
5200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
5210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] dst_strd
5220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  integer destination stride
5230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
5240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] pi1_coeff
5250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  word8 pointer to the filter coefficients
5260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
5270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] ht
5280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  integer height of the array
5290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
5300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @param[in] wd
5310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  integer width of the array
5320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
5330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @returns
5340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
5350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @remarks
5360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  none
5370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
5380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
5390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*/
5400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@void ihevc_inter_pred_luma_vert_w16out(uword8 *pu1_src,
5420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                                    word16 *pi2_dst,
5430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                                    word32 src_strd,
5440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                                    word32 dst_strd,
5450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                                    word8 *pi1_coeff,
5460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                                    word32 ht,
5470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                                    word32 wd   )
5480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@**************variables vs registers*****************************************
5500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r0 => *pu1_src
5510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r1 => *pi2_dst
5520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r2 =>  src_strd
5530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r6 =>  dst_strd
5540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r12 => *pi1_coeff
5550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r5 =>  wd
5560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   r3 =>  ht
5570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.globl ihevc_inter_pred_luma_vert_w16out_a9q
5610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.type ihevc_inter_pred_luma_vert_w16out_a9q, %function
5630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarihevc_inter_pred_luma_vert_w16out_a9q:
5650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
5670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         r12,[sp,#40]                @load pi1_coeff
5690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         r6,r3
5700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         r5,[sp,#48]                 @load wd
5710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d0},[r12]                  @coeff = vld1_s8(pi1_coeff)
5720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         r12,r2,r2,lsl #2            @src_ctrd & pi1_coeff
5730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vabs.s8     d0,d0                       @vabs_s8(coeff)
5740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r0,r0,r12                   @r0->pu1_src    r12->pi1_coeff
5750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         r3,[sp,#44]                 @load ht
5760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r7,r3,#0                    @r3->ht
5770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    @ble        end_loops_16out         @end loop jump
5780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u8     d22,d0[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
5790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmp         r5,#8
5800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u8     d23,d0[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
5810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u8     d24,d0[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
5820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u8     d25,d0[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
5830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u8     d26,d0[4]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
5840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u8     d27,d0[5]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
5850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u8     d28,d0[6]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
5860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u8     d29,d0[7]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
5870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    blt         core_loop_wd_4_16out        @core loop wd 4 jump
5880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    str         r0, [sp, #-4]!
5890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    str         r1, [sp, #-4]!
5900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bic         r4,r5,#7                    @r5 ->wd
5920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rsb         r9,r4,r6,lsl #2             @r6->dst_strd   r5  ->wd
5930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rsb         r8,r4,r2,lsl #2             @r2->src_strd
5940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         r6, r6, lsl #1
5950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         r3, r5, lsr #3              @divide by 8
5960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mul         r7, r3                      @multiply height by width
5970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         r7, #4                      @subtract by one for epilog
5980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarprolog_16out:
6000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    and         r10, r0, #31
6020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
6030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
6050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
6060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r4,r4,#8
6070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
6080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
6090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
6100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
6110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
6120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
6130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
6140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
6150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
6160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
6170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
6180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
6190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
6200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
6210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
6220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
6230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    addle       r0,r0,r8
6260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
6270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bicle       r4,r5,#7                    @r5 ->wd
6290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
6300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
6320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
6330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pld         [r3]
6350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
6360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pld         [r3, r2]
6370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
6380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pld         [r3, r2, lsl #1]
6390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
6400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r3, r3, r2
6410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
6420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pld         [r3, r2, lsl #1]
6430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
6440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
6460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q6,d3,d23
6470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d1},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
6480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d2,d22
6490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
6500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d4,d24
6510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
6520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q6,d5,d25
6530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q6,d6,d26
6540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d7,d27
6550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q6,d16,d28
6560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d17,d29
6570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r14,r1,r6
6580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d8, d9},[r1]!              @vst1_u8(pu1_dst,sto_res)@
6590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    @vqrshrun.s16 d10,q5,#6         @sto_res = vqmovun_s16(sto_res_tmp)@
6600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    addle       r1,r1,r9,lsl #1
6610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q7,d4,d23
6630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r7,r7,#4
6640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d3,d22
6650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d5,d24
6660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q7,d6,d25
6670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
6680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q7,d7,d26
6690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
6700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d16,d27
6710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
6720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q7,d17,d28
6730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
6740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d18,d29
6750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
6760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d10, d11},[r14],r6         @vst1_u8(pu1_dst_tmp,sto_res)@
6780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    @vqrshrun.s16 d12,q6,#6
6790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    blt         epilog_end_16out
6820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    beq         epilog_16out                @jumps to epilog
6830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarkernel_8_16out:
@ Steady-state pass of the 8-column path. Every pass forms four 8-tap
@ vertical sums for one tile of 8 columns x 4 rows:
@   q4 <- rows d0..d7          q5 <- rows d1..d7,d16
@   q6 <- rows d2..d7,d16,d17  q7 <- rows d3..d7,d16,d17,d18
@ d22..d29 hold the tap magnitudes (coeffabs_0..7). Taps 0, 2, 5 and 7 are
@ subtracted (vmlsl); taps 1, 3, 4 and 6 are added (vmull/vmlal).
@ The loop is software pipelined: the q6/q7 stores near the top write rows
@ of the PREVIOUS tile, and the d0..d7 loads further down feed the NEXT tile.
@ The 16-bit sums are stored as they are (16 bytes per row); this loop does
@ no rounding shift and no narrowing.
@ r8, r9 and the starting values of r4/r7 are set before this label
@ (outside this excerpt).
6850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ r4 = columns still to do in the current band of four rows. The flags
    @ written here are consumed by the addle/bicle/addle further down; none
    @ of the instructions in between updates the flags.
6860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r4,r4,#8
6870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
6880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ band finished (r4 <= 0): step the source pointer by r8
6890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    addle       r0,r0,r8
6900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
6910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ d16..d18: the three extra rows below d7 that q5, q6 and q7 need
6920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
6930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
6940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
6960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
6970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ band finished: reload the column counter with wd rounded down to a
    @ multiple of 8
6980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bicle       r4,r5,#7                    @r5 ->wd
6990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
7000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
7020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
7030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ deferred store: third row (q6) of the previous tile
7040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d12,d13},[r14],r6
7050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
7060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ r3 restarts at the second row of the next tile (r0 may just have been
    @ stepped by the addle above)
7070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
7080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
7090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   and         r11, r0, #31
7120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
7130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ deferred store: fourth row (q7) of the previous tile
7140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d14,d15},[r14],r6
7150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
7160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ r14 = destination of this tile's second row (r1 + one row step r6)
7170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r14,r1,r6
7180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
7190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ d0/d1 are free now (last read by the q4/q5 steps above): start loading
    @ the next tile; r0 moves 8 columns to the right
7200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
7210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
7220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
7240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
7250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ first row of this tile: eight 16-bit sums, r1 advances by 16 bytes
7260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d8,d9},[r1]!               @vst1_u8(pu1_dst,sto_res)@
7270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
7280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ band finished: step the destination pointer by 2*r9 bytes
7290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    addle       r1,r1,r9,lsl #1
7300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
7310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   cmp         r11, r10
7330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
7340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ Prefetch source rows ahead of the loads. r3 is at (next tile base +
    @ 2*strd) here, so r10 becomes base + 10*strd, then base + 11*strd; the
    @ four pld's touch base + 11, 12, 13 and 14 strides.
7350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r10, r3, r2, lsl #3         @ 10*strd - 8+2
7360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q6,d3,d23
7370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r10, r10, r2                @ 11*strd
7390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d2,d22
7400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pld         [r10]                       @11+ 0
7420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d4,d24
7430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pld         [r10, r2]                   @11+ 1*strd
7450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q6,d5,d25
7460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pld         [r10, r2, lsl #1]           @11+ 2*strd
7480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q6,d6,d26
7490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r10, r10, r2                @12*strd
7510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d7,d27
7520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pld         [r10, r2, lsl #1]           @11+ 3*strd
7540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q6,d16,d28
7550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@   mov         r10, r11
7570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d17,d29
7580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ From here on each of d2..d7 is reloaded for the next tile only after
    @ the q7 step that reads its old value has been issued.
7590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
7600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q7,d4,d23
7610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ r7 is the loop counter, down by 4 per tile; these flags feed the bgt
    @ at the bottom of the loop
7620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r7,r7,#4
7630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d3,d22
7640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ second row of this tile (q5); r14 then points at the third row, which
    @ the next pass (or the epilog) stores
7650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d10, d11},[r14],r6         @vst1_u8(pu1_dst_tmp,sto_res)@
7660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d5,d24
7670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
7690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q7,d6,d25
7700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
7720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q7,d7,d26
7730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
7750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d16,d27
7760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
7780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q7,d17,d28
7790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
7810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d18,d29
7820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ r7 > 0: another tile follows; otherwise fall through into the epilog
7840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bgt         kernel_8_16out              @jumps to kernel_8
7850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarepilog_16out:
@ Pipeline drain for the 8-column path. Reached through the beq just ahead
@ of kernel_8_16out, or by falling out of the kernel loop once r7 is no
@ longer positive. It computes the four row sums (q4..q7) of the last tile
@ from the rows already held in d0..d7 plus three more loaded into
@ d16..d18, and stores the q6/q7 rows still pending from the tile before.
@ Nothing is preloaded for a further tile and no pointer wrap-around is
@ done. Control falls through into epilog_end_16out with this tile's q6/q7
@ still to be written.
@ The vqrshrun lines are commented out: the stores below write the 16-bit
@ sums directly (16 bytes per row).
7870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
7890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
7900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
7910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
7920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
7930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
7940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
7950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
    @ pending third row (q6) of the previous tile
7960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d12,d13},[r14],r6
7970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    @vqrshrun.s16 d14,q7,#6
7990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
8010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
8020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
8030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
8040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
8050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
8060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
8070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
8080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
    @ pending fourth row (q7) of the previous tile; q7 is recomputed below
8090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d14,d15},[r14],r6
8100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    @vqrshrun.s16 d8,q4,#6          @sto_res = vqmovun_s16(sto_res_tmp)@
8120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
8140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q6,d3,d23
8150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d2,d22
8160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d4,d24
8170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q6,d5,d25
8180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q6,d6,d26
8190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d7,d27
8200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q6,d16,d28
8210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q6,d17,d29
    @ last tile: r14 = its second-row destination, then its first row (q4)
    @ is stored through r1
8220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r14,r1,r6
8230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d8,d9},[r1]!               @vst1_u8(pu1_dst,sto_res)@
8240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    @vqrshrun.s16 d10,q5,#6         @sto_res = vqmovun_s16(sto_res_tmp)@
8250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
8270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q7,d4,d23
8280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d3,d22
8290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d5,d24
8300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q7,d6,d25
8310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q7,d7,d26
8320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d16,d27
8330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q7,d17,d28
8340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q7,d18,d29
8350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ second row (q5) of the last tile
8360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d10,d11},[r14],r6          @vst1_u8(pu1_dst_tmp,sto_res)@
8370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    @vqrshrun.s16 d12,q6,#6
8380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarepilog_end_16out:
@ Writes the last two pending rows: q6 (d12,d13) and then q7 (d14,d15),
@ 16 bytes each, through r14 with a row step of r6. Reached by falling
@ through from epilog_16out or through the blt just ahead of
@ kernel_8_16out. Execution continues into end_loops_16out.
8400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d12,d13},[r14],r6
8410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    @vqrshrun.s16 d14,q7,#6
8420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.8      {d14,d15},[r14],r6
8440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarend_loops_16out:
@ End of the 8-column path. If wd is a multiple of 8 the function returns
@ here; otherwise the registers are set up for one 4-column pass and
@ execution falls into core_loop_wd_4_16out.
    @ Z set <=> (wd & 7) == 0. The two ldr's below leave the flags alone, so
    @ the ldmfdeq still sees this result.
8470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tst         r5,#7
    @ Pop two words into r1 and r0. Presumably these are the dst/src
    @ pointers saved earlier in the function; the matching push is outside
    @ this excerpt - TODO confirm.
8480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         r1, [sp], #4
8490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         r0, [sp], #4
8500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ wd multiple of 8: restore r4-r12 and return by loading pc (r15)
851d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer    ldmfdeq     sp!,{r4-r12,r15}            @reload the registers from sp
    @ Leftover pass: width 4, source moved 8 bytes and destination 16 bytes
    @ to the right (8 pixels in, 8 halfwords out), row counter fixed at 16.
    @ NOTE(review): the constants 8/16/16 look like a wd == 12, ht == 16
    @ special case - confirm against the callers.
8520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         r5, #4
8530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r0, r0, #8
8540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r1, r1, #16
8550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         r7, #16
    @ halve r6 here because core_loop_wd_4_16out doubles it again after
    @ deriving r9 from it
8560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         r6, r6, lsr #1
8570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    @
8590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarcore_loop_wd_4_16out:
@ Setup for the 4-column path. Entered by fall-through from
@ end_loops_16out; it may also be a branch target from code outside this
@ excerpt.
    @ r9 = 4*r6 - r5 and r8 = 4*r2 - r5: the amounts added to the dst (as
    @ 2*r9 bytes) and src pointers at the end of every band of four rows,
    @ after the inner loop has already walked them across the columns
8610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rsb         r9,r5,r6,lsl #2             @r6->dst_strd   r5  ->wd
8620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    rsb         r8,r5,r2,lsl #2             @r2->src_strd
    @ clear d4 before the lane-wise loads of the inner loop
8630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmov.i8     d4,#0
    @ r6 *= 2: from here on r6 is the row step used by the stores
    @ (presumably dst_strd in 16-bit elements turned into bytes - confirm)
8640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         r6, r6, lsl #1
8650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarouter_loop_wd_4_16out:
@ Top of one band of four rows: r12 = wd (r5) becomes the column counter
@ for the inner loop. The subs with #0 only copies r5 and sets the flags,
@ so a width <= 0 skips the inner loop entirely.
8670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r12,r5,#0
8680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ble         end_inner_loop_wd_4_16out   @outer loop jump
8690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarinner_loop_wd_4_16out:
@ One tile of 4 columns x 4 rows per pass.
@ Each d register carries two consecutive source rows of four pixels:
@ lane 0 = row k, lane 1 = row k+1. A register is built by duplicating
@ lane 1 of its predecessor and then loading the following row into lane 1.
@ One widening multiply therefore serves two output rows at once:
@   q0 = {output row 0, output row 1}   (q1 gathers taps 3, 5 and 7 and is
@                                        folded into q0 by the vadd)
@   q4 = {output row 2, output row 3}
@ Eleven source rows are read per tile. d22..d29 are the tap magnitudes;
@ the tap sign is given by the choice of vmlal/vmlsl.
@ The four stores write the 16-bit sums unmodified, 8 bytes per row; the
@ vqrshrun lines are commented out.
8710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r3,r0,r2
8720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d4[1]},[r3],r2             @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    @ column counter; these flags survive to the bgt at the bottom because
    @ nothing in between sets flags
8730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r12,r12,#4
8740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u32    d5,d4[1]                    @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
8750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d5[1]},[r3],r2             @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
8760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d4[0]},[r0]                @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)@
8770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q0,d5,d23                   @mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)@
8780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u32    d6,d5[1]                    @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    @ source pointer moves 4 columns to the right for the next tile
8800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r0,r0,#4
8810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d6[1]},[r3],r2             @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
8820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q0,d4,d22                   @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)@
8830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u32    d7,d6[1]                    @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
8850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d7[1]},[r3],r2             @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
8860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q0,d6,d24                   @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)@
8870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ q4 (output rows 2 and 3) starts here with tap 1 on source rows 3|4
8880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q4,d7,d23
8890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u32    d4,d7[1]                    @src_tmp1 = vdup_lane_u32(src_tmp4, 1)@
8900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmull.u8    q1,d7,d25                   @mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)@
8910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d4[1]},[r3],r2             @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
8920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d6,d22
8930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q0,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)@
8940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u32    d5,d4[1]                    @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
8960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d4,d24
8970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d5[1]},[r3],r2             @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
8980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q1,d5,d27                   @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)@
8990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u32    d6,d5[1]                    @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
9010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d5,d25
9020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d6[1]},[r3],r2             @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
9030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q0,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)@
9040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u32    d7,d6[1]                    @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
9060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d6,d26
9070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d7[1]},[r3],r2             @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
9080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q1,d7,d29                   @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)@
9090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u32    d4,d7[1]
    @ q0 is complete after this add: all eight taps for output rows 0 and 1
9110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vadd.i16    q0,q0,q1                    @mul_res1 = vaddq_u16(mul_res1, mul_res2)@
9120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d7,d27
9140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d4[1]},[r3],r2
9150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlal.u8    q4,d4,d28
9160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vdup.u32    d5,d4[1]
9170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    @vqrshrun.s16 d0,q0,#6          @sto_res = vqmovun_s16(sto_res_tmp)@
9180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ eleventh and last source row of the tile; r3 is not advanced because
    @ it is re-purposed as the destination row pointer on the next line
9190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vld1.u32    {d5[1]},[r3]
9200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r3,r1,r6
9210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {d0},[r1]!                  @store output row 0: d0 = four 16-bit sums, r1 += 8 bytes@
9220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vmlsl.u8    q4,d5,d29
9240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {d1},[r3],r6                @store output row 1 (d1), then step r3 by r6 to the next row@
9250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    @vqrshrun.s16 d8,q4,#6
9260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
    @ output rows 2 and 3 (the two halves of q4)
9270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {d8},[r3],r6
9280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    @add        r1,r1,#4
9290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    vst1.32     {d9},[r3]
    @ more columns left in this band (r12 > 0)?
9300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bgt         inner_loop_wd_4_16out
9310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarend_inner_loop_wd_4_16out:
@ End of one band of four rows in the 4-column path: count the rows off r7,
@ step the dst pointer by 2*r9 bytes and the src pointer by r8 (both were
@ already walked across the columns by the inner loop), and start the next
@ band while r7 > 0. When the rows are used up, restore r4-r12 and return
@ by loading pc (r15) from the stack.
9330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    subs        r7,r7,#4
    @ the two adds leave the flags from the subs intact for the bgt
9340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r1,r1,r9,lsl #1
9350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r0,r0,r8
9360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    bgt         outer_loop_wd_4_16out
9370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldmfd       sp!, {r4-r12, r15}          @reload the registers from sp
9390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
948