18d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/******************************************************************************
28d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
38d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Copyright (C) 2015 The Android Open Source Project
48d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
58d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Licensed under the Apache License, Version 2.0 (the "License");
68d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * you may not use this file except in compliance with the License.
78d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * You may obtain a copy of the License at:
88d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
98d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * http://www.apache.org/licenses/LICENSE-2.0
108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Unless required by applicable law or agreed to in writing, software
128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * distributed under the License is distributed on an "AS IS" BASIS,
138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * See the License for the specific language governing permissions and
158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * limitations under the License.
168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *****************************************************************************
188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*/
208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*****************************************************************************/
218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*                                                                           */
228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*  File Name         : ih264_deblk_luma_a9.s                                */
238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*                                                                           */
248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*  Description       : Contains function definitions for deblocking luma    */
258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*                      edge. Functions are coded in NEON assembly and can   */
268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*                      be compiled using ARM RVDS.                          */
278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*                                                                           */
288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*  List of Functions : ih264_deblk_luma_vert_bs4_a9()                       */
298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*                      ih264_deblk_luma_vert_bslt4_a9()                     */
308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*                      ih264_deblk_luma_horz_bs4_a9()                       */
318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*                      ih264_deblk_luma_horz_bslt4_a9()                     */
328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*                      ih264_deblk_luma_vert_bs4_mbaff_a9()                 */
338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*                      ih264_deblk_luma_vert_bslt4_mbaff_a9()               */
348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*                                                                           */
358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*  Issues / Problems : None                                                 */
368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*                                                                           */
378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*  Revision History  :                                                      */
388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*                                                                           */
398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*         28 11 2013   Ittiam          Draft                                */
418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*         05 01 2015   Kaushik         Added double-call functions for      */
428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*                      Senthoor        vertical deblocking.                 */
438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*                                                                           */
448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/*****************************************************************************/
458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S.text
488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S.p2align 2
498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
507497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@**
518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*******************************************************************************
528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @brief
548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*     Performs filtering of a luma block horizontal edge for cases where the
558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*     boundary strength is less than 4
568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @par Description:
588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*    This operation is described in  Sec. 8.7.2.3 under the title
598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*    "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] r0 - pu1_src
628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Pointer to the src sample q0
638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] r1 - src_strd
658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Source stride
668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] r2 - alpha
688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Alpha Value for the boundary
698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] r3 - beta
718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Beta Value for the boundary
728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] sp(0) - u4_bs
748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Packed Boundary strength array
758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] sp(4) - pu1_cliptab
778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  tc0_table
788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @returns
808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  None
818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @remarks
838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  None
848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*******************************************************************************
867497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@*
878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    .global ih264_deblk_luma_horz_bslt4_a9
898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sih264_deblk_luma_horz_bslt4_a9:
@ Filters one 16-pixel-wide horizontal luma edge for boundary strength < 4.
@ Six rows, one src_strd apart, are read: p2, p1, p0 above pu1_src and
@ q0, q1, q2 starting at pu1_src. Rows p1, p0, q0 and q1 are written back;
@ rows p2 and q2 are only read.
@
@ Register roles:
@   r0 = running row pointer      r1 = src_strd
@   r2 = alpha                    r3 = beta
@   r4 = u4_bs (byte-reversed)    r5 = pu1_cliptab
@   r6 = pointer to row p1        r7 = pointer to row p0 (advanced on store)
@   q5,q4,q3 = p2,p1,p0           q0,q1,q2 = q0,q1,q2
@   q6 = per-pixel filter mask    q7 = C0 replicated to every byte
@
@ Each byte of u4_bs covers a group of four pixels; after the rev below the
@ most significant byte of u4_bs governs the first four pixels of the row.
918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    stmfd         sp!, {r4-r7, lr}      @save r4-r7 and lr (5 words = 0x14 bytes)
938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldrd          r4, r5, [sp, #0x14]   @stack args sit above the 0x14-byte save area: r4 = u4_bs, r5 = pu1_cliptab
958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vpush         {d8 - d15}            @d8-d15 (q4-q7) are overwritten below, so preserve them
968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub           r0, r0, r1, lsl #1    @r0 = pu1_src - 2*src_strd
978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub           r0, r0, r1            @r0 = pu1_src - 3*src_strd, i.e. pointer to row p2
988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    rev           r4, r4                @byte-reverse u4_bs so its most significant byte ends up in byte lane 0
998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        {q5}, [r0], r1        @q5 = 16 pixels of p2, r0 advances to row p1
1008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmov.32       d12[0], r4            @low word of d12 = the four bS bytes
1018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mov           r6, r0                @r6 = pointer to row p1, kept for write-back
1028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        {q4}, [r0], r1        @q4 = 16 pixels of p1, r0 advances to row p0
1038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mov           r7, r0                @r7 = pointer to row p0, kept for write-back
1048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        {q3}, [r0], r1        @q3 = 16 pixels of p0, r0 advances to row q0
1058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmovl.u8      q6, d12               @widen bS bytes to 16 bits: d12 = four bS values as u16
1068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        {q0}, [r0], r1        @q0 = 16 pixels of q0, r0 advances to row q1
1078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q13, q4, q3           @q13 = ABS(p1 - p0)
1088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        {q1}, [r0], r1        @q1 = 16 pixels of q1, r0 advances to row q2
1098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q11, q3, q0           @q11 = ABS(p0 - q0)
1108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       d16[0], [r5]          @low word of d16 = first four bytes of the clip table
1118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q12, q1, q0           @q12 = ABS(q1 - q0)
1128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        {q2}, [r0], r1        @q2 = 16 pixels of q2
1138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtbl.8        d14, {d16}, d12       @table lookup: even bytes of d14 = cliptab[bS], odd bytes = cliptab[0] (index byte is 0)
1148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vdup.8        q10, r2               @q10 = alpha in every byte
1158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vdup.8        q8, r3                @q8 = beta in every byte (clip table in d16 was already consumed by vtbl)
1168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmovl.u16     q6, d12               @widen bS to 32 bits: one lane per group of four pixels
1178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmovl.u16     q7, d14               @widen looked-up clip values to 32-bit lanes (clip value in byte 0)
1188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q14, q5, q3           @q14 = Ap = ABS(p2 - p0)
1198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q15, q2, q0           @q15 = Aq = ABS(q2 - q0)
1208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcgt.s32      q6, q6, #0            @q6 = all-ones in each 4-pixel lane where bS > 0
1218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsli.32       q7, q7, #8            @copy byte 0 of each lane into byte 1
1228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcge.u8       q9, q11, q10          @q9 = ( ABS(p0 - q0) >= Alpha )
1238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcge.u8       q12, q12, q8          @q12 = ( ABS(q1 - q0) >= Beta )
1248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcge.u8       q13, q13, q8          @q13 = ( ABS(p1 - p0) >= Beta )
1258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcgt.u8       q10, q8, q14          @q10 = (Ap < Beta) mask, replaces alpha
1268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcgt.u8       q11, q8, q15          @q11 = (Aq < Beta) mask
1278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsli.32       q7, q7, #16           @copy low halfword into high halfword: q7 = C0 in every byte
1288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vorr          q9, q9, q12           @q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
1298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubl.u8      q15, d1, d7           @q15 = (q0 - p0) as 16-bit, upper 8 pixels
1308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubl.u8      q12, d0, d6           @q12 = (q0 - p0) as 16-bit, lower 8 pixels
1318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vorr          q9, q9, q13           @q9 = skip mask: ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
1328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubl.u8      q14, d8, d2           @q14 = (p1 - q1) as 16-bit, lower 8 pixels
1338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.i16      q13, q15, #2          @q13 = (q0 - p0)<<2, upper 8 pixels
1348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.i16      q12, q12, #2          @q12 = (q0 - p0)<<2, lower 8 pixels
1358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubl.u8      q15, d9, d3           @q15 = (p1 - q1) as 16-bit, upper 8 pixels
1368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbic          q6, q6, q9            @q6 = final filter mask: (bS > 0) AND NOT skip mask
1378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.i16      q12, q12, q14         @q12 = ((q0 - p0)<<2) + (p1 - q1), lower 8 pixels
1388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.i16      q13, q13, q15         @q13 = ((q0 - p0)<<2) + (p1 - q1), upper 8 pixels
1398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.i8       q9, q7, q10           @q9 = C0 + (Ap < Beta); subtracting the 0xFF mask adds 1
1408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrhadd.u8     q8, q3, q0            @q8 = ((p0+q0+1) >> 1), replaces beta which is no longer needed
1418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s16   d24, q12, #3          @rounding shift right by 3 and saturating narrow to s8, lower 8 pixels
1428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s16   d25, q13, #3          @q12 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3 as s8
1438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.i8       q9, q9, q11           @q9 = C = C0 + (Ap < Beta) + (Aq < Beta)
1448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vand.i8       q10, q10, q6          @q10 = (Ap < Beta) AND filter mask: where p1 may be modified
1458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vand.i8       q11, q11, q6          @q11 = (Aq < Beta) AND filter mask: where q1 may be modified
1468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabs.s8       q13, q12              @q13 = ABS(i_macro)
1478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q14, d17, d11         @q14 = p2 + ((p0+q0+1)>>1), upper 8 pixels
1488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q5, d16, d10          @q5 = p2 + ((p0+q0+1)>>1), lower 8 pixels (overwrites p2 after its last use)
1498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q15, d17, d5          @q15 = q2 + ((p0+q0+1)>>1), upper 8 pixels
1508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmin.u8       q9, q13, q9           @q9 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
1518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshll.u8      q13, d9, #1           @q13 = (p1<<1), upper 8 pixels
1528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q2, d16, d4           @q2 = q2 + ((p0+q0+1)>>1), lower 8 pixels (overwrites q2 after its last use)
1538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshll.u8      q8, d8, #1            @q8 = (p1<<1), lower 8 pixels (the average in q8 is no longer needed)
1548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vand          q9, q9, q6            @force delta to zero for pixels that must not be filtered
1558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.i16      q14, q14, q13         @q14 = [p2 + ((p0+q0+1)>>1)] - (p1<<1), upper 8 pixels
1568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.i16      q5, q5, q8            @q5 = [p2 + ((p0+q0+1)>>1)] - (p1<<1), lower 8 pixels
1578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshll.u8      q8, d2, #1            @q8 = (q1<<1), lower 8 pixels
1588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshll.u8      q13, d3, #1           @q13 = (q1<<1), upper 8 pixels
1598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqshrn.s16    d29, q14, #1          @shift right by 1 and saturating narrow to s8, upper 8 pixels
1608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqshrn.s16    d28, q5, #1           @q14 = i_macro_p1 = ([p2 + ((p0+q0+1)>>1)] - (p1<<1)) >> 1 as s8
1618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.i16      q2, q2, q8            @q2 = [q2 + ((p0+q0+1)>>1)] - (q1<<1), lower 8 pixels
1628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.i16      q15, q15, q13         @q15 = [q2 + ((p0+q0+1)>>1)] - (q1<<1), upper 8 pixels
1638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vneg.s8       q13, q7               @q13 = -C0
1648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmin.s8       q14, q14, q7          @q14 = min(C0, i_macro_p1)
1658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcge.s8       q12, q12, #0          @q12 = (i_macro >= 0) mask, selects the direction of delta
1668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqshrn.s16    d31, q15, #1          @shift right by 1 and saturating narrow to s8, upper 8 pixels
1678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqshrn.s16    d30, q2, #1           @q15 = i_macro_q1 = ([q2 + ((p0+q0+1)>>1)] - (q1<<1)) >> 1 as s8
1688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmax.s8       q14, q14, q13         @q14 = max( -C0, min(C0, i_macro_p1) )
1698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqadd.u8      q8, q3, q9            @q8 = p0 + delta, saturated to u8
1708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqsub.u8      q3, q3, q9            @q3 = p0 - delta, saturated to u8
1718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmin.s8       q15, q15, q7          @q15 = min(C0, i_macro_q1)
1728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vand.i8       q14, q10, q14         @keep the p1 adjustment only where Ap < Beta and filtering is enabled
1738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqadd.u8      q7, q0, q9            @q7 = q0 + delta, saturated to u8 (overwrites C0 after its last use)
1748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqsub.u8      q0, q0, q9            @q0 = q0 - delta, saturated to u8
1758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmax.s8       q15, q15, q13         @q15 = max( -C0, min(C0, i_macro_q1) )
1768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbif          q8, q3, q12           @q8 = (i_macro >= 0) ? (p0+delta) : (p0-delta)
1778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbif          q0, q7, q12           @q0 = (i_macro >= 0) ? (q0-delta) : (q0+delta)
1788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.i8       q14, q14, q4          @q14 = filtered p1 = p1 + masked, clipped adjustment
1798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vand.i8       q15, q11, q15         @keep the q1 adjustment only where Aq < Beta and filtering is enabled
1808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        {q8}, [r7], r1        @write back filtered p0, r7 advances to row q0
1818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.i8       q15, q15, q1          @q15 = filtered q1 = q1 + masked, clipped adjustment
1828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        {q0}, [r7], r1        @write back filtered q0, r7 advances to row q1
1838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        {q14}, [r6]           @write back filtered p1
1848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        {q15}, [r7], r1       @write back filtered q1 (the post-incremented r7 is not used again)
1858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vpop          {d8 - d15}            @restore d8-d15
1868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldmfd         sp!, {r4-r7, pc}      @restore r4-r7 and return by loading the saved lr into pc
1878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1907497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@**
1918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*******************************************************************************
1928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
1938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @brief
1948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*     Performs filtering of a luma block horizontal edge when the
1958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*     boundary strength is set to 4
1968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
1978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @par Description:
1988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*    This operation is described in  Sec. 8.7.2.4 under the title
1998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*    "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
2008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
2018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] r0 - pu1_src
2028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Pointer to the src sample q0
2038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
2048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] r1 - src_strd
2058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Source stride
2068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
2078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] r2 - alpha
2088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Alpha Value for the boundary
2098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
2108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] r3 - beta
2118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Beta Value for the boundary
2128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
2138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @returns
2148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  None
2158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
2168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @remarks
2178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  None
2188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
2198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*******************************************************************************
2207497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@*
2218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    .global ih264_deblk_luma_horz_bs4_a9
2238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sih264_deblk_luma_horz_bs4_a9:
2258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @ Back up necessary registers on stack
2278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    stmfd         sp!, {r12, r14}
2288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vpush         {d8 - d15}
2298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @ Init
2308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vdup.8        q0, r2                @duplicate alpha
2318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub           r12, r0, r1           @pointer to p0 = q0 - src_strd
2328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vdup.8        q1, r3                @duplicate beta
2338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub           r14, r0, r1, lsl#1    @pointer to p1 = q0 - src_strd*2
2348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub           r2, r0, r1, lsl#2     @pointer to p3 = q0 - src_strd*4
2358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub           r3, r14, r1           @pointer to p2 = p1 - src_strd
2368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @ Load Data
2388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        {d4, d5}, [r0], r1    @load q0 to Q2, q0 = q0 + src_strd
2398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        {d6, d7}, [r12]       @load p0 to Q3
2408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        {d8, d9}, [r0], r1    @load q1 to Q4, q0 = q0 + src_strd
2418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        {d10, d11}, [r14]     @load p1 to Q5
2428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @ Filter Decision
2448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q6, q2, q3            @ABS(p0 - q0)
2458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q7, q4, q2            @ABS(q1 - q0)
2468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q8, q5, q3            @ABS(p1 - p0)
2478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcge.u8       q9, q6, q0            @ABS(p0 - q0) >= Alpha
2488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcge.u8       q7, q7, q1            @ABS(q1 - q0) >= Beta
2498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcge.u8       q8, q8, q1            @ABS(p1 - p0) >= Beta
2508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmov.i8       q10, #2
2518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vorr          q9, q9, q7            @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta
2528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        {d14, d15}, [r0], r1  @load q2 to Q7, q0 = q0 + src_strd
2538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vorr          q9, q9, q8            @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta
2548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsra.u8       q10, q0, #2           @((Alpha >> 2) + 2)
2558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q11, q7, q2           @Aq  = ABS(q2 - q0)
2568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q12, d4, d6           @p0+q0 L
2578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q13, d5, d7           @p0+q0 H
2588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vclt.u8       q11, q11, q1          @Aq < Beta
2598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vclt.u8       q10, q6, q10          @(ABS(p0 - q0) <((Alpha >>2) + 2))
2608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @ Deblock Filtering q0', q1', q2'
2628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q14, q12, d8          @p0+q0+q1 L
2638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q15, q13, d9          @p0+q0+q1 H
2648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vand          q11, q11, q10         @(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
2658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @ q0' if (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) TRUE
2668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.i16      q8, q14, q14          @2*(p0+q0+q1)L
2678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.i16      q0, q15, q15          @2*(p0+q0+q1)H
2688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q8, q8, d14           @2*(p0+q0+q1)+q2 L
2698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q0, q0, d15           @2*(p0+q0+q1)+q2 H
2708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q8, q8, d10           @2*(p0+q0+q1)+q2 +p1 L
2718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q0, q0, d11           @2*(p0+q0+q1)+q2 +p1 H
2728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d12, q8, #3           @(2*(p0+q0+q1)+q2 +p1 +4)>> 3 L [q0']
2738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d13, q0, #3           @(2*(p0+q0+q1)+q2 +p1 +4)>> 3 H [q0']
2748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @ q0" if (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) FALSE
2758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q8, d8, d8            @2*q1 L
2768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q0, d9, d9            @2*q1 H
2778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q8, q8, d4            @2*q1+q0 L
2788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q0, q0, d5            @2*q1+q0 H
2798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q8, q8, d10           @2*q1+q0+p1  L
2808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q0, q0, d11           @2*q1+q0+p1 H
2818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d16, q8, #2           @(2*q1+q0+p1+2)>>2 L [q0"]
2828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d17, q0, #2           @(2*q1+q0+p1+2)>>2 H [q0"]
2838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @ q1'
2848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q14, q14, d14         @p0+q0+q1+q2 L
2858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q15, q15, d15         @p0+q0+q1+q2 H
2868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        {q0}, [r0], r1        @load q3 to Q0, q0 = q0 + src_strd
2878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbit          q8, q6, q11           @choosing between q0' and q0" depending on condn
2888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub           r0, r0, r1, lsl #2    @pointer to q0
2898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbic          q11, q11, q9          @((ABS(p0 - q0) >= Alpha  || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta))
2908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                        @ && (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
2918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d12, q14, #2          @(p0+q0+q1+q2+2)>>2 L [q1']
2928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d13, q15, #2          @(p0+q0+q1+q2+2)>>2 H [q1']
2938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbif          q2, q8, q9            @choose q0 or filtered q0
2948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @ q2'
2958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q8, d14, d0           @q2+q3,L
2968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q0, d15, d1           @q2+q3,H
2978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.i16      q14, q14, q8          @p0+q0+q1+2*q2+q3 L
2988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        {d4, d5}, [r0], r1    @store q0
2998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.i16      q15, q15, q0          @p0+q0+q1+2*q2+q3 H
3008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.i16      q14, q14, q8          @p0+q0+q1+3*q2+2*q3 L
3018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.i16      q15, q15, q0          @p0+q0+q1+3*q2+2*q3 H
3028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d0, q14, #3           @(p0+q0+q1+3*q2+2*q3+4)>>3 L [q2']
3038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d1, q15, #3           @(p0+q0+q1+3*q2+2*q3+4)>>3 H [q2']
3048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        {d30, d31}, [r3]      @load p2 to Q15
3058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbif          q6, q4, q11           @choose q1 or filtered value of q1
3068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q8, q15, q3           @Ap,ABS(p2 - p0)
3088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q12, q12, d10         @p0+q0+p1 L
3098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbif          q0, q7, q11           @choose q2 or filtered q2
3108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q13, q13, d11         @p0+q0+p1 H
3118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        {d12, d13}, [r0], r1  @store q1
3128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vclt.u8       q8, q8, q1            @Ap < Beta
3138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.i16      q14, q12, q12         @2*(p0+q0+p1) L
3148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.i16      q2, q13, q13          @2*(p0+q0+p1) H
3158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        {d0, d1}, [r0], r1    @store q2
3168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vand          q10, q10, q8          @((Ap < Beta) && (ABS(p0 - q0) <((Alpha >>2) + 2)))
3178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q14, q14, d30         @2*(p0+q0+p1)+p2 l
3188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q2, q2, d31           @2*(p0+q0+p1)+p2 H
3198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q14, q14, d8          @2*(p0+q0+p1)+p2+q1 L
3208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q2, q2, d9            @2*(p0+q0+p1)+p2+q1 H
3218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d28, q14, #3          @(2*(p0+q0+p1)+p2+q1+4)>>3  L,p0'
3228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d29, q2, #3           @(2*(p0+q0+p1)+p2+q1+4)>>3  H,p0'
3238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmov.i8       d0, #2
3248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmov.i16      d1, #2
3258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q1, d6, d8            @p0+q1      L
3268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmlal.u8      q1, d10, d0           @2*p1+p0+q1 L
3278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q8, d7, d9            @p0+q1  H
3288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmlal.u8      q8, d11, d0           @2*p1+p0+q1 H
3298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q6, q12, d30          @(p0+q0+p1) +p2 L
3308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        {d24, d25}, [r2]      @load p3,Q12
3318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q2, q13, d31          @(p0+q0+p1) +p2 H
3328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q4, d30, d24          @p2+p3 L
3338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d26, q6, #2           @((p0+q0+p1)+p2 +2)>>2,p1' L
3348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d2, q1, #2            @(2*p1+p0+q1+2)>>2,p0"L
3358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d27, q2, #2           @((p0+q0+p1)+p2 +2)>>2,p1' H
3368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d3, q8, #2            @(2*p1+p0+q1+2)>>2,p0" H
3378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q8, d31, d25          @p2+p3 H
3388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmla.u16      q6, q4, d1[0]         @(p0+q0+p1)+3*p2+2*p3 L
3398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmla.u16      q2, q8, d1[0]         @(p0+q0+p1)+3*p2+2*p3 H
3408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbic          q8, q10, q9           @((ABS(p0 - q0) >= Alpha  || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta))
3418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                        @&& (Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
3428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbit          q1, q14, q10          @choosing between po' and p0"
3438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d12, q6, #3           @((p0+q0+p1)+3*p2+2*p3+4)>>3 L p2'
3448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d13, q2, #3           @((p0+q0+p1)+3*p2+2*p3+4)>>3 H p2'
3458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbif          q3, q1, q9            @choosing between p0 and filtered value of p0
3468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbit          q5, q13, q8           @choosing between p1 and p1'
3478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbit          q15, q6, q8           @choosing between p2 and p2'
3488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        {d6, d7}, [r12]       @store p0
3498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        {d10, d11}, [r14]     @store p1
3508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        {d30, d31}, [r3]      @store p2
3518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vpop          {d8 - d15}
3528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldmfd         sp!, {r12, pc}
3538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3567497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@**
3578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*******************************************************************************
3588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
3598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @brief
3608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*     Performs filtering of a luma block vertical edge for cases where the
3618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*     boundary strength is less than 4
3628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
3638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @par Description:
3648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*    This operation is described in  Sec. 8.7.2.3 under the title
3658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*    "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
3668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
3678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] r0 - pu1_src
3688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Pointer to the src sample q0
3698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
3708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] r1 - src_strd
3718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Source stride
3728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
3738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] r2 - alpha
3748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Alpha Value for the boundary
3758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
3768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] r3 - beta
3778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Beta Value for the boundary
3788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
3798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] sp(0) - u4_bs
3808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Packed Boundary strength array
3818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
3828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] sp(4) - pu1_cliptab
3838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  tc0_table
3848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
3858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @returns
3868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  None
3878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
3888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @remarks
3898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  None
3908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
3918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*******************************************************************************
3927497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@*
3938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    .global ih264_deblk_luma_vert_bslt4_a9
3958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sih264_deblk_luma_vert_bslt4_a9:
    @ Outline of this routine:
    @   1. load 16 rows of 8 bytes starting at pu1_src - 4 (p3..q3 of each row)
    @   2. transpose so that each of Q0..Q7 holds one pixel column for all 16 rows
    @      (Q0=p3, Q1=p2, Q2=p1, Q3=p0, Q4=q0, Q5=q1, Q6=q2, Q7=q3)
    @   3. compute the filtered p1, p0, q0, q1 (Q2..Q5); Q0, Q1, Q6, Q7 are not modified
    @   4. transpose back and store all 16 rows of 8 bytes to the same addresses
    @ r0 = pu1_src, r1 = src_strd, r2 = alpha, r3 = beta; u4_bs and pu1_cliptab
    @ are read from the stack. r12 and r14 are used as scratch registers.
3978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    stmfd         sp!, {r12, lr}        @save r12 and the return address (8 bytes)
3998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub           r0, r0, #4            @r0 = pu1_src - 4, so each 8-byte row load spans p3..q3
4018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldr           r12, [sp, #8]         @r12 = u4_bs (first stack argument, above the two saved registers)
4028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldr           r14, [sp, #12]        @r14 = pu1_cliptab (second stack argument)
4038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vpush         {d8 - d15}            @d8-d15 are overwritten below; restored by the vpop before return
4048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @load 8 bytes (p3:p2:p1:p0:q0:q1:q2:q3) from each of the 16 rows
4058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        {d0}, [r0], r1        @row1
4068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d2, [r0], r1          @row2
4078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d4, [r0], r1          @row3
4088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    rev           r12, r12              @byte-reverse u4_bs so its most significant byte lands in byte lane 0
4098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d6, [r0], r1          @row4
4108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmov.32       d18[0], r12           @d18[0] = byte-reversed u4_bs
4118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       d16[0], [r14]         @d16[0] = first four bytes of pu1_cliptab
4128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d8, [r0], r1          @row5
4138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmovl.u8      q9, d18               @d18 = the four bS bytes zero-extended to 16 bits (d19 is overwritten before use)
4148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d10, [r0], r1         @row6
4158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d12, [r0], r1         @row7
    @NOTE(review): only d16[0] of the lookup table is loaded above, so a bS byte
    @of 4..7 would select an undefined table byte - presumably callers only
    @pass bS values below 4 to this routine; confirm against the caller
4168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtbl.8        d16, {d16}, d18       @table lookup: even bytes = pu1_cliptab[bS], odd bytes = pu1_cliptab[0]
4178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d14, [r0], r1         @row8
4188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d1, [r0], r1          @row9
4198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmovl.u16     q8, d16               @widen each looked-up 16-bit pair to a 32-bit lane
4208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d3, [r0], r1          @row10
4218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d5, [r0], r1          @row11
4228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d7, [r0], r1          @row12
4238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsli.32       q8, q8, #8            @shift left by 8 keeping the low byte: bytes 0 and 1 of each lane = pu1_cliptab[bS]
4248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d9, [r0], r1          @row13
4258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d11, [r0], r1         @row14
4268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d13, [r0], r1         @row15
4278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsli.32       q8, q8, #16           @Q8 = C0, replicated into all 4 bytes of each 32-bit lane (one bS per 4 rows)
4288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d15, [r0], r1         @row16
4298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @taking two 8x8 transposes (rows 1-8 in the even D registers, rows 9-16 in the odd ones)
4318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @2X2 transposes
4328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d0, d2                @row1 &2
4338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d4, d6                @row3&row4
4348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d8, d10               @row5&6
4358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d12, d14              @row7 & 8
4368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d1, d3                @row9 &10
4378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d5, d7                @row11 & 12
4388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d9, d11               @row13 &14
4398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d13, d15              @row15 & 16
4408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @4x4 transposes
4418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d2, d6                @row2 & row4
4428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d10, d14              @row6 & row8
4438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d3, d7                @row10 & 12
4448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d11, d15              @row14 & row16
4458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d6, d14               @row4 & 8
4468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d7, d15               @row 12 & 16
4478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @now Q3 -> p0 and Q7 -> q3
4498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d0, d4                @row1 & 3
4508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d8, d12               @row 5 & 7
4518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d1, d5                @row9 & row11
4528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d9, d13               @row13 & row15
4538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d0, d8                @row1 & row5
4548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d1, d9                @row9 & 13
4558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @now Q0 -> p3 and Q4 -> q0
4578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @filter arithmetic starts here, interleaved with the remaining transposes, as p0 and q0 are now ready
4588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d2, d10               @row2 &6
4598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrhadd.u8     q10, q3, q4           @((p0 + q0 + 1) >> 1)
4608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d3, d11               @row10&row14
4618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmov.i8       d19, #2               @constant 2, multiplier for the (p1 << 1) term below
4628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @now Q1 -> p2 and Q5 -> q1
4638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d4, d12               @row3 & 7
4648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q11, q3, q4           @ABS(p0 - q0)
4658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d5, d13               @row11 & row15
4668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q12, d20, d2          @(p2 + ((p0 + q0 + 1) >> 1) L
4678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @now Q2 -> p1 and Q6 -> q2
4688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q13, d21, d3          @(p2 + ((p0 + q0 + 1) >> 1) H
4698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmlsl.u8      q12, d4, d19          @(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) L
4708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmlsl.u8      q13, d5, d19          @(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) H
4718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vdup.8        q14, r2               @alpha in every byte lane
4728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcle.u8       q11, q14, q11         @ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
4738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vdup.i8       q14, r3               @beta in every byte lane (alpha is no longer needed)
4748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q15, q5, q4           @ABS(q1 - q0)
4758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqshrn.s16    d24, q12, #1          @deltap1 = ((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) L, narrowed to 8 bits
4768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqshrn.s16    d25 , q13, #1         @deltap1 = ((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) H, narrowed to 8 bits
4778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcge.u8       q15, q15, q14         @ABS(q1 - q0) >= Beta
4788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q13, q2, q3           @ABS(p1 - p0)
4798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmin.s8       q12, q12, q8          @min(deltap1 ,C0)
4808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vorr          q11, q11, q15         @ABS(q1 - q0) >= Beta ||ABS(p0 - q0) >= Alpha
4818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vneg.s8       q15, q8               @-C0
4828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcge.u8       q13, q13, q14         @ABS(p1 - p0) >= Beta
4838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmax.s8       q12, q12, q15         @max(deltap1,-C0), i.e. deltap1 clamped to [-C0, C0]
4848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vorr          q11, q11, q13         @ABS(p0 - q0) >= Alpha  || ABS(q1 - q0) >= Beta ||  ABS(p1 - p0) >= Beta)
4858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmovl.u16     q13, d18              @bS values widened to one 32-bit lane each (last read of d18 as bS)
4868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q9, d20, d12          @q2 + ((p0 + q0 + 1) >> 1) L
4878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vceq.u32      q13, q13, #0          @ui_bs == 0, as an all-ones mask over the 4 rows of each bS
4888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.u8      q9, q9, d10           @(q2 + ((p0 + q0 + 1) >> 1) - q1) L
4898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q10, d21, d13         @q2 + ((p0 + q0 + 1) >> 1) H
4908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.u8      q9, q9, d10           @(q2 + ((p0 + q0 + 1) >> 1) - 2*q1)L
4918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.u8      q10, q10, d11         @(q2 + ((p0 + q0 + 1) >> 1) - q1) H
4928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vorr          q13, q13, q11         @skip mask: (ui_bs == 0) || ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta
4938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.u8      q10, q10, d11         @(q2 + ((p0 + q0 + 1) >> 1) - 2*q1) H
4948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqshrn.s16    d18, q9, #1           @deltaq1 = ((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) L
4958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q11, q1, q3           @Ap = ABS(p2 - p0)
4968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqshrn.s16    d19, q10, #1          @deltaq1 = ((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) H
4978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q10, q6, q4           @Aq= ABS(q2 - q0)
4988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vclt.u8       q11, q11, q14         @Ap < Beta
4998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmin.s8       q9, q9, q8            @min(deltaq1,C0)
5008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vclt.u8       q10, q10, q14         @Aq <Beta (last use of beta in q14)
5018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubl.u8      q14, d8, d6           @(q0 - p0) L
5028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmax.s8       q9, q9, q15           @max(deltaq1,-C0); last use of -C0 in q15
5038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubl.u8      q15, d9, d7           @(q0 - p0) H
5048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s16      q14, q14, #2          @(q0 - p0)<<2 L
5058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.u8       q8, q8, q11           @C0 + (Ap < Beta): subtracting the all-ones mask adds 1
5068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s16      q15, q15, #2          @(q0 - p0) << 2) H
5078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q14, q14, d4          @((q0 - p0) << 2) + (p1  L
5088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q15, q15, d5          @((q0 - p0) << 2) + (p1 H
5098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.u8      q14, q14, d10         @((q0 - p0) << 2) + (p1 - q1) L
5108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.u8      q15, q15, d11         @((q0 - p0) << 2) + (p1 - q1) H
5118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbic          q11, q11, q13         @final condition for p1: (Ap < Beta) and not in the skip mask
5128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.s16    d28, q14, #3          @delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3); L
5138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.s16    d29, q15, #3          @delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) H
5148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.u8       q8, q8, q10           @C = C0 + (Ap < Beta) + (Aq < Beta)
5158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbic          q10, q10, q13         @final condition for q1: (Aq < Beta) and not in the skip mask
5168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabs.s8       q15, q14              @abs(delta)
5178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vand          q12, q12, q11         @deltap1, zeroed where p1 must stay unchanged
5188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vand          q9, q9, q10           @deltaq1, zeroed where q1 must stay unchanged
5198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmin.u8       q15, q15, q8          @min(abs(delta),C)
5208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.i8       q2, q2, q12           @p1+deltap1
5218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.i8       q5, q5, q9            @q1+deltaq1
5228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbic          q15, q15, q13         @zero abs(delta) in lanes covered by the skip mask
5238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcge.s8       q14, q14, #0          @mask of lanes where delta >= 0
5248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqsub.u8      q11, q3, q15          @p0 - abs(delta), saturated to 0..255
    @the transpose back to row order starts here, interleaved with the p0/q0 selection
5258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d0, d2                @row1 &2
5268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqadd.u8      q3, q3, q15           @p0 + abs(delta), saturated to 0..255
5278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d1, d3                @row9 &10
5288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqadd.u8      q12, q4, q15          @q0 + abs(delta), saturated to 0..255
5298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d12, d14              @row7 & 8
5308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqsub.u8      q4, q4, q15           @q0 - abs(delta), saturated to 0..255
5318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d13, d15              @row15 & 16
5328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbif          q3, q11, q14          @p0' = p0 + abs(delta) where delta >= 0, else p0 - abs(delta)
5338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbif          q4, q12, q14          @q0' = q0 - abs(delta) where delta >= 0, else q0 + abs(delta)
5348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d4, d6                @row3&row4
5358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d8, d10               @row5&6
5368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d5, d7                @row11 & 12
5378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d9, d11               @row13 &14
5388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d2, d6                @row2 & row4
5398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d10, d14              @row6 & row8
5408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d3, d7                @row10 & 12
5418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d11, d15              @row14 & row16
5428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d6, d14               @row4 & 8
5438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d7, d15               @row 12 & 16
5448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @now d6/d14 and d7/d15 hold rows 4, 8, 12 and 16 in row order again
5458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d0, d4                @row1 & 3
5468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d8, d12               @row 5 & 7
5478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d1, d5                @row9 & row11
5488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d9, d13               @row13 & row15
5498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub           r0, r0, r1, lsl#4     @rewind r0 by 16 rows, back to the first row loaded
5508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d0, d8                @row1 & row5
5518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d1, d9                @row9 & 13
5528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d2, d10               @row2 &6
5538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d3, d11               @row10&row14
5548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d4, d12               @row3 & 7
5558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d5, d13               @row11 & row15
5568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        {d0}, [r0], r1        @row1
5578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d2, [r0], r1          @row2
5588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d4, [r0], r1          @row3
5598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d6, [r0], r1          @row4
5608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d8, [r0], r1          @row5
5618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d10, [r0], r1         @row6
5628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d12, [r0], r1         @row7
5638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d14, [r0], r1         @row8
5648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d1, [r0], r1          @row9
5658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d3, [r0], r1          @row10
5668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d5, [r0], r1          @row11
5678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d7, [r0], r1          @row12
5688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d9, [r0], r1          @row13
5698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d11, [r0], r1         @row14
5708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d13, [r0], r1         @row15
5718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d15, [r0], r1         @row16
5728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vpop          {d8 - d15}            @restore the NEON registers saved on entry
5738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldmfd         sp!, {r12, pc}        @restore r12 and return by loading the saved lr into pc
5748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5777497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@**
5788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*******************************************************************************
5798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
5808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @brief
5818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*     Performs filtering of a luma block vertical edge when the
5828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*     boundary strength is set to 4
5838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
5848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @par Description:
5858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*    This operation is described in  Sec. 8.7.2.4 under the title
5868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*    "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
5878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
5888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] r0 - pu1_src
5898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Pointer to the src sample q0
5908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
5918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] r1 - src_strd
5928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Source stride
5938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
5948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] r2 - alpha
5958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Alpha Value for the boundary
5968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
5978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] r3 - beta
5988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Beta Value for the boundary
5998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
6008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @returns
6018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  None
6028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
6038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @remarks
6048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  None
6058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
6068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*******************************************************************************
6077497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@*
6088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    .global ih264_deblk_luma_vert_bs4_a9
6108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sih264_deblk_luma_vert_bs4_a9:
6128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    stmfd         sp!, {r12, lr}
6148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vpush         {d8 - d15}
6158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub           r0, r0, #4            @pointer uc_edgePixel-4
6168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @loading p3:p2:p1:p0:q0:q1:q2:q3 for every row
6178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d0, [r0], r1          @row1
6188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d2, [r0], r1          @row2
6198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d4, [r0], r1          @row3
6208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d6, [r0], r1          @row4
6218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d8, [r0], r1          @row5
6228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d10, [r0], r1         @row6
6238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d12, [r0], r1         @row7
6248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d14, [r0], r1         @row8
6258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d1, [r0], r1          @row9
6268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d3, [r0], r1          @row10
6278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d5, [r0], r1          @row11
6288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d7, [r0], r1          @row12
6298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d9, [r0], r1          @row13
6308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d11, [r0], r1         @row14
6318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d13, [r0], r1         @row15
6328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.8        d15, [r0], r1         @row16
6338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @taking two 8x8 transposes
6348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @2X2 transposes
6358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d0, d2                @row1 &2
6368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d4, d6                @row3&row4
6378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d8, d10               @row5&6
6388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d12, d14              @row7 & 8
6398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d1, d3                @row9 &10
6408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d5, d7                @row11 & 12
6418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d9, d11               @row13 &14
6428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d13, d15              @row15 & 16
6438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @4x4 transposes
6448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d2, d6                @row2 & row4
6458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d10, d14              @row6 & row8
6468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d3, d7                @row10 & 12
6478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d11, d15              @row14 & row16
6488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d6, d14               @row4 & 8
6498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d7, d15               @row 12 & 16
6508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @now Q3 ->p0 and Q7->q3
6518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d0, d4                @row1 & 3
6528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d8, d12               @row 5 & 7
6538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d1, d5                @row9 & row11
6548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d9, d13               @row13 & row15
6558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d0, d8                @row1 & row5
6568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d1, d9                @row9 & 13
6578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @now Q0->p3 & Q4->q0
6588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @starting processing as p0 and q0 are now ready
6598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @now Q1->p2 & Q5->q1
6608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vpush         {q7}                  @saving in stack
6618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d4, d12               @row3 & 7
6628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmov.i16      q14, #2
6638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d5, d13               @row11 & row15
6648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q8, d6, d8            @p0+q0 L
6658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d2, d10               @row2 &6
6668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q9, d7, d9            @p0+q0 H
6678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d3, d11               @row10&row14
6688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q10, q8, d4           @p0+q0+p1 L
6698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q11, q9, d5           @p0+q0+p1 H
6708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q12, d2, d10          @p2+q1 L
6718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q13, d3, d11          @p2+q1 H
6728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmla.u16      q12, q10, q14         @p2 + X2(p1) + X2(p0) + X2(q0) + q1 L
6738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmla.u16      q13, q11, q14         @p2 + X2(p1) + X2(p0) + X2(q0) + q1 H
6748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmov.i8       q14, #2
6758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q8, q10, d2           @p0+q0+p1+p2 L
6768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q9, q11, d3           @p0+q0+p1+p2 H
6778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vdup.i8       q15, r2               @duplicate alpha
6788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d20, q8, #2           @(p2 + p1 + p0 + q0 + 2) >> 2)L p1'
6798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d21, q9, #2           @(p2 + p1 + p0 + q0 + 2) >> 2)H p1'
6808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q11, q3, q4           @ABD(p0-q0)
6818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsra.u8       q14, q15, #2          @alpha >>2 +2
6828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q15, q1, q3           @Ap = ABD(p2-p0)
6838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d24, q12, #3          @((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) L p0'
6848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d25, q13, #3          @((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) H p0'
6858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vdup.i8       q13, r3               @beta
6868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcgt.u8       q14, q14, q11         @ABS(p0 - q0) <((Alpha >>2) + 2)
6878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q11, d6, d10          @p0+q1 L
6888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcgt.u8       q7, q13, q15          @beta>Ap
6898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q15, d7, d11          @p0+q1 H
6908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q11, q11, d4          @p0+q1+p1 L
6918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q15, q15, d5          @p0+q1+p1 H
6928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q11, q11, d4          @p0+q1+2*p1 L
6938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q15, q15, d5          @p0+q1+2*p1 H
6948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vand          q7, q7, q14           @(Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)
6958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d22, q11, #2          @((X2(p1) + p0 + q1 + 2) >> 2) L p0"
6968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d23, q15, #2          @((X2(p1) + p0 + q1 + 2) >> 2) H p0"
6978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q15, d2, d0           @p2+p3 L
6988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbif          q12, q11, q7          @p0' or p0 "
6998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q11, d3, d1           @p2+p3 H
7008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.u16      q15, q15, q15         @2*(p2+p3) L
7018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.u16      q11, q11, q11         @2*(p2+p3)H
7028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.u16      q8, q8, q15           @(X2(p3) + X3(p2) + p1 + p0 + q0) L
7038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.u16      q9, q9, q11           @(X2(p3) + X3(p2) + p1 + p0 + q0) H
7048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q15, q6, q4           @Aq = abs(q2-q0)
7058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q11, q5, q4           @ABS(Q1-Q0)
7068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d16, q8, #3           @((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); L p2'
7078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d17, q9, #3           @((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); H p2'
7088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q9, q2, q3            @ABS(p1-p0)
7098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcgt.u8       q15, q13, q15         @Aq < Beta
7108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcge.u8       q11, q11, q13         @ABS(q1 - q0) >= Beta
7118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcge.u8       q9, q9, q13           @ABS(p1 - p0) >= beta
7128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vdup.i8       q13, r2               @duplicate alpha
7138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vand          q15, q15, q14         @(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
7148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       q14, q3, q4           @abs(p0-q0)
7158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vorr          q11, q11, q9          @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta
7168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q9, d6, d8            @p0+q0 L
7178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcge.u8       q14, q14, q13         @ABS(p0 - q0) >= Alpha
7188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q13, d7, d9           @p0+q0 H
7198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q9, q9, d10           @p0+q0+q1 L
7208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vorr          q11, q11, q14         @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta||ABS(p0 - q0) >= Alpha
7218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q13, q13, d11         @p0+q0+q1 H
7228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbic          q7, q7, q11           @final condn for p's
7238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmov.i8       q14, #2
7248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbif          q3, q12, q11          @final p0
7258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbit          q1, q8, q7            @final p2
7268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbif          q10, q2, q7           @final p1
7278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q12, d8, d4           @q0+p1 L
7288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmlal.u8      q12, d10, d28         @X2(q1) + q0 + p1 L
7298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q8, d9, d5            @q0+p1 H
7308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmlal.u8      q8, d11, d28          @X2(q1) + q0 + p1 H
7318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmov.i16      q14, #2
7328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q7, d4, d12           @p1+q2 L
7338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmla.u16      q7, q9, q14           @p1 + X2(p0) + X2(q0) + X2(q1) + q2L
7348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q2, d5, d13           @p1+q2H
7358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmla.u16      q2, q13, q14          @p1 + X2(p0) + X2(q0) + X2(q1) + q2H
7368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d24, q12, #2          @(X2(q1) + q0 + p1 + 2) >> 2; L q0'
7378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d25, q8, #2           @(X2(q1) + q0 + p1 + 2) >> 2; H q0'
7388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q9, q9, d12           @p0 + q0 + q1 + q2 L
7398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q13, q13, d13         @p0 + q0 + q1 + q2 H
7408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d16, q7, #3           @(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 L qo"
7418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vpop          {q7}
7428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d17, q2, #3           @(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 H qo"
7438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d4, q9, #2            @p0 + q0 + q1 + q2 + 2)>>2 L q1'
7448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d5, q13, #2           @p0 + q0 + q1 + q2 + 2)>>2 H q1'
7458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbit          q12, q8, q15          @q0' or q0"
7468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbic          q15, q15, q11         @final condn for q's
7478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d0, d2                @row1 &2
7488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbit          q5, q2, q15           @final q1
7498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d1, d3                @row9 &10
7508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q8, d12, d14          @q2+q3 L
7518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d20, d6               @row3&row4
7528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q2, d13, d15          @q2+q3 H
7538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d21, d7               @row11 & 12
7548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmla.u16      q9, q8, q14           @X2(q3) + X3(q2) + q1 + q0 + p0 L
7558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d2, d6                @row2 & row4
7568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmla.u16      q13, q2, q14          @X2(q3) + X3(q2) + q1 + q0 + p0 H
7578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d3, d7                @row10 & 12
7588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbif          q4, q12, q11          @final q0
7598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d0, d20               @row1 & 3
7608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d18, q9, #3           @(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; L
7618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d1, d21               @row9 & row11
7628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshrn.u16    d19, q13, #3          @(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; H
7638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d8, d10               @row5&6
7648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbit          q6, q9, q15           @final q2
7658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d9, d11               @row13 &14
7668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d12, d14              @row7 & 8
7678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.8        d13, d15              @row15 & 16
7688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d10, d14              @row6 & row8
7698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d11, d15              @row14 & row16
7708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @now Q3 ->p0 and Q7->q3
7718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d8, d12               @row 5 & 7
7728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d9, d13               @row13 & row15
7738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub           r0, r0, r1, lsl#4     @restore pointer
7748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d6, d14               @row4 & 8
7758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d7, d15               @row 12 & 16
7768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d0, d8                @row1 & row5
7778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d1, d9                @row9 & 13
7788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d2, d10               @row2 &6
7798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d3, d11               @row10&row14
7808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d20, d12              @row3 & 7
7818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d21, d13              @row11 & row15
7828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d0, [r0], r1          @row1
7838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d2, [r0], r1          @row2
7848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d20, [r0], r1         @row3
7858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d6, [r0], r1          @row4
7868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d8, [r0], r1          @row5
7878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d10, [r0], r1         @row6
7888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d12, [r0], r1         @row7
7898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d14, [r0], r1         @row8
7908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d1, [r0], r1          @row9
7918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d3, [r0], r1          @row10
7928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d21, [r0], r1         @row11
7938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d7, [r0], r1          @row12
7948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d9, [r0], r1          @row13
7958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d11, [r0], r1         @row14
7968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d13, [r0], r1         @row15
7978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.8        d15, [r0], r1         @row16
7988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vpop          {d8 - d15}
7998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldmfd         sp!, {r12, pc}
8008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
@**
@*******************************************************************************
@*
@* @brief
@*     Filters a luma block vertical edge when the boundary strength is 4,
@*     processing 8 rows per call (MBAFF variant)
@*
@* @par Description:
@*    This operation is described in  Sec. 8.7.2.4 under the title
@*    "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
@*    The routine is intended to be called twice; presumably once for each
@*    8-row half of a 16-row edge.
@*
@* @param[in] r0 - pu1_src
@*  Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@*  Source stride
@*
@* @param[in] r2 - alpha
@*  Alpha Value for the boundary
@*
@* @param[in] r3 - beta
@*  Beta Value for the boundary
@*
@* @returns
@*  None
@*
@* @remarks
@*  After de-interleaving, each d register holds one sample column for the
@*  8 rows: d0=p3 d1=p2 d2=p1 d3=p0 d4=q0 d5=q1 d6=q2 d7=q3.
@*  q14 keeps the 16-bit constant 2 for the whole routine; d14 (part of q7,
@*  not of q14) is a separate byte-wide scratch register.
@*
@*******************************************************************************
@*

    .global ih264_deblk_luma_vert_bs4_mbaff_a9

ih264_deblk_luma_vert_bs4_mbaff_a9:

    @NOTE(review): only lr is pushed, so sp is 4-byte (not 8-byte) aligned
    @inside this leaf routine, unlike ih264_deblk_luma_vert_bs4_a9 which
    @pads with r12 - confirm this is intentional
    stmfd         sp!, {lr}

    sub           r0, r0, #4            @r0 -> p3 of row 1 (edge pixel - 4)
    vpush         {d8 - d15}            @d8-d15 are callee-saved
    @load [p3:p2],[p1:p0],[q0:q1],[q2:q3] as four 16-bit lanes per row;
    @rows 1-4 go to d0/d2/d4/d6, rows 5-8 to d1/d3/d5/d7
    vld4.16       {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
    vld4.16       {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
    vld4.16       {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
    vld4.16       {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
    vld4.16       {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
    vld4.16       {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
    vld4.16       {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
    vld4.16       {d1[3], d3[3], d5[3], d7[3]}, [r0], r1

    @split each byte pair into separate sample columns
    vuzp.8        d0, d1                @d0 = p3, d1 = p2
    vuzp.8        d2, d3                @d2 = p1, d3 = p0
    vuzp.8        d4, d5                @d4 = q0, d5 = q1
    vuzp.8        d6, d7                @d6 = q2, d7 = q3

    vmov.i16      q14, #2               @16-bit multiplier 2
    vaddl.u8      q4, d3, d4            @p0+q0
    vaddw.u8      q5, q4, d2            @p0+q0+p1
    vaddl.u8      q6, d1, d5            @p2+q1
    vmla.u16      q6, q5, q14           @p2 + 2*p1 + 2*p0 + 2*q0 + q1

    vmov.i8       d14, #2               @byte constant 2 (base of (alpha>>2)+2)
    vaddw.u8      q4, q5, d1            @p0+q0+p1+p2
    vdup.i8       d15, r2               @alpha in every byte
    vrshrn.u16    d10, q4, #2           @p1' = (p2 + p1 + p0 + q0 + 2) >> 2
    vabd.u8       d11, d3, d4           @|p0 - q0|
    vsra.u8       d14, d15, #2          @(alpha >> 2) + 2
    vabd.u8       d15, d1, d3           @Ap = |p2 - p0|
    vrshrn.u16    d12, q6, #3           @p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
    vdup.i8       d13, r3               @beta in every byte
    vcgt.u8       d14, d14, d11         @|p0 - q0| < (alpha >> 2) + 2
    vaddl.u8      q8, d3, d5            @p0+q1
    vcgt.u8       d26, d13, d15         @Ap < beta
    vaddw.u8      q8, q8, d2            @p0+q1+p1
    vaddw.u8      q8, q8, d2            @p0+q1+2*p1
    vand          d26, d26, d14         @Ap < beta && |p0 - q0| < (alpha >> 2) + 2
    vrshrn.u16    d11, q8, #2           @p0" = (2*p1 + p0 + q1 + 2) >> 2
    vbif          d12, d11, d26         @d12 = p0' where d26 set, else p0"
    vaddl.u8      q9, d1, d0            @p2+p3
    vadd.u16      q9, q9, q9            @2*(p2+p3)
    vadd.u16      q4, q4, q9            @2*p3 + 3*p2 + p1 + p0 + q0
    vabd.u8       d15, d6, d4           @Aq = |q2 - q0|
    vabd.u8       d11, d5, d4           @|q1 - q0|
    vrshrn.u16    d8, q4, #3            @p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
    vabd.u8       d9, d2, d3            @|p1 - p0|
    vcgt.u8       d15, d13, d15         @Aq < beta
    vcge.u8       d11, d11, d13         @|q1 - q0| >= beta
    vcge.u8       d9, d9, d13           @|p1 - p0| >= beta
    vdup.i8       d13, r2               @alpha in every byte (d13 reused)
    vand          d15, d15, d14         @Aq < beta && |p0 - q0| < (alpha >> 2) + 2
    vabd.u8       d14, d3, d4           @|p0 - q0|
    vorr          d11, d11, d9          @|p1 - p0| >= beta || |q1 - q0| >= beta
    vcge.u8       d14, d14, d13         @|p0 - q0| >= alpha
    vaddl.u8      q10, d3, d4           @p0+q0
    vorr          d11, d11, d14         @d11 = "do not filter" mask (any threshold test failed)
    vaddw.u8      q10, q10, d5          @p0+q0+q1
    vbic          d26, d26, d11         @final strong-filter condition for the p side
    vmov.i8       d14, #2               @byte constant 2 (multiplier below)
    vbif          d3, d12, d11          @final p0: filtered unless d11 set
    vbit          d1, d8, d26           @final p2: p2' where d26 set
    vbif          d10, d2, d26          @final p1 in d10: p1' where d26 set, else p1
    vaddl.u8      q6, d4, d2            @q0+p1
    vmlal.u8      q6, d5, d14           @2*q1 + q0 + p1

    vaddl.u8      q11, d2, d6           @p1+q2
    vmla.u16      q11, q10, q14         @p1 + 2*p0 + 2*q0 + 2*q1 + q2
    vrshrn.u16    d12, q6, #2           @q0' = (2*q1 + q0 + p1 + 2) >> 2
    vaddw.u8      q10, q10, d6          @p0 + q0 + q1 + q2
    vrshrn.u16    d8, q11, #3           @q0" = (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3

    vrshrn.u16    d2, q10, #2           @q1' = (p0 + q0 + q1 + q2 + 2) >> 2 (d2 reused; final p1 is in d10)
    vbit          d12, d8, d15          @d12 = q0" where d15 set, else q0'
    vbic          d15, d15, d11         @final strong-filter condition for the q side
    vbit          d5, d2, d15           @final q1
    vaddl.u8      q12, d6, d7           @q2+q3
    vmla.u16      q10, q12, q14         @2*q3 + 3*q2 + q1 + q0 + p0
    vbif          d4, d12, d11          @final q0: filtered unless d11 set
    vrshrn.u16    d9, q10, #3           @q2' = (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
    vbit          d6, d9, d15           @final q2
    vand          d2, d10, d10          @move final p1 back to d2; d0..d7 = p3,p2,p1,p0,q0,q1,q2,q3

    @re-interleave the columns into the byte pairs used by the load
    vzip.8        d0, d1                @d0,d1 -> [p3:p2]
    vzip.8        d2, d3                @d2,d3 -> [p1:p0]
    vzip.8        d4, d5                @d4,d5 -> [q0:q1]
    vzip.8        d6, d7                @d6,d7 -> [q2:q3]

    sub           r0, r0, r1, lsl#3     @rewind r0 by 8 rows, back to row 1

    @store [p3:p2],[p1:p0],[q0:q1],[q2:q3] for every row
    vst4.16       {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
    vst4.16       {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
    vst4.16       {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
    vst4.16       {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
    vst4.16       {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
    vst4.16       {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
    vst4.16       {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
    vst4.16       {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
    vpop          {d8 - d15}
    ldmfd         sp!, {pc}
9428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9457497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@**
9468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*******************************************************************************
9478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
9488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @brief
9498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*     Performs filtering of a luma block vertical edge for cases where the
9508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*     boundary strength is less than 4 on calling twice
9518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
9528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @par Description:
9538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*    This operation is described in Sec. 8.7.2.3 under the title
9548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*    "Filtering process for edges with bS less than 4" in ITU-T Rec. H.264.
9558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
9568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] r0 - pu1_src
9578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Pointer to the src sample q0
9588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
9598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] r1 - src_strd
9608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Source stride
9618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
9628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] r2 - alpha
9638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Alpha Value for the boundary
9648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
9658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] r3 - beta
9668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Beta Value for the boundary
9678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
9688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] sp(0) - u4_bs
9698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  Packed Boundary strength array
9708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
9718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @param[in] sp(4) - pu1_cliptab
9728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  tc0_table
9738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
9748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @returns
9758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  None
9768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
9778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@* @remarks
9788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*  None
9798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*
9808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*******************************************************************************
9817497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@*
9828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    .global ih264_deblk_luma_vert_bslt4_mbaff_a9
9848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sih264_deblk_luma_vert_bslt4_mbaff_a9:
    @ Filters 8 rows across a vertical luma edge; each row is 8 pixels
    @ p3 p2 p1 p0 | q0 q1 q2 q3 read from (pu1_src - 4).
    @ Register usage, as established by the code below:
    @   r0       - row pointer (pu1_src - 4), post-incremented by r1 per row
    @   r1       - src_strd
    @   r2, r3   - alpha, beta
    @   r12, r14 - u4_bs and pu1_cliptab, read from the caller's stack
    @   d0..d7   - p3, p2, p1, p0, q0, q1, q2, q3 (one byte per row)
    @   d8       - C0 per row, later C0 + (Ap < Beta) + (Aq < Beta)
    @   d13      - byte mask of rows that must be left unfiltered
9868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    stmfd         sp!, {r12, lr}        @push 2 words; stack arguments are now at sp+8 and sp+12
9888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub           r0, r0, #4            @r0 = pu1_src - 4, so each row load starts at p3
9908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldr           r12, [sp, #8]         @r12 = u4_bs (first stack argument)
9918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldr           r14, [sp, #12]        @r14 = pu1_cliptab (second stack argument)
9928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vpush         {d8 - d15}            @d8-d15 are used as scratch below; restored before return
9938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @load 8 rows: each lane load reads the halfwords [p3:p2],[p1:p0],[q0:q1],[q2:q3] of one row, then r0 += r1
9948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld4.16       {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
9958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld4.16       {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
9968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld4.16       {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
9978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld4.16       {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
9988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld4.16       {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
9998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld4.16       {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
10008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld4.16       {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
10018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld4.16       {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
10028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
    @de-interleave the byte pairs so every d register holds one pixel column for all 8 rows
10038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vuzp.8        d0, d1                @D0->p3, D1->p2
10048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vuzp.8        d2, d3                @D2->p1, D3->p0
10058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vuzp.8        d4, d5                @D4->q0, D5->q1
10068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vuzp.8        d6, d7                @D6->q2, D7->q3
10078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
    @build C0 per row: 4 bS bytes -> clip table lookup -> each value duplicated for 2 consecutive rows
10088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    rev           r12, r12              @byte-reverse u4_bs: its most significant byte becomes byte lane 0
10098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmov.32       d8[0], r12            @D8[0] = byte-reversed u4_bs (4 bS bytes)
10108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       d9[0], [r14]          @D9[0] = first 4 bytes of pu1_cliptab
10118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmovl.u8      q15, d8               @D30 = the 4 bS bytes, each zero-extended to 16 bits (D31 is reloaded below)
10128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtbl.8        d8, {d9}, d30         @even bytes = cliptab[bS], odd bytes = cliptab[0]
10138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsli.16       d8, d8, #8            @copy low byte over high byte of each halfword: D8 = C0 for every row
10148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrhadd.u8     d10, d3, d4           @((p0 + q0 + 1) >> 1)
10168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmov.i8       d31, #2               @constant 2, multiplier for (p1 << 1)
10178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       d11, d3, d4           @ABS(p0 - q0)
10188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q6, d10, d1           @p2 + ((p0 + q0 + 1) >> 1)
10198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmlsl.u8      q6, d2, d31           @(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1))
10208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vdup.8        d14, r2               @alpha
10218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcle.u8       d11, d14, d11         @ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
10228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vdup.i8       d14, r3               @beta
10238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       d15, d5, d4           @ABS(q1 - q0)
10248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqshrn.s16    d12, q6, #1           @((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1), saturating narrow to s8
10258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcge.u8       d15, d15, d14         @ABS(q1 - q0) >= Beta
10268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       d13, d2, d3           @ABS(p1 - p0)
10278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmin.s8       d12, d12, d8          @min(deltap1 ,C0)
10288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vorr          d11, d11, d15         @ABS(q1 - q0) >= Beta ||ABS(p0 - q0) >= Alpha
10298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vneg.s8       d15, d8               @-C0
10308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcge.u8       d13, d13, d14         @ABS(p1 - p0) >= Beta
10318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmax.s8       d12, d12, d15         @max(deltap1,-C0)
10328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vorr          d11, d11, d13         @ABS(p0 - q0) >= Alpha  || ABS(q1 - q0) >= Beta ||  ABS(p1 - p0) >= Beta)
10338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vceq.u16      d13, d30, #0          @ui_bs == 0 (16-bit compare, so the mask covers both rows of a pair)
10348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.u8      q14, d10, d6          @q2 + ((p0 + q0 + 1) >> 1)
10358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.u8      q14, q14, d5          @q2 + ((p0 + q0 + 1) >> 1) - q1
10368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.u8      q14, q14, d5          @q2 + ((p0 + q0 + 1) >> 1) - 2*q1
10378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vorr          d13, d13, d11         @(ABS(p0 - q0) >= Alpha  || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta))
10388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                        @|| (ui_bs == 0)  -> D13 = rows that are not filtered
10398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqshrn.s16    d9, q14, #1           @(q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1, saturating narrow to s8
10408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       d11, d1, d3           @Ap = ABS(p2 - p0)
10418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabd.u8       d10, d6, d4           @Aq= ABS(q2 - q0)
10428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vclt.u8       d11, d11, d14         @Ap < Beta
10438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmin.s8       d9, d9, d8            @min(deltaq1,C0)
10448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vclt.u8       d10, d10, d14         @Aq < Beta
10458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmax.s8       d9, d9, d15           @max(deltaq1,-C0)
10468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubl.u8      q7, d4, d3            @q0 - p0
10478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s16      q7, q7, #2            @(q0 - p0) << 2
10488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.u8       d8, d8, d11           @C0 + (Ap < Beta): a true mask byte is 0xFF (-1), so subtracting it adds 1
10498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q7, q7, d2            @((q0 - p0) << 2) + p1
10508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.u8      q7, q7, d5            @((q0 - p0) << 2) + (p1 - q1)
10518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbic          d11, d11, d13         @final condition for p1: (Ap < Beta) and row is filtered
10528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshr.s16     q15, q7, #3           @delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3
10538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.u8       d8, d8, d10           @C = C0 + (Ap < Beta) + (Aq < Beta)
10548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbic          d10, d10, d13         @final condition for q1: (Aq < Beta) and row is filtered
10558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vabs.s16      q14, q15              @abs(delta), 16 bit
10568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmovn.i16     d15, q14              @abs(delta)
10578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vand          d12, d12, d11         @deltap1, zeroed where p1 must not change
10588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vand          d9, d9, d10           @deltaq1, zeroed where q1 must not change
10598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmin.u8       d15, d15, d8          @min(abs(delta), C)
10608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.i8       d2, d2, d12           @p1+deltap1
10618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.i8       d5, d5, d9            @q1+deltaq1
10628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbic          d15, d15, d13         @abs(delta) of pixels to be changed only
10638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vcge.s16      q14, q15, #0          @delta >= 0, 16 bit mask
10648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmovn.i16     d14, q14              @sign mask: all ones where delta >= 0
10658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqsub.u8      d11, d3, d15          @clip(p0 - abs(delta))
10668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqadd.u8      d3, d3, d15           @clip(p0 + abs(delta))
10678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqadd.u8      d12, d4, d15          @clip(q0 + abs(delta))
10688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqsub.u8      d4, d4, d15           @clip(q0 - abs(delta))
10698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbif          d3, d11, d14          @p0 = (delta >= 0) ? p0 + abs(delta) : p0 - abs(delta)
10708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbif          d4, d12, d14          @q0 = (delta >= 0) ? q0 - abs(delta) : q0 + abs(delta)
10718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub           r0, r0, r1, lsl#3     @r0 -= 8 * src_strd: back to the first row
10738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                        @D0->p3, D1->p2, D2->p1, D3->p0, D4->q0, D5->q1, D6->q2, D7->q3
10748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vzip.8        d0, d1                @D0,D1 -> [p3:p2]
10758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vzip.8        d2, d3                @D2,D3 -> [p1:p0]
10768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vzip.8        d4, d5                @D4,D5 -> [q0:q1]
10778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vzip.8        d6, d7                @D6,D7 -> [q2:q3]
10788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    @store 8 rows: each lane store writes [p3:p2],[p1:p0],[q0:q1],[q2:q3] of one row, then r0 += r1
10808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst4.16       {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
10818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst4.16       {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
10828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst4.16       {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
10838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst4.16       {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
10848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst4.16       {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
10858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst4.16       {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
10868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst4.16       {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
10878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst4.16       {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
10888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vpop          {d8 - d15}            @restore the NEON registers saved on entry
10898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldmfd         sp!, {r12, pc}        @restore r12 and return (saved lr is popped into pc)
10908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1093