1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///*****************************************************************************/
21///*                                                                           */
22///*  File Name         : ih264_deblk_luma_av8.s                               */
23///*                                                                           */
24///*  Description       : Contains function definitions for deblocking luma    */
25///*                      edge. Functions are coded in NEON assembly and can   */
26///*                      be compiled using ARM RVDS.                          */
27///*                                                                           */
28///*  List of Functions : ih264_deblk_luma_vert_bs4_av8()                      */
29///*                      ih264_deblk_luma_vert_bslt4_av8()                    */
30///*                      ih264_deblk_luma_horz_bs4_av8()                      */
31///*                      ih264_deblk_luma_horz_bslt4_av8()                    */
32///*                                                                           */
33///*  Issues / Problems : None                                                 */
34///*                                                                           */
35///*  Revision History  :                                                      */
36///*                                                                           */
37///*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
38///*         28 11 2013   Ittiam          Draft                                */
39///*                                                                           */
40///*****************************************************************************/
41
42
43.text
44.p2align 2
45.include "ih264_neon_macros.s"
46
47
48
///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a luma block horizontal edge for cases where the
//*     boundary strength is less than 4
//*
//* @par Description:
//*       This operation is described in Sec. 8.7.2.3 under the title
//*       "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
//*       One call filters 16 pixels along the edge: rows p2..q2 are read
//*       (16 bytes each) and rows p1, p0, q0, q1 are written back.
//*
//* @param[in] x0 - pu1_src
//*  Pointer to the src sample q0
//*
//* @param[in] w1 - src_strd
//*  Source stride. Sign-extended to 64 bits on entry (presumably a 32-bit
//*  integer in the C prototype, which is not visible in this file).
//*
//* @param[in] w2 - alpha
//*  Alpha Value for the boundary (replicated as a byte)
//*
//* @param[in] w3 - beta
//*  Beta Value for the boundary (replicated as a byte)
//*
//* @param[in] w4 - u4_bs
//*    Packed Boundary strength array: one byte per group of 4 pixels. The
//*    word is byte-reversed before use, so the most significant byte applies
//*    to the first 4 pixels.
//*
//* @param[in] x5 - pu1_cliptab
//*    tc0_table; 4 bytes are read and indexed by the per-group bS value
//*
//* @returns
//*  None
//*
//* @remarks
//*  Modifies x0, x1, x4, x6, x7 and NEON registers. v8-v15 are used as
//*  scratch; push_v_regs/pop_v_regs (ih264_neon_macros.s) are presumably
//*  what preserve their callee-saved halves - confirm against the macro file.
//*  No callee-saved general purpose register is touched.
//*
//*******************************************************************************
//*/

    .global ih264_deblk_luma_horz_bslt4_av8

ih264_deblk_luma_horz_bslt4_av8:

    push_v_regs
    // AAPCS64 leaves bits [63:32] of a 32-bit integer argument unspecified;
    // x1 is used below as a 64-bit address offset, so sign-extend it first.
    sxtw      x1, w1

    // ---- Load rows p2, p1, p0, q0, q1, q2 and expand bS / C0 ----
    sub       x0, x0, x1, lsl #1        //x0 = pu1_src - 2*src_strd
    sub       x0, x0, x1                //x0 = pu1_src - 3*src_strd (row p2)
    rev       w4, w4                    //byte-reverse the packed bS word
    ld1       {v10.8b, v11.8b}, [x0], x1 //v10/v11 = p2 low/high 8 pixels, x0 -> p1
    mov       v12.s[0], w4              //v12.s[0] = reversed bS bytes
    mov       x6, x0                    //x6 = pointer to row p1 (for write back)
    ld1       {v8.8b, v9.8b}, [x0], x1  //v8/v9 = p1 low/high, x0 -> p0
    mov       x7, x0                    //x7 = pointer to row p0 (for write back)
    ld1       {v6.8b, v7.8b}, [x0], x1  //v6/v7 = p0 low/high, x0 -> q0
    uxtl      v12.8h, v12.8b            //each bS byte widened to 16 bits
    ld1       {v0.8b, v1.8b}, [x0], x1  //v0/v1 = q0 low/high, x0 -> q1
    mov       v10.d[1], v11.d[0]        //v10 = all 16 pixels of p2
    mov       v8.d[1], v9.d[0]          //v8  = all 16 pixels of p1
    mov       v6.d[1], v7.d[0]          //v6  = all 16 pixels of p0
    uabd      v26.16b, v8.16b, v6.16b   //v26 = ABS(p1 - p0)
    ld1       {v2.8b, v3.8b}, [x0], x1  //v2/v3 = q1 low/high, x0 -> q2
    mov       v0.d[1], v1.d[0]          //v0 = all 16 pixels of q0
    mov       v2.d[1], v3.d[0]          //v2 = all 16 pixels of q1
    uabd      v22.16b, v6.16b, v0.16b   //v22 = ABS(p0 - q0)
    ld1       {v16.s}[0], [x5]          //v16.s[0] = first 4 bytes of cliptab
    uabd      v24.16b, v2.16b, v0.16b   //v24 = ABS(q1 - q0)
    ld1       {v4.8b, v5.8b}, [x0], x1  //v4/v5 = q2 low/high
    tbl       v14.8b, {v16.16b}, v12.8b //table lookup: cliptab[bS] per group
    mov       v4.d[1], v5.d[0]          //v4 = all 16 pixels of q2
    dup       v20.16b, w2               //v20 = alpha in every byte
    dup       v16.16b, w3               //v16 = beta in every byte
    uxtl      v12.4s, v12.4h            //bS widened to one 32-bit lane per group
    uxtl      v14.4s, v14.4h            //looked-up C0 widened to 32-bit lanes
    uabd      v28.16b, v10.16b, v6.16b  //v28 = Ap = ABS(p2 - p0)
    uabd      v30.16b, v4.16b, v0.16b   //v30 = Aq = ABS(q2 - q0)
    cmgt      v12.4s, v12.4s, #0        //v12 = all-ones in groups with bS > 0
    sli       v14.4s, v14.4s, #8        //start replicating the low C0 byte
    cmhs      v18.16b, v22.16b, v20.16b //v18 = ABS(p0 - q0) >= Alpha
    cmhs      v24.16b, v24.16b, v16.16b //v24 = ABS(q1 - q0) >= Beta
    cmhs      v26.16b, v26.16b, v16.16b //v26 = ABS(p1 - p0) >= Beta
    cmhi      v20.16b, v16.16b , v28.16b //v20 = (Ap < Beta) mask, -1 where true
    cmhi      v22.16b, v16.16b , v30.16b //v22 = (Aq < Beta) mask, -1 where true
    sli       v14.4s, v14.4s, #16       //v14 = C0 in every byte of its group

    // ---- Filter decision and i_macro = ((q0 - p0)<<2 + (p1 - q1) + 4) >> 3 ----
    orr       v18.16b, v18.16b , v24.16b //v18 = (ABS(p0-q0) >= Alpha) | (ABS(q1-q0) >= Beta)
    usubl     v30.8h, v1.8b, v7.8b      //v30 = (q0 - p0) high
    usubl     v24.8h, v0.8b, v6.8b      //v24 = (q0 - p0) low
    orr       v18.16b, v18.16b , v26.16b //v18 |= (ABS(p1 - p0) >= Beta)
    usubl     v28.8h, v8.8b, v2.8b      //v28 = (p1 - q1) low
    shl       v26.8h, v30.8h, #2        //v26 = (q0 - p0)<<2 high
    shl       v24.8h, v24.8h, #2        //v24 = (q0 - p0)<<2 low
    usubl     v30.8h, v9.8b, v3.8b      //v30 = (p1 - q1) high
    bic       v12.16b, v12.16b , v18.16b //v12 = final mask: bS > 0 and no threshold exceeded
    add       v24.8h, v24.8h , v28.8h   //low : ((q0 - p0)<<2) + (p1 - q1)
    add       v26.8h, v26.8h , v30.8h   //high: ((q0 - p0)<<2) + (p1 - q1)
    sub       v18.16b, v14.16b , v20.16b //v18 = C0 + (Ap < Beta)  (mask is -1)
    urhadd    v16.16b, v6.16b , v0.16b  //v16 = (p0 + q0 + 1) >> 1
    mov       v17.d[0], v16.d[1]        //v17 = high 8 bytes of the average
    sqrshrn   v24.8b, v24.8h, #3        //i_macro low, rounded and saturated to s8
    sqrshrn   v25.8b, v26.8h, #3        //i_macro high
    mov       v24.d[1], v25.d[0]        //v24 = i_macro for all 16 pixels
    sub       v18.16b, v18.16b , v22.16b //v18 = C = C0 + (Ap < Beta) + (Aq < Beta)
    and       v20.16b, v20.16b , v12.16b //p1 is modified only where the edge is filtered
    and       v22.16b, v22.16b , v12.16b //q1 is modified only where the edge is filtered
    abs       v26.16b, v24.16b          //v26 = ABS(i_macro)

    // ---- p1 / q1 deltas: (x2 + ((p0+q0+1)>>1) - (x1<<1)) >> 1, clipped to +/-C0 ----
    uaddl     v28.8h, v17.8b, v11.8b    //p2 + avg, high
    uaddl     v10.8h, v16.8b, v10.8b    //p2 + avg, low
    uaddl     v30.8h, v17.8b, v5.8b     //q2 + avg, high
    umin      v18.16b, v26.16b , v18.16b //v18 = delta = min(ABS(i_macro), C)
    ushll     v26.8h, v9.8b, #1         //p1<<1, high
    uaddl     v4.8h, v16.8b, v4.8b      //q2 + avg, low
    ushll     v16.8h, v8.8b, #1         //p1<<1, low
    and       v18.16b, v18.16b , v12.16b //delta = 0 where the edge is not filtered
    sub       v28.8h, v28.8h , v26.8h   //(p2 + avg) - (p1<<1), high
    sub       v10.8h, v10.8h , v16.8h   //(p2 + avg) - (p1<<1), low
    ushll     v16.8h, v2.8b, #1         //q1<<1, low
    ushll     v26.8h, v3.8b, #1         //q1<<1, high
    sqshrn    v29.8b, v28.8h, #1        //i_macro_p1 high
    sqshrn    v28.8b, v10.8h, #1        //i_macro_p1 low
    mov       v28.d[1], v29.d[0]        //v28 = i_macro_p1 for all 16 pixels
    sub       v4.8h, v4.8h , v16.8h     //(q2 + avg) - (q1<<1), low
    sub       v30.8h, v30.8h , v26.8h   //(q2 + avg) - (q1<<1), high
    neg       v26.16b, v14.16b          //v26 = -C0
    smin      v28.16b, v28.16b , v14.16b //min(C0, i_macro_p1)
    cmge      v24.16b, v24.16b, #0      //v24 = (i_macro >= 0) mask
    sqshrn    v31.8b, v30.8h, #1        //i_macro_q1 high
    sqshrn    v30.8b, v4.8h, #1         //i_macro_q1 low
    mov       v30.d[1], v31.d[0]        //v30 = i_macro_q1 for all 16 pixels
    smax      v28.16b, v28.16b , v26.16b //max(-C0, min(C0, i_macro_p1))

    // ---- Apply deltas and write back p1, p0, q0, q1 ----
    uqadd     v16.16b, v6.16b , v18.16b //p0 + delta (saturating)
    uqsub     v6.16b, v6.16b , v18.16b  //p0 - delta (saturating)
    smin      v30.16b, v30.16b , v14.16b //min(C0, i_macro_q1)
    and       v28.16b, v20.16b , v28.16b //keep p1 delta only where Ap < Beta and filtered
    uqadd     v14.16b, v0.16b , v18.16b //q0 + delta (saturating)
    uqsub     v0.16b, v0.16b , v18.16b  //q0 - delta (saturating)
    smax      v30.16b, v30.16b , v26.16b //max(-C0, min(C0, i_macro_q1))
    bif       v16.16b, v6.16b , v24.16b //v16 = (i_macro >= 0) ? (p0+delta) : (p0-delta)
    bif       v0.16b, v14.16b , v24.16b //v0  = (i_macro >= 0) ? (q0-delta) : (q0+delta)
    add       v28.16b, v28.16b , v8.16b //v28 = p1 + clipped p1 delta
    and       v30.16b, v22.16b , v30.16b //keep q1 delta only where Aq < Beta and filtered
    st1       {v16.16b}, [x7], x1       //write back filtered p0, x7 -> q0
    add       v30.16b, v30.16b , v2.16b //v30 = q1 + clipped q1 delta
    st1       {v0.16b}, [x7], x1        //write back filtered q0, x7 -> q1
    st1       {v28.16b}, [x6]           //write back filtered p1
    st1       {v30.16b}, [x7], x1       //write back filtered q1

    pop_v_regs
    ret
200
201
202
///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a luma block horizontal edge when the
//*     boundary strength is set to 4
//*
//* @par Description:
//*       This operation is described in  Sec. 8.7.2.4 under the title
//*       "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
//*       One call filters 16 pixels along the edge: rows p3..q3 are read
//*       (16 bytes each) and rows p2, p1, p0, q0, q1, q2 are written back.
//*
//* @param[in] x0 - pu1_src
//*  Pointer to the src sample q0
//*
//* @param[in] w1 - src_strd
//*  Source stride. Sign-extended to 64 bits on entry (presumably a 32-bit
//*  integer in the C prototype, which is not visible in this file).
//*
//* @param[in] w2 - alpha
//*  Alpha Value for the boundary (replicated as a byte)
//*
//* @param[in] w3 - beta
//*  Beta Value for the boundary (replicated as a byte)
//*
//* @returns
//*  None
//*
//* @remarks
//*  Modifies x0-x3, x12, x14 and NEON registers; x2/x3 are reused as row
//*  pointers once alpha/beta have been replicated. v8-v15 are used as
//*  scratch; push_v_regs/pop_v_regs (ih264_neon_macros.s) are presumably
//*  what preserve their callee-saved halves - confirm against the macro file.
//*  No callee-saved general purpose register is touched.
//*
//*******************************************************************************
//*/

    .global ih264_deblk_luma_horz_bs4_av8

ih264_deblk_luma_horz_bs4_av8:

    push_v_regs
    // AAPCS64 leaves bits [63:32] of a 32-bit integer argument unspecified;
    // x1 is used below as a 64-bit address offset, so sign-extend it first.
    sxtw      x1, w1

    // Init
    dup       v0.16b, w2                //v0 = alpha in every byte
    sub       x12, x0, x1               //x12 = pointer to p0 = q0 - src_strd
    dup       v2.16b, w3                //v2 = beta in every byte
    sub       x14, x0, x1, lsl#1        //x14 = pointer to p1 = q0 - src_strd*2
    sub       x2, x0, x1, lsl#2         //x2  = pointer to p3 = q0 - src_strd*4
    sub       x3, x14, x1               //x3  = pointer to p2 = p1 - src_strd

    // Load Data
    ld1       {v4.8b, v5.8b}, [x0], x1  //v4/v5 = q0 low/high, x0 -> q1
    ld1       {v6.8b, v7.8b}, [x12]     //v6/v7 = p0 low/high
    ld1       {v8.8b, v9.8b}, [x0], x1  //v8/v9 = q1 low/high, x0 -> q2
    ld1       {v10.8b, v11.8b}, [x14]   //v10/v11 = p1 low/high
    mov       v4.d[1] , v5.d[0]         //v4  = all 16 pixels of q0
    mov       v6.d[1] , v7.d[0]         //v6  = all 16 pixels of p0
    mov       v8.d[1] , v9.d[0]         //v8  = all 16 pixels of q1
    mov       v10.d[1] , v11.d[0]       //v10 = all 16 pixels of p1

    // Filter Decision
    uabd      v12.16b  , v4.16b, v6.16b //v12 = ABS(p0 - q0)
    uabd      v14.16b  , v8.16b, v4.16b //v14 = ABS(q1 - q0)
    uabd      v16.16b  , v10.16b, v6.16b //v16 = ABS(p1 - p0)
    cmhs      v18.16b, v12.16b , v0.16b //ABS(p0 - q0) >= Alpha
    cmhs      v14.16b, v14.16b , v2.16b //ABS(q1 - q0) >= Beta
    cmhs      v16.16b, v16.16b , v2.16b //ABS(p1 - p0) >= Beta
    movi      v20.16b, #2
    orr       v18.16b, v18.16b , v14.16b //ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta
    ld1       {v14.8b, v15.8b}, [x0], x1 //v14/v15 = q2 low/high, x0 -> q3
    mov       v14.d[1] , v15.d[0]       //v14 = all 16 pixels of q2
    orr       v18.16b, v18.16b , v16.16b //v18 = "do not filter" mask (any threshold exceeded)
    usra      v20.16b, v0.16b, #2       //v20 = (Alpha >> 2) + 2
    uabd      v22.16b  , v14.16b, v4.16b //v22 = Aq = ABS(q2 - q0)
    uaddl     v24.8h, v4.8b, v6.8b      //p0+q0 L
    uaddl     v26.8h, v5.8b, v7.8b      //p0+q0 H
    cmhi      v22.16b, v2.16b , v22.16b //Aq < Beta
    cmhi      v20.16b, v20.16b , v12.16b //(ABS(p0 - q0) <((Alpha >>2) + 2))

    // Deblock Filtering q0', q1', q2'
    uaddw     v28.8h, v24.8h , v8.8b    //p0+q0+q1 L
    uaddw     v30.8h, v26.8h , v9.8b    //p0+q0+q1 H
    and       v22.16b, v22.16b , v20.16b //(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
    // q0' if (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) TRUE
    add       v16.8h, v28.8h , v28.8h   //2*(p0+q0+q1)L
    add       v0.8h, v30.8h , v30.8h    //2*(p0+q0+q1)H (alpha in v0 no longer needed)
    uaddw     v16.8h, v16.8h , v14.8b   //2*(p0+q0+q1)+q2 L
    uaddw     v0.8h, v0.8h , v15.8b     //2*(p0+q0+q1)+q2 H
    uaddw     v16.8h, v16.8h , v10.8b   //2*(p0+q0+q1)+q2 +p1 L
    uaddw     v0.8h, v0.8h , v11.8b     //2*(p0+q0+q1)+q2 +p1 H
    rshrn     v12.8b, v16.8h, #3        //(2*(p0+q0+q1)+q2 +p1 +4)>> 3 L [q0']
    rshrn     v13.8b, v0.8h, #3         //(2*(p0+q0+q1)+q2 +p1 +4)>> 3 H [q0']
    mov       v12.d[1] , v13.d[0]       //v12 = q0' for all 16 pixels
    // q0" if (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) FALSE
    uaddl     v16.8h, v8.8b, v8.8b      //2*q1 L
    uaddl     v0.8h, v9.8b, v9.8b       //2*q1 H
    uaddw     v16.8h, v16.8h , v4.8b    //2*q1+q0 L
    uaddw     v0.8h, v0.8h , v5.8b      //2*q1+q0 H
    uaddw     v16.8h, v16.8h , v10.8b   //2*q1+q0+p1  L
    uaddw     v0.8h, v0.8h , v11.8b     //2*q1+q0+p1 H
    rshrn     v16.8b, v16.8h, #2        //(2*q1+q0+p1+2)>>2 L [q0"]
    rshrn     v17.8b, v0.8h, #2         //(2*q1+q0+p1+2)>>2 H [q0"]
    mov       v16.d[1] , v17.d[0]       //v16 = q0" for all 16 pixels
    uaddw     v28.8h, v28.8h , v14.8b   //p0+q0+q1+q2 L
    uaddw     v30.8h, v30.8h , v15.8b   //p0+q0+q1+q2 H
    ld1       {v0.8b, v1.8b}, [x0], x1  //v0/v1 = q3 low/high, x0 = q0 + 4*src_strd
    mov       v0.d[1] , v1.d[0]         //v0 = all 16 pixels of q3
    bit       v16.16b, v12.16b , v22.16b //take q0' where the strong-filter condition holds, else q0"
    sub       x0, x0, x1, lsl #2        //x0 back to row q0
    bic       v22.16b, v22.16b , v18.16b //strong q-side mask restricted to pixels that are filtered:
                                        // !(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)
                                        // && (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
    rshrn     v12.8b, v28.8h, #2        //(p0+q0+q1+q2+2)>>2 L [q1']
    rshrn     v13.8b, v30.8h, #2        //(p0+q0+q1+q2+2)>>2 H [q1']
    mov       v12.d[1] , v13.d[0]       //v12 = q1' for all 16 pixels
    bif       v4.16b, v16.16b , v18.16b //keep q0 where filtering is off, else filtered q0
    mov       v5.d[0] , v4.d[1]         //v5 = high half of the q0 result for the store
    uaddl     v16.8h, v14.8b, v0.8b     //q2+q3,L
    uaddl     v0.8h, v15.8b, v1.8b      //q2+q3,H
    add       v28.8h, v28.8h , v16.8h   //p0+q0+q1+2*q2+q3 L
    st1       {v4.8b, v5.8b}, [x0], x1  //store q0, x0 -> q1
    add       v30.8h, v30.8h , v0.8h    //p0+q0+q1+2*q2+q3 H
    add       v28.8h, v28.8h , v16.8h   //p0+q0+q1+3*q2+2*q3 L
    add       v30.8h, v30.8h , v0.8h    //p0+q0+q1+3*q2+2*q3 H
    rshrn     v0.8b, v28.8h, #3         //(p0+q0+q1+3*q2+2*q3+4)>>3 L [q2']
    rshrn     v1.8b, v30.8h, #3         //(p0+q0+q1+3*q2+2*q3+4)>>3 H [q2']
    mov       v0.d[1] , v1.d[0]         //v0 = q2' for all 16 pixels

    // Deblock Filtering p0', p1', p2'
    ld1       {v30.8b, v31.8b}, [x3]    //v30/v31 = p2 low/high
    mov       v30.d[1] , v31.d[0]       //v30 = all 16 pixels of p2
    bif       v12.16b, v8.16b , v22.16b //keep q1 unless the strong q-side mask is set
    mov       v13.d[0] , v12.d[1]       //v13 = high half of the q1 result for the store
    uabd      v16.16b  , v30.16b, v6.16b //v16 = Ap = ABS(p2 - p0)
    uaddw     v24.8h, v24.8h , v10.8b   //p0+q0+p1 L
    bif       v0.16b, v14.16b , v22.16b //keep q2 unless the strong q-side mask is set
    mov       v1.d[0] , v0.d[1]         //v1 = high half of the q2 result for the store
    uaddw     v26.8h, v26.8h , v11.8b   //p0+q0+p1 H
    st1       {v12.8b, v13.8b}, [x0], x1 //store q1, x0 -> q2
    cmhi      v16.16b, v2.16b , v16.16b //Ap < Beta
    add       v28.8h, v24.8h , v24.8h   //2*(p0+q0+p1) L
    add       v4.8h, v26.8h , v26.8h    //2*(p0+q0+p1) H
    st1       {v0.8b, v1.8b}, [x0], x1  //store q2
    and       v20.16b, v20.16b , v16.16b //((Ap < Beta) && (ABS(p0 - q0) <((Alpha >>2) + 2)))
    uaddw     v28.8h, v28.8h , v30.8b   //2*(p0+q0+p1)+p2 L
    uaddw     v4.8h, v4.8h , v31.8b     //2*(p0+q0+p1)+p2 H
    uaddw     v28.8h, v28.8h , v8.8b    //2*(p0+q0+p1)+p2+q1 L
    uaddw     v4.8h, v4.8h , v9.8b      //2*(p0+q0+p1)+p2+q1 H
    rshrn     v28.8b, v28.8h, #3        //(2*(p0+q0+p1)+p2+q1+4)>>3  L,p0'
    rshrn     v29.8b, v4.8h, #3         //(2*(p0+q0+p1)+p2+q1+4)>>3  H,p0'
    mov       v28.d[1] , v29.d[0]       //v28 = p0' for all 16 pixels
    movi      v0.8b, #2                 //byte constant 2 for the umlal below
    movi      v1.4h, #2                 //halfword constant 2 for the mla-by-element below
    uaddl     v2.8h, v6.8b, v8.8b       //p0+q1      L (beta in v2 no longer needed)
    umlal     v2.8h, v10.8b, v0.8b      //2*p1+p0+q1 L
    uaddl     v16.8h, v7.8b, v9.8b      //p0+q1  H
    umlal     v16.8h, v11.8b, v0.8b     //2*p1+p0+q1 H
    uaddw     v12.8h, v24.8h , v30.8b   //(p0+q0+p1) +p2 L
    ld1       {v24.8b, v25.8b}, [x2]    //v24/v25 = p3 low/high
    mov       v24.d[1] , v25.d[0]       //v24 = all 16 pixels of p3
    uaddw     v4.8h, v26.8h , v31.8b    //(p0+q0+p1) +p2 H
    uaddl     v8.8h, v30.8b, v24.8b     //p2+p3 L (q1 in v8 no longer needed)
    rshrn     v26.8b, v12.8h, #2        //((p0+q0+p1)+p2 +2)>>2,p1' L
    rshrn     v2.8b, v2.8h, #2          //(2*p1+p0+q1+2)>>2,p0"L
    rshrn     v27.8b, v4.8h, #2         //((p0+q0+p1)+p2 +2)>>2,p1' H
    rshrn     v3.8b, v16.8h, #2         //(2*p1+p0+q1+2)>>2,p0" H
    mov       v26.d[1] , v27.d[0]       //v26 = p1' for all 16 pixels
    mov       v2.d[1] , v3.d[0]         //v2  = p0" for all 16 pixels
    uaddl     v16.8h, v31.8b, v25.8b    //p2+p3 H
    mla       v12.8h, v8.8h , v1.h[0]   //(p0+q0+p1)+3*p2+2*p3 L
    mla       v4.8h, v16.8h , v1.h[0]   //(p0+q0+p1)+3*p2+2*p3 H
    bic       v16.16b, v20.16b , v18.16b //strong p-side mask restricted to pixels that are filtered:
                                        // !(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)
                                        // && (Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
    bit       v2.16b, v28.16b , v20.16b //take p0' where the strong-filter condition holds, else p0"
    rshrn     v12.8b, v12.8h, #3        //((p0+q0+p1)+3*p2+2*p3+4)>>3 L p2'
    rshrn     v13.8b, v4.8h, #3         //((p0+q0+p1)+3*p2+2*p3+4)>>3 H p2'
    mov       v12.d[1] , v13.d[0]       //v12 = p2' for all 16 pixels
    bif       v6.16b, v2.16b , v18.16b  //keep p0 where filtering is off, else filtered p0
    bit       v10.16b, v26.16b , v16.16b //p1' where the strong p-side mask is set, else p1
    bit       v30.16b, v12.16b , v16.16b //p2' where the strong p-side mask is set, else p2
    st1       {v6.16b}, [x12]           //store p0
    st1       {v10.16b}, [x14]          //store p1
    st1       {v30.16b}, [x3]           //store p2

    pop_v_regs
    ret
387
388
389
390///**
391//*******************************************************************************
392//*
393//* @brief
394//*     Performs filtering of a luma block vertical edge for cases where the
395//*     boundary strength is less than 4
396//*
397//* @par Description:
//*       This operation is described in Sec. 8.7.2.3 under the title
//*       "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
400//*
401//* @param[in] x0 - pu1_src
402//*  Pointer to the src sample q0
403//*
404//* @param[in] x1 - src_strd
405//*  Source stride
406//*
407//* @param[in] x2 - alpha
408//*  Alpha Value for the boundary
409//*
410//* @param[in] x3 - beta
411//*  Beta Value for the boundary
412//*
//* @param[in] x4 - u4_bs
//*    Packed Boundary strength array
//*
//* @param[in] x5 - pu1_cliptab
//*    tc0_table
418//*
419//* @returns
420//*  None
421//*
422//* @remarks
423//*  None
424//*
425//*******************************************************************************
426//*/
427
428    .global ih264_deblk_luma_vert_bslt4_av8
429
430ih264_deblk_luma_vert_bslt4_av8:
431
432    // STMFD sp!,{x12,x14}
433    push_v_regs
434    stp       x19, x20, [sp, #-16]!
435
436    sub       x0, x0, #4                //pointer uc_edgePixel-4
437    mov       x12, x4
438    mov       x14, x5
439    mov       x17, x0
440    //loading p3:p2:p1:p0:q0:q1:q2:q3 for every row
441    ld1       {v0.8b}, [x0], x1         //row1
442    ld1       {v2.8b}, [x0], x1         //row2
443    ld1       {v4.8b}, [x0], x1         //row3
444    rev       w12, w12                  //reversing ui_bs
445    ld1       {v6.8b}, [x0], x1         //row4
446    mov       v18.s[0], w12             //d12[0] = ui_Bs
447    ld1       {v16.s}[0], [x14]         //D16[0] contains cliptab
448    ld1       {v8.8b}, [x0], x1         //row5
449    uxtl      v18.8h, v18.8b            //q6 = uc_Bs in each 16 bt scalar
450    ld1       {v10.8b}, [x0], x1        //row6
451    ld1       {v12.8b}, [x0], x1        //row7
452    tbl       v16.8b, {v16.16b}, v18.8b //puc_ClipTab[uc_Bs]
453    ld1       {v14.8b}, [x0], x1        //row8
454    ld1       {v1.8b}, [x0], x1         //row9
455    uxtl      v16.4s, v16.4h            //
456    ld1       {v3.8b}, [x0], x1         //row10
457    ld1       {v5.8b}, [x0], x1         //row11
458    ld1       {v7.8b}, [x0], x1         //row12
459    sli       v16.4s, v16.4s, #8        //
460    ld1       {v9.8b}, [x0], x1         //row13
461    ld1       {v11.8b}, [x0], x1        //row14
462    ld1       {v13.8b}, [x0], x1        //row15
463    sli       v16.4s, v16.4s, #16
464    ld1       {v15.8b}, [x0], x1        //row16
465
466
467    //taking two 8x8 transposes
468    //2X2 transposes
469    trn1      v21.8b, v0.8b, v2.8b
470    trn2      v2.8b, v0.8b, v2.8b       //row1 &2
471    mov       v0.8b, v21.8b
472    trn1      v21.8b, v4.8b, v6.8b
473    trn2      v6.8b, v4.8b, v6.8b       //row3&row4
474    mov       v4.8b, v21.8b
475    trn1      v21.8b, v8.8b, v10.8b
476    trn2      v10.8b, v8.8b, v10.8b     //row5&6
477    mov       v8.8b, v21.8b
478    trn1      v21.8b, v12.8b, v14.8b
479    trn2      v14.8b, v12.8b, v14.8b    //row7 & 8
480    mov       v12.8b, v21.8b
481    trn1      v21.8b, v1.8b, v3.8b
482    trn2      v3.8b, v1.8b, v3.8b       //row9 &10
483    mov       v1.8b, v21.8b
484    trn1      v21.8b, v5.8b, v7.8b
485    trn2      v7.8b, v5.8b, v7.8b       //row11 & 12
486    mov       v5.8b, v21.8b
487    trn1      v21.8b, v9.8b, v11.8b
488    trn2      v11.8b, v9.8b, v11.8b     //row13 &14
489    mov       v9.8b, v21.8b
490    trn1      v21.8b, v13.8b, v15.8b
491    trn2      v15.8b, v13.8b, v15.8b    //row15 & 16
492    mov       v13.8b, v21.8b
493    //4x4 transposes
494    trn1      v21.4h, v2.4h, v6.4h
495    trn2      v6.4h, v2.4h, v6.4h       //row2 & row4
496    mov       v2.8b, v21.8b
497    trn1      v21.4h, v10.4h, v14.4h
498    trn2      v14.4h, v10.4h, v14.4h    //row6 & row8
499    mov       v10.8b, v21.8b
500    trn1      v21.4h, v3.4h, v7.4h
501    trn2      v7.4h, v3.4h, v7.4h       //row10 & 12
502    mov       v3.8b, v21.8b
503    trn1      v21.4h, v11.4h, v15.4h
504    trn2      v15.4h, v11.4h, v15.4h    //row14 & row16
505    mov       v11.8b, v21.8b
506    trn1      v21.2s, v6.2s, v14.2s
507    trn2      v14.2s, v6.2s, v14.2s     //row4 & 8
508    mov       v6.8b, v21.8b
509    trn1      v21.2s, v7.2s, v15.2s
510    trn2      v15.2s, v7.2s, v15.2s     //row 12 & 16
511    mov       v7.8b, v21.8b
512    //now Q3 ->p0 and Q7->q3
513    trn1      v21.4h, v0.4h, v4.4h
514    trn2      v4.4h, v0.4h, v4.4h       //row1 & 3
515    mov       v0.8b, v21.8b
516    trn1      v21.4h, v8.4h, v12.4h
517    trn2      v12.4h, v8.4h, v12.4h     //row 5 & 7
518    mov       v8.8b, v21.8b
519    trn1      v21.4h, v1.4h, v5.4h
520    trn2      v5.4h, v1.4h, v5.4h       //row9 & row11
521    mov       v1.8b, v21.8b
522    trn1      v21.4h, v9.4h, v13.4h
523    trn2      v13.4h, v9.4h, v13.4h     //row13 & row15
524    mov       v9.8b, v21.8b
525    trn1      v21.2s, v0.2s, v8.2s
526    trn2      v8.2s, v0.2s, v8.2s       //row1 & row5
527    mov       v0.8b, v21.8b
528    trn1      v21.2s, v1.2s, v9.2s
529    trn2      v9.2s, v1.2s, v9.2s       //row9 & 13
530    mov       v1.8b, v21.8b
531    //now Q0->p3 & Q4->q0
532    //starting processing as p0 and q0 are now ready
533    trn1      v21.2s, v2.2s, v10.2s
534    trn2      v10.2s, v2.2s, v10.2s     //row2 &6
535    mov       v2.8b, v21.8b
536    mov       v6.d[1] , v7.d[0]
537    mov       v8.d[1] , v9.d[0]
538    urhadd    v20.16b, v6.16b , v8.16b  //((p0 + q0 + 1) >> 1)
539    mov       v21.d[0], v20.d[1]
540    trn1      v31.2s, v3.2s, v11.2s
541    trn2      v11.2s, v3.2s, v11.2s     //row10&row14
542    mov       v3.8b, v31.8b
543    movi      v19.8b, #2
544    mov       v18.d[1], v19.d[0]
545    //now Q1->p2     & Q5->q1
546    trn1      v31.2s, v4.2s, v12.2s
547    trn2      v12.2s, v4.2s, v12.2s     //row3 & 7
548    mov       v4.8b, v31.8b
549    uabd      v22.16b  , v6.16b, v8.16b //ABS(q1 - q0)
550    trn1      v31.2s, v5.2s, v13.2s
551    trn2      v13.2s, v5.2s, v13.2s     //row11 & row15
552    mov       v5.8b, v31.8b
553    mov       v0.d[1] , v1.d[0]
554    mov       v2.d[1] , v3.d[0]
555    mov       v4.d[1] , v5.d[0]
556    mov       v10.d[1] , v11.d[0]
557    mov       v12.d[1] , v13.d[0]
558    mov       v14.d[1] , v15.d[0]
559    uaddl     v24.8h, v20.8b, v2.8b     //(p2 + ((p0 + q0 + 1) >> 1) L
560    //now            Q2->p1,Q6->q2
561    uaddl     v26.8h, v21.8b, v3.8b     //(p2 + ((p0 + q0 + 1) >> 1) H
562    umlsl     v24.8h, v4.8b, v19.8b     //(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) L
563    umlsl     v26.8h, v5.8b, v19.8b     //(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) H
564    dup       v28.16b, w2               //alpha
565    cmhs      v22.16b, v22.16b , v28.16b //ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
566    dup       v28.16b, w3               //beta
567    uabd      v30.16b  , v10.16b, v8.16b //ABS(q1 - q0)
568    sqshrn    v24.8b, v24.8h, #1        //((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) L
569    sqshrn    v25.8b, v26.8h, #1        //((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) H
570    mov       v24.d[1], v25.d[0]
571    cmhs      v30.16b, v30.16b , v28.16b //ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
572    uabd      v26.16b  , v4.16b, v6.16b //ABS(q1 - q0)
573
574    smin      v24.16b, v24.16b , v16.16b //min(deltap1 ,C0)
575    orr       v22.16b, v22.16b , v30.16b //ABS(q1 - q0) >= Beta ||ABS(p0 - q0) >= Alpha
576    neg       v30.16b, v16.16b          //-C0
577    cmhs      v26.16b, v26.16b , v28.16b //ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
578    smax      v24.16b, v24.16b , v30.16b //max(deltap1,-C0)
579    orr       v22.16b, v22.16b , v26.16b //ABS(p0 - q0) >= Alpha  || ABS(q1 - q0) >= Beta ||    ABS(p1 - p0) >= Beta)
580    uxtl      v26.4s, v18.4h            //ui_bs
581    uaddl     v18.8h, v20.8b, v12.8b    //q2 + ((p0 + q0 + 1) >> 1) L
582    cmeq      v26.4s, v26.4s , #0       //ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
583    usubw     v18.8h, v18.8h , v10.8b   //(q2 + ((p0 + q0 + 1) >> 1) - q1) L
584    uaddl     v20.8h, v21.8b, v13.8b    //q2 + ((p0 + q0 + 1) >> 1) H
585    usubw     v18.8h, v18.8h , v10.8b   //(q2 + ((p0 + q0 + 1) >> 1) - 2*q1)L
586    usubw     v20.8h, v20.8h , v11.8b   //(q2 + ((p0 + q0 + 1) >> 1) - q1) H
587    orr       v26.16b, v26.16b , v22.16b //(ABS(p0 - q0) >= Alpha  || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)) &&(ui_bs)
588    usubw     v20.8h, v20.8h , v11.8b   //(q2 + ((p0 + q0 + 1) >> 1) - 2*q1) H
589    sqshrn    v18.8b, v18.8h, #1        //((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) L
590    uabd      v22.16b  , v2.16b, v6.16b //ABS(q1 - q0)
591    sqshrn    v19.8b, v20.8h, #1        //((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) H
592    mov       v18.d[1], v19.d[0]
593    uabd      v20.16b  , v12.16b, v8.16b //ABS(q1 - q0)
594    cmhi      v22.16b, v28.16b , v22.16b //Ap < Beta
595    smin      v18.16b, v18.16b , v16.16b //min(delatq1,C0)
596    cmhi      v20.16b, v28.16b , v20.16b //Aq <Beta
597    usubl     v28.8h, v8.8b, v6.8b      //(q0 - p0) L
598    smax      v18.16b, v18.16b , v30.16b //max(deltaq1,-C0)
599    usubl     v30.8h, v9.8b, v7.8b      //(q0 - p0) H
600    shl       v28.8h, v28.8h, #2        //(q0 - p0)<<2 L
601    sub       v16.16b, v16.16b , v22.16b //C0 + (Ap < Beta)
602    shl       v30.8h, v30.8h, #2        //(q0 - p0) << 2) H
603    uaddw     v28.8h, v28.8h , v4.8b    //((q0 - p0) << 2) + (p1  L
604    uaddw     v30.8h, v30.8h , v5.8b    //((q0 - p0) << 2) + (p1 H
605    usubw     v28.8h, v28.8h , v10.8b   //((q0 - p0) << 2) + (p1 - q1) L
606    usubw     v30.8h, v30.8h , v11.8b   //((q0 - p0) << 2) + (p1 - q1) H
607    bic       v22.16b, v22.16b , v26.16b //final condition for p1
608    rshrn     v28.8b, v28.8h, #3        //delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3); L
609    rshrn     v29.8b, v30.8h, #3        //delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) H
610    mov       v28.d[1], v29.d[0]
611    sub       v16.16b, v16.16b , v20.16b //C0 + (Ap < Beta) + (Aq < Beta)
612    bic       v20.16b, v20.16b , v26.16b //final condition for q1
613    abs       v30.16b, v28.16b          //abs(delta)
614    and       v24.16b, v24.16b , v22.16b //delatp1
615    and       v18.16b, v18.16b , v20.16b //delta q1
616    umin      v30.16b, v30.16b , v16.16b //min((abs(delta),C)
617    add       v4.16b, v4.16b , v24.16b  //p1+deltap1
618    add       v10.16b, v10.16b , v18.16b //q1+deltaq1
619    mov       v5.d[0], v4.d[1]
620    mov       v11.d[0], v10.d[1]
621    bic       v30.16b, v30.16b , v26.16b //abs(delta) of pixels to be changed only
622    // VCGE.S8 Q14,    Q14,#0                    //sign(delta)
623    cmge      v28.16b, v28.16b , #0
624    uqsub     v22.16b, v6.16b , v30.16b //clip(p0-delta)
625
626    trn1      v21.8b, v0.8b, v2.8b
627    trn2      v2.8b, v0.8b, v2.8b       //row1 &2
628    mov       v0.8b, v21.8b
629    uqadd     v6.16b, v6.16b , v30.16b  //clip(p0+delta)
630
631    trn1      v21.8b, v1.8b, v3.8b
632    trn2      v3.8b, v1.8b, v3.8b       //row9 &10
633    mov       v1.8b, v21.8b
634    uqadd     v24.16b, v8.16b , v30.16b //clip(q0+delta)
635    trn1      v21.8b, v12.8b, v14.8b
636    trn2      v14.8b, v12.8b, v14.8b    //row7 & 8
637    mov       v12.8b, v21.8b
638    uqsub     v8.16b, v8.16b , v30.16b  //clip(q0-delta)
639    trn1      v21.8b, v13.8b, v15.8b
640    trn2      v15.8b, v13.8b, v15.8b    //row15 & 16
641    mov       v13.8b, v21.8b
642    bif       v6.16b, v22.16b , v28.16b //p0
643    bif       v8.16b, v24.16b , v28.16b //q0
644    mov       v7.d[0], v6.d[1]
645    mov       v9.d[0], v8.d[1]
646    trn1      v21.8b, v4.8b, v6.8b
647    trn2      v6.8b, v4.8b, v6.8b       //row3&row4
648    mov       v4.8b, v21.8b
649    trn1      v21.8b, v8.8b, v10.8b
650    trn2      v10.8b, v8.8b, v10.8b     //row5&6
651    mov       v8.8b, v21.8b
652    trn1      v21.8b, v5.8b, v7.8b
653    trn2      v7.8b, v5.8b, v7.8b       //row11 & 12
654    mov       v5.8b, v21.8b
655    trn1      v21.8b, v9.8b, v11.8b
656    trn2      v11.8b, v9.8b, v11.8b     //row13 &14
657    mov       v9.8b, v21.8b
658    trn1      v21.4h, v2.4h, v6.4h
659    trn2      v6.4h, v2.4h, v6.4h       //row2 & row4
660    mov       v2.8b, v21.8b
661    trn1      v21.4h, v10.4h, v14.4h
662    trn2      v14.4h, v10.4h, v14.4h    //row6 & row8
663    mov       v10.8b, v21.8b
664    trn1      v21.4h, v3.4h, v7.4h
665    trn2      v7.4h, v3.4h, v7.4h       //row10 & 12
666    mov       v3.8b, v21.8b
667    trn1      v21.4h, v11.4h, v15.4h
668    trn2      v15.4h, v11.4h, v15.4h    //row14 & row16
669    mov       v11.8b, v21.8b
670    trn1      v21.2s, v6.2s, v14.2s
671    trn2      v14.2s, v6.2s, v14.2s     //row4 & 8
672    mov       v6.8b, v21.8b
673    trn1      v21.2s, v7.2s, v15.2s
674    trn2      v15.2s, v7.2s, v15.2s     //row 12 & 16
675    mov       v7.8b, v21.8b
676    //now Q3 ->p0 and Q7->q3
677    trn1      v21.4h, v0.4h, v4.4h
678    trn2      v4.4h, v0.4h, v4.4h       //row1 & 3
679    mov       v0.8b, v21.8b
680    trn1      v21.4h, v8.4h, v12.4h
681    trn2      v12.4h, v8.4h, v12.4h     //row 5 & 7
682    mov       v8.8b, v21.8b
683    trn1      v21.4h, v1.4h, v5.4h
684    trn2      v5.4h, v1.4h, v5.4h       //row9 & row11
685    mov       v1.8b, v21.8b
686    trn1      v21.4h, v9.4h, v13.4h
687    trn2      v13.4h, v9.4h, v13.4h     //row13 & row15
688    mov       v9.8b, v21.8b
689    sub       x0, x0, x1, lsl#4         //restore pointer
690    trn1      v21.2s, v0.2s, v8.2s
691    trn2      v8.2s, v0.2s, v8.2s       //row1 & row5
692    mov       v0.8b, v21.8b
693    trn1      v21.2s, v1.2s, v9.2s
694    trn2      v9.2s, v1.2s, v9.2s       //row9 & 13
695    mov       v1.8b, v21.8b
696    trn1      v21.2s, v2.2s, v10.2s
697    trn2      v10.2s, v2.2s, v10.2s     //row2 &6
698    mov       v2.8b, v21.8b
699    trn1      v21.2s, v3.2s, v11.2s
700    trn2      v11.2s, v3.2s, v11.2s     //row10&row14
701    mov       v3.8b, v21.8b
702    trn1      v21.2s, v4.2s, v12.2s
703    trn2      v12.2s, v4.2s, v12.2s     //row3 & 7
704    mov       v4.8b, v21.8b
705    trn1      v21.2s, v5.2s, v13.2s
706    trn2      v13.2s, v5.2s, v13.2s     //row11 & row15
707    mov       v5.8b, v21.8b
708    st1       {v0.8b}, [x0], x1         //row1
709    st1       {v2.8b}, [x0], x1         //row2
710    st1       {v4.8b}, [x0], x1         //row3
711    st1       {v6.8b}, [x0], x1         //row4
712    st1       {v8.8b}, [x0], x1         //row5
713    st1       {v10.8b}, [x0], x1        //row6
714    st1       {v12.8b}, [x0], x1        //row7
715    st1       {v14.8b}, [x0], x1        //row8
716    st1       {v1.8b}, [x0], x1         //row9
717    st1       {v3.8b}, [x0], x1         //row10
718    st1       {v5.8b}, [x0], x1         //row11
719    st1       {v7.8b}, [x0], x1         //row12
720    st1       {v9.8b}, [x0], x1         //row13
721    st1       {v11.8b}, [x0], x1        //row14
722    st1       {v13.8b}, [x0], x1        //row15
723    st1       {v15.8b}, [x0], x1        //row16
724
725    // LDMFD sp!,{x12,pc}
726    ldp       x19, x20, [sp], #16
727    pop_v_regs
728    ret
729
730
731
732///**
733//*******************************************************************************
734//*
735//* @brief
736//*     Performs filtering of a luma block vertical edge when the
737//*     boundary strength is set to 4
738//*
739//* @par Description:
740//*       This operation is described in Sec. 8.7.2.4 under the title
741//*       "Filtering process for edges for bS equal to 4" in ITU-T Rec H.264.
//*       The routine loads 16 rows of 8 bytes (p3 p2 p1 p0 q0 q1 q2 q3),
//*       transposes them so that every pixel position sits in its own
//*       register (rows 1-8 in even registers, rows 9-16 in odd registers),
//*       filters all 16 rows at once, transposes the result back and stores
//*       all 8 bytes of every row. p3 and q3 are written back unmodified.
742//*
743//* @param[in] x0 - pu1_src
744//*  Pointer to the src sample q0
//*  (rows are read and written as 8 bytes starting at pu1_src - 4)
745//*
746//* @param[in] x1 - src_strd
747//*  Source stride
//*  (added to the row pointer after every row load and store)
748//*
749//* @param[in] x2 - alpha
750//*  Alpha Value for the boundary
//*  (only the low byte of w2 is broadcast to the vector lanes)
751//*
752//* @param[in] x3 - beta
753//*  Beta Value for the boundary
//*  (only the low byte of w3 is broadcast to the vector lanes)
754//*
755//* @returns
756//*  None
757//*
758//* @remarks
759//*  Register usage visible in this routine:
//*   - v0-v15 hold the pixel data, v16-v26, v28, v30 and v31 are scratch.
//*   - x17 receives a copy of the row1 pointer but is never read.
//*   - x19/x20 are saved and restored but not otherwise referenced.
//*   - push_v_regs/pop_v_regs are macros defined outside this section;
//*     presumably they save/restore the callee-saved SIMD registers
//*     (v8-v15 are overwritten here) - confirm against the macro file.
//*  Q-register names in the inline comments appear to follow 32-bit NEON
//*  numbering (Qn = d2n:d2n+1), i.e. Qn corresponds to v(2n)/v(2n+1) here.
760//*
761//*******************************************************************************
762//*/
763
764    .global ih264_deblk_luma_vert_bs4_av8
765
766ih264_deblk_luma_vert_bs4_av8:
767
768    // STMFD sp!,{x12,x14}   (looks like a leftover 32-bit ARM note; the actual saves follow)
769    push_v_regs                         //macro defined elsewhere; see @remarks
770    stp       x19, x20, [sp, #-16]!     //save x19,x20 (not referenced again until the matching ldp)
771
772    sub       x0, x0, #4                //pointer uc_edgePixel-4, i.e. address of p3 in row1
773    mov       x17, x0                   //copy of the row1 pointer; x17 is not read again below
774    //loading p3:p2:p1:p0:q0:q1:q2:q3 for every row
    //rows 1-8 go to v0,v2,...,v14 and rows 9-16 to v1,v3,...,v15; x0 advances by x1 per row
775    ld1       {v0.8b}, [x0], x1         //row1
776    ld1       {v2.8b}, [x0], x1         //row2
777    ld1       {v4.8b}, [x0], x1         //row3
778    ld1       {v6.8b}, [x0], x1         //row4
779    ld1       {v8.8b}, [x0], x1         //row5
780    ld1       {v10.8b}, [x0], x1        //row6
781    ld1       {v12.8b}, [x0], x1        //row7
782    ld1       {v14.8b}, [x0], x1        //row8
783    ld1       {v1.8b}, [x0], x1         //row9
784    ld1       {v3.8b}, [x0], x1         //row10
785    ld1       {v5.8b}, [x0], x1         //row11
786    ld1       {v7.8b}, [x0], x1         //row12
787    ld1       {v9.8b}, [x0], x1         //row13
788    ld1       {v11.8b}, [x0], x1        //row14
789    ld1       {v13.8b}, [x0], x1        //row15
790    ld1       {v15.8b}, [x0], x1        //row16
791
792    //taking two 8x8 transposes
    //each step is a trn1 into scratch v21, a trn2 written in place, then a copy of
    //the scratch back to the first operand, so the register pair is updated in place
793    //2X2 transposes
794    trn1      v21.8b, v0.8b, v2.8b
795    trn2      v2.8b, v0.8b, v2.8b       //row1 &2
796    mov       v0.8b, v21.8b
797    trn1      v21.8b, v4.8b, v6.8b
798    trn2      v6.8b, v4.8b, v6.8b       //row3&row4
799    mov       v4.8b, v21.8b
800    trn1      v21.8b, v8.8b, v10.8b
801    trn2      v10.8b, v8.8b, v10.8b     //row5&6
802    mov       v8.8b, v21.8b
803    trn1      v21.8b, v12.8b, v14.8b
804    trn2      v14.8b, v12.8b, v14.8b    //row7 & 8
805    mov       v12.8b, v21.8b
806    trn1      v21.8b, v1.8b, v3.8b
807    trn2      v3.8b, v1.8b, v3.8b       //row9 &10
808    mov       v1.8b , v21.8b
809    trn1      v21.8b, v5.8b, v7.8b
810    trn2      v7.8b, v5.8b, v7.8b       //row11 & 12
811    mov       v5.8b , v21.8b
812    trn1      v21.8b, v9.8b, v11.8b
813    trn2      v11.8b, v9.8b, v11.8b     //row13 &14
814    mov       v9.8b , v21.8b
815    trn1      v21.8b, v13.8b, v15.8b
816    trn2      v15.8b, v13.8b, v15.8b    //row15 & 16
817    mov       v13.8b , v21.8b
818    //4x4 transposes (16-bit lanes), followed by the 32-bit lane step for v6/v14 and v7/v15
819    trn1      v21.4h, v2.4h, v6.4h
820    trn2      v6.4h, v2.4h, v6.4h       //row2 & row4
821    mov       v2.8b, v21.8b
822    trn1      v21.4h, v10.4h, v14.4h
823    trn2      v14.4h, v10.4h, v14.4h    //row6 & row8
824    mov       v10.8b , v21.8b
825    trn1      v21.4h, v3.4h, v7.4h
826    trn2      v7.4h, v3.4h, v7.4h       //row10 & 12
827    mov       v3.8b, v21.8b
828    trn1      v21.4h, v11.4h, v15.4h
829    trn2      v15.4h, v11.4h, v15.4h    //row14 & row16
830    mov       v11.8b, v21.8b
831    trn1      v21.2s, v6.2s, v14.2s
832    trn2      v14.2s, v6.2s, v14.2s     //row4 & 8
833    mov       v6.8b, v21.8b
834    trn1      v21.2s, v7.2s, v15.2s
835    trn2      v15.2s, v7.2s, v15.2s     //row 12 & 16
836    mov       v7.8b, v21.8b
837    //now Q3 ->p0 and Q7->q3, i.e. v6/v7 = p0 and v14/v15 = q3 (rows 1-8 / rows 9-16)
838    trn1      v21.4h, v0.4h, v4.4h
839    trn2      v4.4h, v0.4h, v4.4h       //row1 & 3
840    mov       v0.8b , v21.8b
841    trn1      v21.4h, v8.4h, v12.4h
842    trn2      v12.4h, v8.4h, v12.4h     //row 5 & 7
843    mov       v8.8b, v21.8b
844    trn1      v21.4h, v1.4h, v5.4h
845    trn2      v5.4h, v1.4h, v5.4h       //row9 & row11
846    mov       v1.8b, v21.8b
847    trn1      v21.4h, v9.4h, v13.4h
848    trn2      v13.4h, v9.4h, v13.4h     //row13 & row15
849    mov       v9.8b , v21.8b
850    trn1      v21.2s, v0.2s, v8.2s
851    trn2      v8.2s, v0.2s, v8.2s       //row1 & row5
852    mov       v0.8b, v21.8b
853    trn1      v21.2s, v1.2s, v9.2s
854    trn2      v9.2s, v1.2s, v9.2s       //row9 & 13
855    mov       v1.8b, v21.8b
856    //now Q0->p3 & Q4->q0, i.e. v0/v1 = p3 and v8/v9 = q0
857    //starting processing as p0 and q0 are now ready
858    //Q1->p2 & Q5->q1 (v2/v3, v10/v11) and p1/q2 (v4/v5, v12/v13) become ready in the
    //remaining 32-bit transposes, which are interleaved with the arithmetic below
859    mov       v31.d[0], v14.d[0]        //save q3 L; v14 is reused as mask/accumulator below
860    mov       v31.d[1], v15.d[0]        //save q3 H, v31 = q3 for all 16 rows
861    trn1      v21.2s, v4.2s, v12.2s
862    trn2      v12.2s, v4.2s, v12.2s     //row3 & 7
863    mov       v4.8b, v21.8b
864    movi      v28.8h, #2                //constant 2 in 16-bit lanes, multiplier for mla
865    trn1      v21.2s, v5.2s, v13.2s
866    trn2      v13.2s, v5.2s, v13.2s     //row11 & row15
867    mov       v5.8b, v21.8b
868    uaddl     v16.8h, v6.8b, v8.8b      //p0+q0 L
869    trn1      v21.2s, v2.2s, v10.2s
870    trn2      v10.2s, v2.2s, v10.2s     //row2 &6
871    mov       v2.8b, v21.8b
872    uaddl     v18.8h, v7.8b, v9.8b      //p0+q0 H
873    trn1      v21.2s, v3.2s, v11.2s
874    trn2      v11.2s, v3.2s, v11.2s     //row10&row14
875    mov       v3.8b, v21.8b
876    uaddw     v20.8h, v16.8h , v4.8b    //p0+q0+p1 L
877    uaddw     v22.8h, v18.8h , v5.8b    //p0+q0+p1 H
878    uaddl     v24.8h, v2.8b, v10.8b     //p2+q1 L
879    uaddl     v26.8h, v3.8b, v11.8b     //p2+q1 H
880    mla       v24.8h, v20.8h , v28.8h   //p2 + X2(p1) + X2(p0) + X2(q0) + q1 L
881    mla       v26.8h, v22.8h , v28.8h   //p2 + X2(p1) + X2(p0) + X2(q0) + q1 H
882    movi      v28.16b, #2               //constant 2 in byte lanes, base of (alpha>>2)+2
883    uaddw     v16.8h, v20.8h , v2.8b    //p0+q0+p1+p2 L
884    uaddw     v18.8h, v22.8h , v3.8b    //p0+q0+p1+p2 H
885    dup       v30.16b, w2               //duplicate alpha
886    rshrn     v20.8b, v16.8h, #2        //(p2 + p1 + p0 + q0 + 2) >> 2)L p1'
887    rshrn     v21.8b, v18.8h, #2        //(p2 + p1 + p0 + q0 + 2) >> 2)H p1'
888    mov       v20.d[1] , v21.d[0]       //v20 = p1' for all 16 rows
    //merge the H halves into the upper 64 bits so each even register covers 16 rows
889    mov       v0.d[1] , v1.d[0]         //v0  = p3
890    mov       v2.d[1] , v3.d[0]         //v2  = p2
891    mov       v4.d[1] , v5.d[0]         //v4  = p1
892    mov       v6.d[1] , v7.d[0]         //v6  = p0
893    mov       v8.d[1] , v9.d[0]         //v8  = q0
894    mov       v10.d[1] , v11.d[0]       //v10 = q1
895    mov       v12.d[1] , v13.d[0]       //v12 = q2
896    mov       v14.d[1] , v15.d[0]       //v14 = q3
897    uabd      v22.16b  , v6.16b, v8.16b  //ABS(p0 - q0)
898    usra      v28.16b, v30.16b, #2      //alpha >>2 +2
899    uabd      v30.16b  , v2.16b, v6.16b  //Ap = ABS(p2 - p0)
900    rshrn     v24.8b, v24.8h, #3        //((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) L p0'
901    rshrn     v25.8b, v26.8h, #3        //((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) H p0'
902    mov       v24.d[1] , v25.d[0]       //v24 = p0' for all 16 rows
903    dup       v26.16b, w3               //beta
904    cmhi      v28.16b, v28.16b , v22.16b //ABS(p0 - q0) <((Alpha >>2) + 2)
905    uaddl     v22.8h, v6.8b, v10.8b     //p0+q1 L
906    cmhi      v14.16b, v26.16b , v30.16b //beta>Ap
907    uaddl     v30.8h, v7.8b, v11.8b     //p0+q1 H
908    uaddw     v22.8h, v22.8h , v4.8b    //p0+q1+p1 L
909    uaddw     v30.8h, v30.8h , v5.8b    //p0+q1+p1 H
910    uaddw     v22.8h, v22.8h , v4.8b    //p0+q1+2*p1 L
911    uaddw     v30.8h, v30.8h , v5.8b    //p0+q1+2*p1 H
912    and       v14.16b, v14.16b , v28.16b //(Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)
913    rshrn     v22.8b, v22.8h, #2        //((X2(p1) + p0 + q1 + 2) >> 2) L p0"
914    rshrn     v23.8b, v30.8h, #2        //((X2(p1) + p0 + q1 + 2) >> 2) H p0"
915    mov       v22.d[1] , v23.d[0]       //v22 = p0" for all 16 rows
916    uaddl     v30.8h, v2.8b, v0.8b      //p2+p3 L
917    bif       v24.16b, v22.16b , v14.16b //p0' or p0 " : keep p0' where v14 is set, take p0" elsewhere
918    uaddl     v22.8h, v3.8b, v1.8b      //p2+p3 H
919    add       v30.8h, v30.8h , v30.8h   //2*(p2+p3) L
920    add       v22.8h, v22.8h , v22.8h   //2*(p2+p3)H
921    add       v16.8h, v16.8h , v30.8h   //(X2(p3) + X3(p2) + p1 + p0 + q0) L
922    add       v18.8h, v18.8h , v22.8h   //(X2(p3) + X3(p2) + p1 + p0 + q0) H
923    uabd      v30.16b  , v12.16b, v8.16b //Aq = ABS(q2 - q0)
924    uabd      v22.16b  , v10.16b, v8.16b //ABS(q1 - q0)
925    rshrn     v16.8b, v16.8h, #3        //((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); L p2'
926    rshrn     v17.8b, v18.8h, #3        //((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); H p2'
927    mov       v16.d[1] , v17.d[0]       //v16 = p2' for all 16 rows
928    uabd      v18.16b  , v4.16b, v6.16b  //ABS(p1 - p0)
929    cmhi      v30.16b, v26.16b , v30.16b //Aq < Beta
930    cmhs      v22.16b, v22.16b, v26.16b //ABS(q1 - q0) >= Beta
931    cmhs      v18.16b, v18.16b, v26.16b //ABS(p1 - p0) >= Beta
932    dup       v26.16b, w2               //duplicate alpha (v26 no longer holds beta)
933    and       v30.16b, v30.16b , v28.16b //(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
934    uabd      v28.16b  , v6.16b, v8.16b  //ABS(p0 - q0)
935    orr       v22.16b, v22.16b , v18.16b //ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta
936    uaddl     v18.8h, v6.8b, v8.8b      //p0+q0 L
937    cmhs      v28.16b, v28.16b, v26.16b //ABS(p0 - q0) >= Alpha
938    uaddl     v26.8h, v7.8b, v9.8b      //p0+q0 H
939    uaddw     v18.8h, v18.8h , v10.8b   //p0+q0+q1 L
940    orr       v22.16b, v22.16b , v28.16b //ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta||ABS(p0 - q0) >= Alpha
941    uaddw     v26.8h, v26.8h , v11.8b   //p0+q0+q1 H
942    bic       v14.16b, v14.16b , v22.16b //final condn for p's (strong-filter condition and edge not skipped)
943    movi      v28.16b, #2               //constant 2 in byte lanes, multiplier for umlal
944    bif       v6.16b, v24.16b , v22.16b //final p0 : original p0 kept where v22 is set
945    bit       v2.16b, v16.16b , v14.16b //final p2 : p2' taken where v14 is set
946    bif       v20.16b, v4.16b , v14.16b //final p1 : p1' kept where v14 is set, original p1 elsewhere
947    mov       v7.d[0] , v6.d[1]         //v7  = final p0 H
948    mov       v3.d[0] , v2.d[1]         //v3  = final p2 H
949    mov       v21.d[0] , v20.d[1]       //v21 = final p1 H (v21 is no longer free as scratch)
950    uaddl     v24.8h, v8.8b, v4.8b      //q0+p1 L
951    umlal     v24.8h, v10.8b, v28.8b    //X2(q1) + q0 + p1 L
952    uaddl     v16.8h, v9.8b, v5.8b      //q0+p1 H
953    umlal     v16.8h, v11.8b, v28.8b    //X2(q1) + q0 + p1 H
954    movi      v28.8h, #2                //constant 2 in 16-bit lanes, multiplier for mla
955    uaddl     v14.8h, v4.8b, v12.8b     //p1+q2 L
956    mla       v14.8h, v18.8h , v28.8h   //p1 + X2(p0) + X2(q0) + X2(q1) + q2L
957    uaddl     v4.8h, v5.8b, v13.8b      //p1+q2H (v4 stops holding p1 from here on)
958    mla       v4.8h, v26.8h , v28.8h    //p1 + X2(p0) + X2(q0) + X2(q1) + q2H
959    rshrn     v24.8b, v24.8h, #2        //(X2(q1) + q0 + p1 + 2) >> 2; L q0'
960    rshrn     v25.8b, v16.8h, #2        //(X2(q1) + q0 + p1 + 2) >> 2; H q0'
961    mov       v24.d[1] , v25.d[0]       //v24 = q0' for all 16 rows
962    uaddw     v18.8h, v18.8h , v12.8b   //p0 + q0 + q1 + q2 L
963    uaddw     v26.8h, v26.8h , v13.8b   //p0 + q0 + q1 + q2 H
964    rshrn     v16.8b, v14.8h, #3        //(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 L q0"
965    mov       v14.16b, v31.16b          //restore q3 (all 16 rows) saved in v31 earlier
966    rshrn     v17.8b, v4.8h, #3         //(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 H q0"
967    mov       v16.d[1] , v17.d[0]       //v16 = q0" for all 16 rows
968    rshrn     v4.8b, v18.8h, #2         //p0 + q0 + q1 + q2 + 2)>>2 L q1'
969    rshrn     v5.8b, v26.8h, #2         //p0 + q0 + q1 + q2 + 2)>>2 H q1'
970    mov       v4.d[1] , v5.d[0]         //v4 = q1' for all 16 rows
971    bit       v24.16b, v16.16b , v30.16b //q0' or q0" : q0" taken where v30 is set
972    bic       v30.16b, v30.16b , v22.16b //final condn for q's
    //transpose back starts here, interleaved with the remaining q-side arithmetic;
    //v31 is the scratch register now because v20/v21 carry the final p1
973    trn1      v31.8b, v0.8b, v2.8b
974    trn2      v2.8b, v0.8b, v2.8b       //row1 &2
975    mov       v0.8b, v31.8b
976    bit       v10.16b, v4.16b , v30.16b //final q1 : q1' taken where v30 is set
977    mov       v11.d[0] , v10.d[1]       //v11 = final q1 H
978    mov       v25.d[0] , v24.d[1]       //H half of v24; v25 is not read after this point
979    mov       v31.d[0] , v30.d[1]       //overwritten by the next trn1 before it is read
980    trn1      v31.8b, v1.8b, v3.8b
981    trn2      v3.8b, v1.8b, v3.8b       //row9 &10
982    mov       v1.8b, v31.8b
983    uaddl     v16.8h, v12.8b, v14.8b    //q2+q3 L
984    trn1      v31.8b, v20.8b, v6.8b
985    trn2      v6.8b, v20.8b, v6.8b      //row3&row4
986    mov       v20.8b , v31.8b
987    uaddl     v4.8h, v13.8b, v15.8b     //q2+q3 H
988    trn1      v31.8b, v21.8b, v7.8b
989    trn2      v7.8b, v21.8b, v7.8b      //row11 & 12
990    mov       v21.8b , v31.8b
991    mla       v18.8h, v16.8h , v28.8h   //X2(q3) + X3(q2) + q1 + q0 + p0 L
992    trn1      v31.4h, v2.4h, v6.4h
993    trn2      v6.4h, v2.4h, v6.4h       //row2 & row4
994    mov       v2.8b, v31.8b
995    mla       v26.8h, v4.8h , v28.8h    //X2(q3) + X3(q2) + q1 + q0 + p0 H
996    trn1      v31.4h, v3.4h, v7.4h
997    trn2      v7.4h, v3.4h, v7.4h       //row10 & 12
998    mov       v3.8b , v31.8b
999    bif       v8.16b, v24.16b , v22.16b //final q0 : original q0 kept where v22 is set
1000    mov       v9.d[0] , v8.d[1]         //v9 = final q0 H
1001    trn1      v31.4h, v0.4h, v20.4h
1002    trn2      v20.4h, v0.4h, v20.4h     //row1 & 3
1003    mov       v0.8b , v31.8b
1004    rshrn     v18.8b, v18.8h, #3        //(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; L q2'
1005    trn1      v31.4h, v1.4h, v21.4h
1006    trn2      v21.4h, v1.4h, v21.4h     //row9 & row11
1007    mov       v1.8b, v31.8b
1008    rshrn     v19.8b, v26.8h, #3        //(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; H q2'
1009    mov       v18.d[1] , v19.d[0]       //v18 = q2' for all 16 rows
1010    trn1      v31.8b, v8.8b, v10.8b
1011    trn2      v10.8b, v8.8b, v10.8b     //row5&6
1012    mov       v8.8b, v31.8b
1013    bit       v12.16b, v18.16b , v30.16b //final q2
1014    mov       v13.d[0] , v12.d[1]       //v13 = final q2 H
1015    trn1      v31.8b, v9.8b, v11.8b
1016    trn2      v11.8b, v9.8b, v11.8b     //row13 &14
1017    mov       v9.8b, v31.8b
1018    trn1      v31.8b, v12.8b, v14.8b
1019    trn2      v14.8b, v12.8b, v14.8b    //row7 & 8
1020    mov       v12.8b, v31.8b
1021    trn1      v31.8b, v13.8b, v15.8b
1022    trn2      v15.8b, v13.8b, v15.8b    //row15 & 16
1023    mov       v13.8b , v31.8b
1024    trn1      v31.4h, v10.4h, v14.4h
1025    trn2      v14.4h, v10.4h, v14.4h    //row6 & row8
1026    mov       v10.8b, v31.8b
1027    trn1      v31.4h, v11.4h, v15.4h
1028    trn2      v15.4h, v11.4h, v15.4h    //row14 & row16
1029    mov       v11.8b, v31.8b
1030    //remaining 16-bit lane steps of the transpose back (q0/q2 register pairs)
1031    trn1      v31.4h, v8.4h, v12.4h
1032    trn2      v12.4h, v8.4h, v12.4h     //row 5 & 7
1033    mov       v8.8b, v31.8b
1034    trn1      v31.4h, v9.4h, v13.4h
1035    trn2      v13.4h, v9.4h, v13.4h     //row13 & row15
1036    mov       v9.8b, v31.8b
1037    sub       x0, x0, x1, lsl#4         //restore pointer: x0 -= 16*src_strd, back to row1
1038    trn1      v31.2s, v6.2s, v14.2s
1039    trn2      v14.2s, v6.2s, v14.2s     //row4 & 8
1040    mov       v6.8b , v31.8b
1041    trn1      v31.2s, v7.2s, v15.2s
1042    trn2      v15.2s, v7.2s, v15.2s     //row 12 & 16
1043    mov       v7.8b, v31.8b
1044    trn1      v31.2s, v0.2s, v8.2s
1045    trn2      v8.2s, v0.2s, v8.2s       //row1 & row5
1046    mov       v0.8b , v31.8b
1047    trn1      v31.2s, v1.2s, v9.2s
1048    trn2      v9.2s, v1.2s, v9.2s       //row9 & 13
1049    mov       v1.8b , v31.8b
1050    trn1      v31.2s, v2.2s, v10.2s
1051    trn2      v10.2s, v2.2s, v10.2s     //row2 &6
1052    mov       v2.8b , v31.8b
1053    trn1      v31.2s, v3.2s, v11.2s
1054    trn2      v11.2s, v3.2s, v11.2s     //row10&row14
1055    mov       v3.8b , v31.8b
1056    trn1      v31.2s, v20.2s, v12.2s
1057    trn2      v12.2s, v20.2s, v12.2s    //row3 & 7
1058    mov       v20.8b , v31.8b
1059    trn1      v31.2s, v21.2s, v13.2s
1060    trn2      v13.2s, v21.2s, v13.2s    //row11 & row15
1061    mov       v21.8b, v31.8b
    //store the 16 filtered rows; rows 3 and 11 come from v20/v21 instead of v4/v5
1062    st1       {v0.8b}, [x0], x1         //row1
1063    st1       {v2.8b}, [x0], x1         //row2
1064    st1       {v20.8b}, [x0], x1        //row3
1065    st1       {v6.8b}, [x0], x1         //row4
1066    st1       {v8.8b}, [x0], x1         //row5
1067    st1       {v10.8b}, [x0], x1        //row6
1068    st1       {v12.8b}, [x0], x1        //row7
1069    st1       {v14.8b}, [x0], x1        //row8
1070    st1       {v1.8b}, [x0], x1         //row9
1071    st1       {v3.8b}, [x0], x1         //row10
1072    st1       {v21.8b}, [x0], x1        //row11
1073    st1       {v7.8b}, [x0], x1         //row12
1074    st1       {v9.8b}, [x0], x1         //row13
1075    st1       {v11.8b}, [x0], x1        //row14
1076    st1       {v13.8b}, [x0], x1        //row15
1077    st1       {v15.8b}, [x0], x1        //row16
1078
1079    // LDMFD sp!,{x12,pc}   (looks like a leftover 32-bit ARM note; the actual restores follow)
1080    ldp       x19, x20, [sp], #16       //restore x19,x20 saved in the prologue
1081    pop_v_regs                          //macro defined elsewhere; counterpart of push_v_regs
1082    ret
1083
1084
1085