//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
// *******************************************************************************
// * @file
// *  ih264e_half_pel.s
// *
// * @brief
// *  Contains armv8 neon routines for 6 tap (half pel) luma interpolation
// *
// * @author
// *  Ittiam
// *
// * @par List of Functions:
// *  ih264e_sixtapfilter_horz
// *  ih264e_sixtap_filter_2dvh_vert
// *
// *
// * @remarks
// *  None
// *
// *******************************************************************************
// */


.text
.p2align 2
.include "ih264_neon_macros.s"

///*******************************************************************************
//*
//* @brief
//*     Inter prediction luma filter for horizontal input (filter runs for
//*     width = 17 and height = 16)
//*
//* @par Description:
//*    Applies a 6 tap horizontal filter. The output is clipped to 8 bits.
//*    See sec 8.4.2.2.1 titled "Luma sample interpolation process".
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ih264e_sixtapfilter_horz(UWORD8 *pu1_src,
//                              UWORD8 *pu1_dst,
//                              WORD32 src_strd,
//                              WORD32 dst_strd);
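
//// Reference (an illustrative sketch only, not the exact C model): each output
//// sample is the H.264 6-tap (1, -5, 20, 20, -5, 1) filter applied along the
//// row, with pu1_src pointing at the third tap:
////
////     WORD32 acc = pu1_src[-2] + pu1_src[3]
////                + 20 * (pu1_src[0] + pu1_src[1])
////                -  5 * (pu1_src[-1] + pu1_src[2]);
////     pu1_dst[0] = CLIP_U8((acc + 16) >> 5);
////
//// (CLIP_U8 here just denotes clipping to [0, 255].)  The loop below produces
//// 17 output columns per row and processes two rows per iteration.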


.equ halfpel_width, 17 + 1              // (make it even; two rows are processed at a time)


        .global ih264e_sixtapfilter_horz_av8
ih264e_sixtapfilter_horz_av8:
    // STMFD sp!,{x14}
    push_v_regs
    sxtw      x2, w2
    sxtw      x3, w3
    stp       x19, x20, [sp, #-16]!

    movi      v0.8b, #5
    sub       x0, x0, #2               //// start two columns to the left for the first tap
    sub       x3, x3, #16              //// dst stride is advanced in two parts (16 + remainder)
    movi      v1.8b, #20
    mov       x14, #16                 //// row counter: 16 rows, two per iteration

filter_horz_loop:


    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row0
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row1

    //// Processing row0 and row1

    ext       v31.8b, v2.8b , v3.8b , #5
    ext       v30.8b, v3.8b , v4.8b , #5

    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row0)
    ext       v29.8b, v4.8b , v4.8b , #5
    uaddl     v10.8h, v30.8b, v3.8b     //// a0 + a5                             (column2,row0)
    ext       v28.8b, v5.8b , v6.8b , #5
    uaddl     v12.8h, v29.8b, v4.8b     //// a0 + a5                             (column3,row0)
    ext       v27.8b, v6.8b , v7.8b , #5
    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row1)
    ext       v26.8b, v7.8b , v7.8b , #5

    uaddl     v16.8h, v27.8b, v6.8b     //// a0 + a5                             (column2,row1)
    ext       v31.8b, v2.8b , v3.8b , #2
    uaddl     v18.8h, v26.8b, v7.8b     //// a0 + a5                             (column3,row1)
    ext       v30.8b, v3.8b , v4.8b , #2
    umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row0)
    ext       v29.8b, v4.8b , v4.8b , #2
    umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row0)
    ext       v28.8b, v5.8b , v6.8b , #2
    umlal     v12.8h, v29.8b, v1.8b     //// a0 + a5 + 20a2                         (column3,row0)
    ext       v27.8b, v6.8b , v7.8b , #2
    umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row1)
    ext       v26.8b, v7.8b , v7.8b , #2

    umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row1)
    ext       v31.8b, v2.8b , v3.8b , #3
    umlal     v18.8h, v26.8b, v1.8b     //// a0 + a5 + 20a2                         (column3,row1)
    ext       v30.8b, v3.8b , v4.8b , #3
    umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    ext       v29.8b, v4.8b , v4.8b , #3
    umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row0)
    ext       v28.8b, v5.8b , v6.8b , #3
    umlal     v12.8h, v29.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column3,row0)
    ext       v27.8b, v6.8b , v7.8b , #3
    umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row1)
    ext       v26.8b, v7.8b , v7.8b , #3

    umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row1)
    ext       v31.8b, v2.8b , v3.8b , #1
    umlal     v18.8h, v26.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column3,row1)
    ext       v30.8b, v3.8b , v4.8b , #1
    umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    ext       v29.8b, v4.8b , v4.8b , #1
    umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row0)
    ext       v28.8b, v5.8b , v6.8b , #1
    umlsl     v12.8h, v29.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column3,row0)
    ext       v27.8b, v6.8b , v7.8b , #1
    umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row1)
    ext       v26.8b, v7.8b , v7.8b , #1

    umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row1)
    ext       v31.8b, v2.8b , v3.8b , #4
    umlsl     v18.8h, v26.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column3,row1)
    ext       v30.8b, v3.8b , v4.8b , #4
    umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    ext       v29.8b, v4.8b , v4.8b , #4
    umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row0)
    ext       v28.8b, v5.8b , v6.8b , #4
    umlsl     v12.8h, v29.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column3,row0)
    ext       v27.8b, v6.8b , v7.8b , #4
    umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row1)
    ext       v26.8b, v7.8b , v7.8b , #4

    umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row1)
    umlsl     v18.8h, v26.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column3,row1)

    sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    sqrshrun  v21.8b, v10.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    sqrshrun  v22.8b, v12.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    sqrshrun  v23.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row1)
    sqrshrun  v24.8b, v16.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row1)
    sqrshrun  v25.8b, v18.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row1)

    st1       {v20.8b, v21.8b}, [x1], #16 //// Store dest row0
    st1       {v22.h}[0], [x1], x3
    st1       {v23.8b, v24.8b}, [x1], #16 //// Store dest row1
    st1       {v25.h}[0], [x1], x3
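    //// Each row writes 18 bytes (16 above plus the 2-byte store, cf.
    //// halfpel_width = 17 + 1); since x3 holds dst_strd - 16, the two
    //// post-increments together advance x1 by exactly one destination stride.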

    subs      x14, x14, #2              // decrement counter

    bne       filter_horz_loop


    // LDMFD sp!,{pc}
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret




///**
//*******************************************************************************
//*
//* @brief
//*   This function implements a two stage cascaded six tap filter. It
//*    applies the six tap filter in the vertical direction on the
//*    predictor values, followed by applying the same filter in the
//*    horizontal direction on the output of the first stage. The six tap
//*    filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
//*    interpolation process"
//*    (filter runs for width = 17 and height = 17)
//* @par Description:
//*    The function interpolates
//*    the predictors first in the vertical direction and then in the
//*    horizontal direction to output the (1/2,1/2) sample. The output of the
//*    first stage of the filter is stored in the buffer pointed to by
//*    pi16_pred1 (only in C) in 16 bit precision.
//*
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst1
//*  UWORD8 pointer to the destination (vertical filtered output)
//*
//* @param[out] pu1_dst2
//*  UWORD8 pointer to the destination (output after applying the horizontal
//*  filter to the intermediate vertical output)
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride of pu1_dst
//*
//* @param[in] pi16_pred1
//*  Pointer to 16 bit intermediate buffer (used only in C)
//*
//* @param[in] pi16_pred1_strd
//*  integer destination stride of pi16_pred1
//*
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ih264e_sixtap_filter_2dvh_vert(UWORD8 *pu1_src,
//                                    UWORD8 *pu1_dst1,
//                                    UWORD8 *pu1_dst2,
//                                    WORD32 src_strd,
//                                    WORD32 dst_strd,
//                                    WORD32 *pi16_pred1,/* Pointer to 16 bit intermediate buffer (used only in C) */
//                                    WORD32 pi16_pred1_strd)
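
//// Reference flow (an illustrative sketch only, not the exact C model): stage 1
//// filters vertically and is kept in wider precision, stage 2 filters those
//// intermediate values horizontally:
////
////     /* stage 1: vertical 6-tap on the source samples of one column     */
////     mid[x] = p[x][-2] - 5*p[x][-1] + 20*p[x][0] + 20*p[x][1] - 5*p[x][2] + p[x][3];
////     pu1_dst1[x] = CLIP_U8((mid[x] + 16) >> 5);       /* vertical half-pel  */
////
////     /* stage 2: horizontal 6-tap on the stage-1 values                 */
////     acc = mid[x-2] - 5*mid[x-1] + 20*mid[x] + 20*mid[x+1] - 5*mid[x+2] + mid[x+3];
////     pu1_dst2[x] = CLIP_U8((acc + 512) >> 10);        /* (1/2,1/2) half-pel */
////
//// where p[x][k] is the source sample k rows below column x and CLIP_U8 denotes
//// clipping to [0, 255].  Below, the final (acc + 512) >> 10 is realised as a
//// plain >> 8 (shrn) followed by a rounding, saturating >> 2 (sqrshrun).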




        .global ih264e_sixtap_filter_2dvh_vert_av8

ih264e_sixtap_filter_2dvh_vert_av8:
    // STMFD sp!,{x10,x11,x12,x14}
    push_v_regs
    sxtw      x3, w3
    sxtw      x4, w4
    stp       x19, x20, [sp, #-16]!

//// x0 - pu1_ref
//// x3 - u4_ref_width

    //// Load six rows for vertical interpolation
    lsl       x12, x3, #1
    sub       x0, x0, x12
    sub       x0, x0, #2
    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x3
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x3
    ld1       {v8.8b, v9.8b, v10.8b}, [x0], x3
    mov       x12, #5
    ld1       {v11.8b, v12.8b, v13.8b}, [x0], x3
    mov       x14, #20
    ld1       {v14.8b, v15.8b, v16.8b}, [x0], x3
    mov       v0.h[0], w12
    mov       v0.h[1], w14
    ld1       {v17.8b, v18.8b, v19.8b}, [x0], x3
    movi      v1.8b, #20
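//// Constant setup: v0.h[0] = 5 and v0.h[1] = 20 feed the widened second
//// (horizontal) stage via smlsl/smlal, while v1 = 20 (and v31, reloaded to 5
//// inside the loop) feeds the 8-bit first (vertical) stage via umlal/umlsl.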

//// x12 - u2_buff1_width
//// x14 - u2_buff2_width
    mov       x12, x4
    add       x11, x1, #16

    mov       x14, x12

    mov       x10, #3 // loop counter
    sub       x16, x12, #8
    sub       x19, x14, #16
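
//// Loop structure: x10 counts three iterations of filter_2dvh_loop, each of
//// which produces six output rows (ROW 1 .. ROW 6 below).  On the final
//// iteration the sixth row is not computed (branch to filter_2dvh_skip_row),
//// giving the required 17 rows in total.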
filter_2dvh_loop:

    //// ////////////// ROW 1 ///////////////////////

//// Process the first vertical interpolated row
//// (each column is a 6-tap filter output)
    uaddl     v20.8h, v2.8b, v17.8b     //// a0 + a5                             (column1,row0)
    movi      v31.8b, #5
    umlal     v20.8h, v8.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row0)
    umlal     v20.8h, v11.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    umlsl     v20.8h, v5.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    umlsl     v20.8h, v14.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    mov       v21.d[0], v20.d[1]
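    //// v20.8h holds eight 16-bit stage-1 results; copying its upper half into
    //// v21.d[0] lets the ext instructions below form the shifted 4h tap
    //// vectors needed for the widened horizontal stage.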

    uaddl     v22.8h, v3.8b, v18.8b     //// a0 + a5                                (column2,row0)
    umlal     v22.8h, v9.8b, v1.8b      //// a0 + a5 + 20a2                        (column2,row0)
    umlal     v22.8h, v12.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column2,row0)
    umlsl     v22.8h, v6.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
    umlsl     v22.8h, v15.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
    ext       v30.8b, v20.8b , v21.8b , #4
    mov       v23.d[0], v22.d[1]


    uaddl     v24.8h, v4.8b, v19.8b     //// a0 + a5                                (column3,row0)
    ext       v29.8b, v20.8b , v21.8b , #6
    umlal     v24.8h, v10.8b, v1.8b     //// a0 + a5 + 20a2                        (column3,row0)
    umlal     v24.8h, v13.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column3,row0)
    umlsl     v24.8h, v7.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
    umlsl     v24.8h, v16.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
    mov       v25.d[0], v24.d[1]

    sqrshrun  v2.8b, v20.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    ext       v31.8b, v21.8b , v22.8b , #2
    sqrshrun  v3.8b, v22.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    ext       v28.8b, v20.8b , v21.8b , #2

    saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
    ext       v31.8b, v22.8b , v23.8b , #2
    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
    smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
    ext       v30.8b, v21.8b , v22.8b , #4

    sqrshrun  v4.8b, v24.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    ext       v29.8b, v21.8b , v22.8b , #6

    ext       v28.8b, v21.8b , v22.8b , #2
    saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
    smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
    smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
    smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
    smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
    ext       v31.8b, v23.8b , v24.8b , #2
    mov       v21.d[0], v20.d[1]
    ext       v2.8b, v2.8b , v3.8b , #2
    ext       v3.8b, v3.8b , v4.8b , #2
    ext       v4.8b, v4.8b , v4.8b , #2

    st1       {v2.8b, v3.8b}, [x1], x12 //// store row1 - 1,1/2 grid
    st1       {v4.h}[0], [x11], x12     //// store row1 - 1,1/2 grid

    ext       v30.8b, v22.8b , v23.8b , #4
    ext       v29.8b, v22.8b , v23.8b , #6

    saddl     v2.4s, v31.4h, v22.4h     //// a0 + a5                             (set3)
    ext       v28.8b, v22.8b , v23.8b , #2
    smlal     v2.4s, v30.4h, v0.h[1]    //// a0 + a5 + 20a2                         (set3)
    smlal     v2.4s, v29.4h, v0.h[1]    //// a0 + a5 + 20a2 + 20a3                  (set3)
    smlsl     v2.4s, v28.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
    smlsl     v2.4s, v23.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
    ext       v31.8b, v24.8b , v25.8b , #2

    shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
    ext       v30.8b, v23.8b , v24.8b , #4
    shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
    ext       v29.8b, v23.8b , v24.8b , #6

    saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
    ext       v28.8b, v23.8b , v24.8b , #2
    ext       v31.8b, v25.8b , v25.8b , #2
    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
    smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
    ext       v30.8b, v24.8b , v25.8b , #4

    saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
    ext       v29.8b, v24.8b , v25.8b , #6

    ext       v31.8b, v24.8b , v25.8b , #2
    shrn      v28.4h, v2.4s, #8         //// shift by 8 and later we will shift by 2 more with rounding     (set3)

    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x3 //// Load next Row data
    smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
    smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
    smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
    smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
    shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
    mov       v20.d[1], v21.d[0]
    sqrshrun  v26.8b, v20.8h, #2        //// half,half grid set1,2


    ////VQRSHRUN.s16    D27,Q14,#2            ;// half,half grid set3,4
    ////VSHRN.s32        D28,Q11,#8            ;// shift by 8 and later we will shift by 2 more with rounding     (set5)

    ////VQRSHRUN.s16    D28,Q14,#2            ;// half,half grid set5

    ////VST1.8        {D26,D27,D28},[x2],x14    ;// store 1/2,1/2 grid values
    //// ////////////// ROW 2 ///////////////////////

//// Process the next vertical interpolated row
//// (each column is a 6-tap filter output)
    uaddl     v20.8h, v5.8b, v2.8b      //// a0 + a5                             (column1,row0)
    movi      v31.8b, #5
    umlal     v20.8h, v11.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row0)
    umlal     v20.8h, v14.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    umlsl     v20.8h, v8.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    umlsl     v20.8h, v17.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    mov       v21.d[0], v20.d[1]

    mov       v28.d[1], v29.d[0]
    sqrshrun  v27.8b, v28.8h, #2        //// half,half grid set3,4

    shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)

    uaddl     v22.8h, v6.8b, v3.8b      //// a0 + a5                                (column2,row0)
    umlal     v22.8h, v12.8b, v1.8b     //// a0 + a5 + 20a2                        (column2,row0)
    umlal     v22.8h, v15.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column2,row0)
    umlsl     v22.8h, v9.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
    umlsl     v22.8h, v18.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
    mov       v23.d[0], v22.d[1]

    sqrshrun  v28.8b, v28.8h, #2        //// half,half grid set5
    ext       v30.8b, v20.8b , v21.8b , #4

    uaddl     v24.8h, v7.8b, v4.8b      //// a0 + a5                                (column3,row0)
    ext       v29.8b, v20.8b , v21.8b , #6
    umlal     v24.8h, v13.8b, v1.8b     //// a0 + a5 + 20a2                        (column3,row0)
    umlal     v24.8h, v16.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column3,row0)
    umlsl     v24.8h, v10.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
    umlsl     v24.8h, v19.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
    mov       v25.d[0], v24.d[1]

    st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1/2 grid values
    st1       {v28.h}[0], [x2], x19     //// store 1/2,1/2 grid values

    sqrshrun  v5.8b, v20.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    ext       v31.8b, v21.8b , v22.8b , #2
    sqrshrun  v6.8b, v22.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    ext       v28.8b, v20.8b , v21.8b , #2

    saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
    ext       v31.8b, v22.8b , v23.8b , #2
    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
    smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
    ext       v30.8b, v21.8b , v22.8b , #4

    sqrshrun  v7.8b, v24.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    ext       v29.8b, v21.8b , v22.8b , #6

    ext       v28.8b, v21.8b , v22.8b , #2
    saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
    smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
    smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
    smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
    smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
    ext       v31.8b, v23.8b , v24.8b , #2

    ext       v5.8b, v5.8b , v6.8b , #2
    ext       v6.8b, v6.8b , v7.8b , #2
    ext       v7.8b, v7.8b , v7.8b , #2

    st1       {v5.8b, v6.8b}, [x1], x12 //// store row1 - 1,1/2 grid
    st1       {v7.h}[0], [x11], x12     //// store row1 - 1,1/2 grid

    ext       v30.8b, v22.8b , v23.8b , #4
    ext       v29.8b, v22.8b , v23.8b , #6

    saddl     v6.4s, v31.4h, v22.4h     //// a0 + a5                             (set3)
    ext       v28.8b, v22.8b , v23.8b , #2
    smlal     v6.4s, v30.4h, v0.h[1]    //// a0 + a5 + 20a2                         (set3)
    smlal     v6.4s, v29.4h, v0.h[1]    //// a0 + a5 + 20a2 + 20a3                  (set3)
    smlsl     v6.4s, v28.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
    smlsl     v6.4s, v23.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
    ext       v31.8b, v24.8b , v25.8b , #2

    shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
    ext       v30.8b, v23.8b , v24.8b , #4
    shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
    ext       v29.8b, v23.8b , v24.8b , #6

    saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
    ext       v28.8b, v23.8b , v24.8b , #2
    ext       v31.8b, v25.8b , v25.8b , #2
    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
    smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
    ext       v30.8b, v24.8b , v25.8b , #4

    saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
    ext       v29.8b, v24.8b , v25.8b , #6

    ext       v31.8b, v24.8b , v25.8b , #2
    shrn      v28.4h, v6.4s, #8         //// shift by 8 and later we will shift by 2 more with rounding     (set3)

    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x3 //// Load next Row data
    smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
    smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
    smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
    smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
    shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
    mov       v20.d[1], v21.d[0]
    sqrshrun  v26.8b, v20.8h, #2        //// half,half grid set1,2


    ////VQRSHRUN.s16    D27,Q14,#2            ;// half,half grid set3,4
    ////VSHRN.s32        D28,Q11,#8            ;// shift by 8 and later we will shift by 2 more with rounding     (set5)

    ////VQRSHRUN.s16    D28,Q14,#2            ;// half,half grid set5

    ////VST1.8        {D26,D27,D28},[x2],x14    ;// store 1/2,1/2 grid values
    //// ////////////// ROW 3 ///////////////////////

//// Process the next vertical interpolated row
//// (each column is a 6-tap filter output)
    uaddl     v20.8h, v8.8b, v5.8b      //// a0 + a5                             (column1,row0)
    movi      v31.8b, #5
    umlal     v20.8h, v14.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row0)
    umlal     v20.8h, v17.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    umlsl     v20.8h, v11.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    umlsl     v20.8h, v2.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    mov       v21.d[0], v20.d[1]

    mov       v28.d[1], v29.d[0]
    sqrshrun  v27.8b, v28.8h, #2        //// half,half grid set3,4
    shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)

    uaddl     v22.8h, v9.8b, v6.8b      //// a0 + a5                                (column2,row0)
    umlal     v22.8h, v15.8b, v1.8b     //// a0 + a5 + 20a2                        (column2,row0)
    umlal     v22.8h, v18.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column2,row0)
    umlsl     v22.8h, v12.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
    umlsl     v22.8h, v3.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
    mov       v23.d[0], v22.d[1]

    sqrshrun  v28.8b, v28.8h, #2        //// half,half grid set5
    ext       v30.8b, v20.8b , v21.8b , #4

    uaddl     v24.8h, v10.8b, v7.8b     //// a0 + a5                                (column3,row0)
    ext       v29.8b, v20.8b , v21.8b , #6
    umlal     v24.8h, v16.8b, v1.8b     //// a0 + a5 + 20a2                        (column3,row0)
    umlal     v24.8h, v19.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column3,row0)
    umlsl     v24.8h, v13.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
    umlsl     v24.8h, v4.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
    mov       v25.d[0], v24.d[1]

    st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1/2 grid values
    st1       {v28.h}[0], [x2], x19     //// store 1/2,1/2 grid values

    sqrshrun  v8.8b, v20.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    ext       v31.8b, v21.8b , v22.8b , #2
    sqrshrun  v9.8b, v22.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    ext       v28.8b, v20.8b , v21.8b , #2

    saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
    ext       v31.8b, v22.8b , v23.8b , #2
    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
    smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
    ext       v30.8b, v21.8b , v22.8b , #4

    sqrshrun  v10.8b, v24.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    ext       v29.8b, v21.8b , v22.8b , #6

    ext       v28.8b, v21.8b , v22.8b , #2
    saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
    smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
    smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
    smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
    smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
    ext       v31.8b, v23.8b , v24.8b , #2

    ext       v8.8b, v8.8b , v9.8b , #2
    ext       v9.8b, v9.8b , v10.8b , #2
    ext       v10.8b, v10.8b , v10.8b , #2

    st1       {v8.8b, v9.8b}, [x1], x12 //// store row1 - 1,1/2 grid
    st1       {v10.h}[0], [x11], x12    //// store row1 - 1,1/2 grid

    ext       v30.8b, v22.8b , v23.8b , #4
    ext       v29.8b, v22.8b , v23.8b , #6

    saddl     v8.4s, v31.4h, v22.4h     //// a0 + a5                             (set3)
    ext       v28.8b, v22.8b , v23.8b , #2
    smlal     v8.4s, v30.4h, v0.h[1]    //// a0 + a5 + 20a2                         (set3)
    smlal     v8.4s, v29.4h, v0.h[1]    //// a0 + a5 + 20a2 + 20a3                  (set3)
    smlsl     v8.4s, v28.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
    smlsl     v8.4s, v23.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
    ext       v31.8b, v24.8b , v25.8b , #2

    shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
    ext       v30.8b, v23.8b , v24.8b , #4
    shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
    ext       v29.8b, v23.8b , v24.8b , #6

    saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
    ext       v28.8b, v23.8b , v24.8b , #2
    ext       v31.8b, v25.8b , v25.8b , #2
    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
    smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
    ext       v30.8b, v24.8b , v25.8b , #4

    saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
    ext       v29.8b, v24.8b , v25.8b , #6

    ext       v31.8b, v24.8b , v25.8b , #2
    shrn      v28.4h, v8.4s, #8         //// shift by 8 and later we will shift by 2 more with rounding     (set3)

    ld1       {v8.8b, v9.8b, v10.8b}, [x0], x3 //// Load next Row data
    smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
    smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
    smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
    smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
    shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
    mov       v20.d[1], v21.d[0]
    sqrshrun  v26.8b, v20.8h, #2        //// half,half grid set1,2


    ////VQRSHRUN.s16    D27,Q14,#2            ;// half,half grid set3,4
    ////VSHRN.s32        D28,Q11,#8            ;// shift by 8 and later we will shift by 2 more with rounding     (set5)

    ////VQRSHRUN.s16    D28,Q14,#2            ;// half,half grid set5

    ////VST1.8        {D26,D27,D28},[x2],x14    ;// store 1/2,1/2 grid values
    //// ////////////// ROW 4 ///////////////////////

//// Process the next vertical interpolated row
//// (each column is a 6-tap filter output)
    uaddl     v20.8h, v11.8b, v8.8b     //// a0 + a5                             (column1,row0)
    movi      v31.8b, #5
    umlal     v20.8h, v17.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row0)
    umlal     v20.8h, v2.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    umlsl     v20.8h, v14.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    umlsl     v20.8h, v5.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    mov       v21.d[0], v20.d[1]
    mov       v28.d[1], v29.d[0]
    sqrshrun  v27.8b, v28.8h, #2        //// half,half grid set3,4
    shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)

    uaddl     v22.8h, v12.8b, v9.8b     //// a0 + a5                                (column2,row0)
    umlal     v22.8h, v18.8b, v1.8b     //// a0 + a5 + 20a2                        (column2,row0)
    umlal     v22.8h, v3.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                (column2,row0)
    umlsl     v22.8h, v15.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
    umlsl     v22.8h, v6.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
    mov       v23.d[0], v22.d[1]

    sqrshrun  v28.8b, v28.8h, #2        //// half,half grid set5
    ext       v30.8b, v20.8b , v21.8b , #4

    uaddl     v24.8h, v13.8b, v10.8b    //// a0 + a5                                (column3,row0)
    ext       v29.8b, v20.8b , v21.8b , #6
    umlal     v24.8h, v19.8b, v1.8b     //// a0 + a5 + 20a2                        (column3,row0)
    umlal     v24.8h, v4.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                (column3,row0)
    umlsl     v24.8h, v16.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
    umlsl     v24.8h, v7.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
    mov       v25.d[0], v24.d[1]

    st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1/2 grid values
    st1       {v28.h}[0], [x2], x19     //// store 1/2,1/2 grid values

    sqrshrun  v11.8b, v20.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    ext       v31.8b, v21.8b , v22.8b , #2
    sqrshrun  v12.8b, v22.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    ext       v28.8b, v20.8b , v21.8b , #2

    saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
    ext       v31.8b, v22.8b , v23.8b , #2
    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
    smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
    ext       v30.8b, v21.8b , v22.8b , #4

    sqrshrun  v13.8b, v24.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    ext       v29.8b, v21.8b , v22.8b , #6

    ext       v28.8b, v21.8b , v22.8b , #2
    saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
    smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
    smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
    smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
    smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
    ext       v31.8b, v23.8b , v24.8b , #2

    ext       v11.8b, v11.8b , v12.8b , #2
    ext       v12.8b, v12.8b , v13.8b , #2
    ext       v13.8b, v13.8b , v13.8b , #2

    st1       {v11.8b, v12.8b}, [x1], x12 //// store row1 - 1,1/2 grid
    st1       {v13.h}[0], [x11], x12    //// store row1 - 1,1/2 grid

    ext       v30.8b, v22.8b , v23.8b , #4
    ext       v29.8b, v22.8b , v23.8b , #6

    saddl     v12.4s, v31.4h, v22.4h    //// a0 + a5                             (set3)
    ext       v28.8b, v22.8b , v23.8b , #2
    smlal     v12.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set3)
    smlal     v12.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set3)
    smlsl     v12.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
    smlsl     v12.4s, v23.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
    ext       v31.8b, v24.8b , v25.8b , #2

    shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
    ext       v30.8b, v23.8b , v24.8b , #4
    shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
    ext       v29.8b, v23.8b , v24.8b , #6

    saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
    ext       v28.8b, v23.8b , v24.8b , #2
    ext       v31.8b, v25.8b , v25.8b , #2
    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
    smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
    ext       v30.8b, v24.8b , v25.8b , #4

    saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
    ext       v29.8b, v24.8b , v25.8b , #6

    ext       v31.8b, v24.8b , v25.8b , #2
    shrn      v28.4h, v12.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set3)

    ld1       {v11.8b, v12.8b, v13.8b}, [x0], x3 //// Load next Row data
    smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
    smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
    smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
    smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
    shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
    mov       v20.d[1], v21.d[0]
    sqrshrun  v26.8b, v20.8h, #2        //// half,half grid set1,2


    ////VQRSHRUN.s16    D27,Q14,#2            ;// half,half grid set3,4
    ////VSHRN.s32        D28,Q11,#8            ;// shift by 8 and later we will shift by 2 more with rounding     (set5)

    ////VQRSHRUN.s16    D28,Q14,#2            ;// half,half grid set5

    ////VST1.8        {D26,D27,D28},[x2],x14    ;// store 1/2,1/2 grid values
    //// ////////////// ROW 5 ///////////////////////

//// Process the next vertical interpolated row
//// (each column is a 6-tap filter output)
    uaddl     v20.8h, v14.8b, v11.8b    //// a0 + a5                             (column1,row0)
    movi      v31.8b, #5
    umlal     v20.8h, v2.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row0)
    umlal     v20.8h, v5.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    umlsl     v20.8h, v17.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    umlsl     v20.8h, v8.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    mov       v21.d[0], v20.d[1]
    mov       v28.d[1], v29.d[0]
    sqrshrun  v27.8b, v28.8h, #2        //// half,half grid set3,4
    shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)

    uaddl     v22.8h, v15.8b, v12.8b    //// a0 + a5                                (column2,row0)
    umlal     v22.8h, v3.8b, v1.8b      //// a0 + a5 + 20a2                        (column2,row0)
    umlal     v22.8h, v6.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                (column2,row0)
    umlsl     v22.8h, v18.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
    umlsl     v22.8h, v9.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
    mov       v23.d[0], v22.d[1]

    sqrshrun  v28.8b, v28.8h, #2        //// half,half grid set5
    ext       v30.8b, v20.8b , v21.8b , #4

    uaddl     v24.8h, v16.8b, v13.8b    //// a0 + a5                                (column3,row0)
    ext       v29.8b, v20.8b , v21.8b , #6
    umlal     v24.8h, v4.8b, v1.8b      //// a0 + a5 + 20a2                        (column3,row0)
    umlal     v24.8h, v7.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                (column3,row0)
    umlsl     v24.8h, v19.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
    umlsl     v24.8h, v10.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
    mov       v25.d[0], v24.d[1]

    st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1/2 grid values
    st1       {v28.h}[0], [x2], x19     //// store 1/2,1/2 grid values

    sqrshrun  v14.8b, v20.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    ext       v31.8b, v21.8b , v22.8b , #2
    sqrshrun  v15.8b, v22.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    ext       v28.8b, v20.8b , v21.8b , #2

    saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
    ext       v31.8b, v22.8b , v23.8b , #2
    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
    smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
    ext       v30.8b, v21.8b , v22.8b , #4

    sqrshrun  v16.8b, v24.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    ext       v29.8b, v21.8b , v22.8b , #6

    ext       v28.8b, v21.8b , v22.8b , #2
    saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
    smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
    smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
    smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
    smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
    ext       v31.8b, v23.8b , v24.8b , #2

    ext       v14.8b, v14.8b , v15.8b , #2
    ext       v15.8b, v15.8b , v16.8b , #2
    ext       v16.8b, v16.8b , v16.8b , #2

    st1       {v14.8b, v15.8b}, [x1], x12 //// store row1 - 1,1/2 grid
    st1       {v16.h}[0], [x11], x12    //// store row1 - 1,1/2 grid

    ext       v30.8b, v22.8b , v23.8b , #4
    ext       v29.8b, v22.8b , v23.8b , #6

    saddl     v14.4s, v31.4h, v22.4h    //// a0 + a5                             (set3)
    ext       v28.8b, v22.8b , v23.8b , #2
    smlal     v14.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set3)
    smlal     v14.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set3)
    smlsl     v14.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
    smlsl     v14.4s, v23.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
    ext       v31.8b, v24.8b , v25.8b , #2

    shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
    ext       v30.8b, v23.8b , v24.8b , #4
    shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
    ext       v29.8b, v23.8b , v24.8b , #6

    saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
    ext       v28.8b, v23.8b , v24.8b , #2
    ext       v31.8b, v25.8b , v25.8b , #2
    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
    smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
    ext       v30.8b, v24.8b , v25.8b , #4

    saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
    ext       v29.8b, v24.8b , v25.8b , #6

    ext       v31.8b, v24.8b , v25.8b , #2
    shrn      v28.4h, v14.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set3)

    ld1       {v14.8b, v15.8b, v16.8b}, [x0], x3 //// Load next Row data
    smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
    smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
    smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
    smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
    shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
    mov       v20.d[1], v21.d[0]
    sqrshrun  v26.8b, v20.8h, #2        //// half,half grid set1,2


    ////VQRSHRUN.s16    D27,Q14,#2            ;// half,half grid set3,4
    ////VSHRN.s32        D28,Q11,#8            ;// shift by 8 and later we will shift by 2 more with rounding     (set5)

    ////VQRSHRUN.s16    D28,Q14,#2            ;// half,half grid set5

    ////VST1.8        {D26,D27,D28},[x2],x14    ;// store 1/2,1/2 grid values
    //// ////////////// ROW 6 ///////////////////////

//// Process the next vertical interpolated row
//// (each column is a 6-tap filter output)

    cmp       x10, #1                   //// if 17 rows are complete, skip the last row
    beq       filter_2dvh_skip_row
    uaddl     v20.8h, v17.8b, v14.8b    //// a0 + a5                             (column1,row0)
    movi      v31.8b, #5
    umlal     v20.8h, v5.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row0)
    umlal     v20.8h, v8.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    umlsl     v20.8h, v2.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    umlsl     v20.8h, v11.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    mov       v21.d[0], v20.d[1]
    mov       v28.d[1], v29.d[0]
    sqrshrun  v27.8b, v28.8h, #2        //// half,half grid set3,4
    shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)

    uaddl     v22.8h, v18.8b, v15.8b    //// a0 + a5                                (column2,row0)
    umlal     v22.8h, v6.8b, v1.8b      //// a0 + a5 + 20a2                        (column2,row0)
    umlal     v22.8h, v9.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                (column2,row0)
    umlsl     v22.8h, v3.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
    umlsl     v22.8h, v12.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
    mov       v23.d[0], v22.d[1]

    sqrshrun  v28.8b, v28.8h, #2        //// half,half grid set5
    ext       v30.8b, v20.8b , v21.8b , #4

    uaddl     v24.8h, v19.8b, v16.8b    //// a0 + a5                                (column3,row0)
    ext       v29.8b, v20.8b , v21.8b , #6
    umlal     v24.8h, v7.8b, v1.8b      //// a0 + a5 + 20a2                        (column3,row0)
    umlal     v24.8h, v10.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column3,row0)
    umlsl     v24.8h, v4.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
    umlsl     v24.8h, v13.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
    mov       v25.d[0], v24.d[1]

    st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1/2 grid values
    st1       {v28.h}[0], [x2], x19     //// store 1/2,1/2 grid values

    sqrshrun  v17.8b, v20.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    ext       v31.8b, v21.8b , v22.8b , #2
    sqrshrun  v18.8b, v22.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    ext       v28.8b, v20.8b , v21.8b , #2

    saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
    ext       v31.8b, v22.8b , v23.8b , #2
    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
    smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
    ext       v30.8b, v21.8b , v22.8b , #4

    sqrshrun  v19.8b, v24.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    ext       v29.8b, v21.8b , v22.8b , #6

    ext       v28.8b, v21.8b , v22.8b , #2
    saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
    smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
    smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
    smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
    smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
    ext       v31.8b, v23.8b , v24.8b , #2

    ext       v17.8b, v17.8b , v18.8b , #2
    ext       v18.8b, v18.8b , v19.8b , #2
    ext       v19.8b, v19.8b , v19.8b , #2

    st1       {v17.8b, v18.8b}, [x1], x12 //// store row1 - 1,1/2 grid
    st1       {v19.h}[0], [x11], x12    //// store row1 - 1,1/2 grid

    ext       v30.8b, v22.8b , v23.8b , #4
    ext       v29.8b, v22.8b , v23.8b , #6

    saddl     v18.4s, v31.4h, v22.4h    //// a0 + a5                             (set3)
    ext       v28.8b, v22.8b , v23.8b , #2
    smlal     v18.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set3)
    smlal     v18.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set3)
    smlsl     v18.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
    smlsl     v18.4s, v23.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
    ext       v31.8b, v24.8b , v25.8b , #2

    shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
    ext       v30.8b, v23.8b , v24.8b , #4
    shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
    ext       v29.8b, v23.8b , v24.8b , #6

    saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
    ext       v28.8b, v23.8b , v24.8b , #2
    ext       v31.8b, v25.8b , v25.8b , #2
    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
    smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
    ext       v30.8b, v24.8b , v25.8b , #4

    saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
    ext       v29.8b, v24.8b , v25.8b , #6

    ext       v31.8b, v24.8b , v25.8b , #2
    shrn      v28.4h, v18.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set3)

    ld1       {v17.8b, v18.8b, v19.8b}, [x0], x3 //// Load next Row data
    smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
    smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
    smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
    smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
    shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
    mov       v20.d[1], v21.d[0]
    sqrshrun  v26.8b, v20.8h, #2        //// half,half grid set1,2

    mov       v28.d[1], v29.d[0]
    sqrshrun  v27.8b, v28.8h, #2        //// half,half grid set3,4
    shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)

    sqrshrun  v28.8b, v28.8h, #2        //// half,half grid set5

    st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1/2 grid values
    st1       {v28.h}[0], [x2], x19     //// store 1/2,1/2 grid values

    subs      x10, x10, #1              //// decrement loop counter

    bne       filter_2dvh_loop


    // LDMFD sp!,{x10,x11,x12,pc}
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret

filter_2dvh_skip_row:
    mov       v28.d[1], v29.d[0]
    sqrshrun  v27.8b, v28.8h, #2        //// half,half grid set3,4
    shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)

    sqrshrun  v28.8b, v28.8h, #2        //// half,half grid set5

    st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1/2 grid values
    st1       {v28.h}[0], [x2], x19     //// store 1/2,1/2 grid values
    // LDMFD sp!,{x10,x11,x12,pc}
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret


///*****************************************