1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21//******************************************************************************
22//* @file
23//*  ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
24//*
25//* @brief
26//*  Contains function definitions for inter prediction  interpolation.
27//*
28//* @author
29//*  Mohit
30//*
31//* @par List of Functions:
32//*
33//*  - ih264_inter_pred_luma_horz_hpel_vert_qpel_av8()
34//*
35//* @remarks
36//*  None
37//*
38//*******************************************************************************
39//*/
40
41///* All the functions here are replicated from ih264_inter_pred_filters.c
42//
43
44///**
45///**
46///**
47//*******************************************************************************
48//*
49//* @brief
50//*   This function implements a two stage cascaded six tap filter. It
51//*    applies the six tap filter in the horizontal direction on the
52//*    predictor values, followed by applying the same filter in the
53//*    vertical direction on the output of the first stage. It then averages
54//*    the output of the 1st stage and the output of the 2nd stage to obtain
55//*    the quarter pel values. The six tap filtering operation is described
56//*    in sec 8.4.2.2.1 titled "Luma sample interpolation process".
57//*
58//* @par Description:
59//*    This function is called to obtain pixels lying at the following
60//*    locations: (1/2,1/4) or (1/2,3/4). The function interpolates
61//*    the predictors first in the horizontal direction and then in the
62//*    vertical direction to produce the (1/2,1/2) value. It then averages
63//*    the output of the 1st stage (horizontal half-pel row) and the (1/2,1/2)
64//*    value to obtain (1/2,1/4) or (1/2,3/4), depending on the offset (dydx).
65//*
66//* @param[in] pu1_src
67//*  UWORD8 pointer to the source
68//*
69//* @param[out] pu1_dst
70//*  UWORD8 pointer to the destination
71//*
72//* @param[in] src_strd
73//*  integer source stride
74//*
75//* @param[in] dst_strd
76//*  integer destination stride
77//*
78//* @param[in] ht
79//*  integer height of the array
80//*
81//* @param[in] wd
82//*  integer width of the array
83//*
84//* @param[in] pu1_tmp: temporary buffer
85//*
86//* @param[in] dydx: x and y reference offset for qpel calculations
87//*
88//* @returns
89//*
90//* @remarks
91//*  None
92//*
93//*******************************************************************************
94//*/;
95
96//void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src,
97//                                UWORD8 *pu1_dst,
98//                                WORD32 src_strd,
99//                                WORD32 dst_strd,
100//                                WORD32 ht,
101//                                WORD32 wd,
102//                                UWORD8 *pu1_tmp,
103//                                UWORD32 dydx)
104
105//**************Variables Vs Registers*****************************************
106//    x0 => *pu1_src
107//    x1 => *pu1_dst
108//    w2 =>  src_strd
109//    w3 =>  dst_strd
110//    w4 =>  ht
111//    w5 =>  wd
112//    x6 => *pu1_tmp
113//    w7 =>  dydx
114
115.text
116.p2align 2
117.include "ih264_neon_macros.s"
118
119
120
121    .global ih264_inter_pred_luma_horz_hpel_vert_qpel_av8
122
123ih264_inter_pred_luma_horz_hpel_vert_qpel_av8:
124
125
126    // store register values to stack
127    push_v_regs
128    stp       x19, x20, [sp, #-16]!
129    sxtw      x2, w2
130    sxtw      x3, w3
131    sxtw      x4, w4
132    sxtw      x5, w5
133
134
135
136    sub       x0, x0, x2, lsl #1        // pu1_src-2*src_strd
137    sub       x0, x0, #2                // pu1_src-2
138
139    mov       x9, x6
140
141                                        // by writing to w7 here, we clear the upper half of x7
142    lsr       w7, w7, #3                // dydx >> 2 followed by dydx & 0x3 and dydx>>1 to obtain the deciding bit
143
144    add       x7, x7, #2
145    mov       x6, #48
146    madd      x7, x7, x6, x9
147
148    subs      x12, x5, #4               //if wd=4 branch to loop_4
149    beq       loop_4_start
150
151    subs      x12, x5, #8               //if wd=8 branch to loop_8
152    beq       loop_8_start
153
154    //when  wd=16
155    movi      v22.8h, #20               // Filter coeff 0x14 into Q11
156    movi      v24.8h, #5                // Filter coeff 0x5  into Q12
157    add       x8, x0, #8
158    add       x14, x1, #8
159    add       x10, x9, #8
160    mov       x12, x4
161    add       x11, x7, #8
162loop_16_lowhalf_start:
163    ld1       {v0.2s, v1.2s}, [x0], x2  // row -2 load for horizontal filter
164    ext       v5.8b, v0.8b , v1.8b , #5
165    uaddl     v6.8h, v0.8b, v5.8b
166
167    ext       v2.8b, v0.8b , v1.8b , #2
168    ext       v3.8b, v0.8b , v1.8b , #3
169    uaddl     v8.8h, v2.8b, v3.8b
170    ext       v4.8b, v0.8b , v1.8b , #4
171    mla       v6.8h, v8.8h , v22.8h
172    ext       v1.8b, v0.8b , v1.8b , #1
173    uaddl     v8.8h, v1.8b, v4.8b
174    ld1       {v0.2s, v1.2s}, [x0], x2  // row -1 load for horizontal filter
175    mls       v6.8h, v8.8h , v24.8h
176    ext       v5.8b, v0.8b , v1.8b , #5
177    uaddl     v8.8h, v0.8b, v5.8b
178    ext       v2.8b, v0.8b , v1.8b , #2
179    ext       v3.8b, v0.8b , v1.8b , #3
180    uaddl     v10.8h, v2.8b, v3.8b
181
182    st1       {v6.4s}, [x9], x6         // store temp buffer 0
183
184    ext       v4.8b, v0.8b , v1.8b , #4
185    mla       v8.8h, v10.8h , v22.8h
186    ext       v1.8b, v0.8b , v1.8b , #1
187    uaddl     v10.8h, v1.8b, v4.8b
188    ld1       {v0.2s, v1.2s}, [x0], x2  // row 0 load for horizontal filter
189    mls       v8.8h, v10.8h , v24.8h
190    ext       v5.8b, v0.8b , v1.8b , #5
191    uaddl     v10.8h, v0.8b, v5.8b
192    ext       v2.8b, v0.8b , v1.8b , #2
193    ext       v3.8b, v0.8b , v1.8b , #3
194    uaddl     v12.8h, v2.8b, v3.8b
195
196    st1       {v8.4s}, [x9], x6         // store temp buffer 1
197
198    ext       v4.8b, v0.8b , v1.8b , #4
199    mla       v10.8h, v12.8h , v22.8h
200    ext       v1.8b, v0.8b , v1.8b , #1
201    uaddl     v12.8h, v1.8b, v4.8b
202    ld1       {v0.2s, v1.2s}, [x0], x2  // row 1 load for horizontal filter
203    mls       v10.8h, v12.8h , v24.8h
204    ext       v5.8b, v0.8b , v1.8b , #5
205    uaddl     v12.8h, v0.8b, v5.8b
206    ext       v2.8b, v0.8b , v1.8b , #2
207    ext       v3.8b, v0.8b , v1.8b , #3
208    uaddl     v14.8h, v2.8b, v3.8b
209
210    st1       {v10.4s}, [x9], x6        // store temp buffer 2
211
212    ext       v4.8b, v0.8b , v1.8b , #4
213    mla       v12.8h, v14.8h , v22.8h
214    ext       v1.8b, v0.8b , v1.8b , #1
215    uaddl     v14.8h, v1.8b, v4.8b
216    ld1       {v0.2s, v1.2s}, [x0], x2  // row 2 load for horizontal filter
217    mls       v12.8h, v14.8h , v24.8h
218    ext       v5.8b, v0.8b , v1.8b , #5
219    uaddl     v14.8h, v0.8b, v5.8b
220    ext       v2.8b, v0.8b , v1.8b , #2
221    ext       v3.8b, v0.8b , v1.8b , #3
222    uaddl     v16.8h, v2.8b, v3.8b
223
224    st1       {v12.4s}, [x9], x6        // store temp buffer 3
225
226    ext       v4.8b, v0.8b , v1.8b , #4
227    mla       v14.8h, v16.8h , v22.8h
228    ext       v1.8b, v0.8b , v1.8b , #1
229    uaddl     v16.8h, v1.8b, v4.8b
230
231    mls       v14.8h, v16.8h , v24.8h
232loop_16_lowhalf:
233
234    ld1       {v0.2s, v1.2s}, [x0], x2  // row 3 load for horizontal filter
235    ext       v5.8b, v0.8b , v1.8b , #5
236    ext       v2.8b, v0.8b , v1.8b , #2
237    ext       v3.8b, v0.8b , v1.8b , #3
238    uaddl     v16.8h, v0.8b, v5.8b
239
240    st1       {v14.4s}, [x9], x6        // store temp buffer 4
241
242    uaddl     v18.8h, v2.8b, v3.8b
243    ext       v4.8b, v0.8b , v1.8b , #4
244    mla       v16.8h, v18.8h , v22.8h
245    ext       v1.8b, v0.8b , v1.8b , #1
246    add       v28.8h, v8.8h , v14.8h
247    uaddl     v18.8h, v1.8b, v4.8b
248    add       v30.8h, v10.8h , v12.8h
249    mls       v16.8h, v18.8h , v24.8h
250    ld1       {v0.2s, v1.2s}, [x0], x2  // row 4 load for hoorizontal filter
251    ext       v5.8b, v0.8b , v1.8b , #5
252    ext       v2.8b, v0.8b , v1.8b , #2
253    ext       v3.8b, v0.8b , v1.8b , #3
254    uaddl     v20.8h, v0.8b, v5.8b
255
256    st1       {v16.4s}, [x9], x6        // store temp buffer x5
257
258    saddl     v18.4s, v6.4h, v16.4h
259
260    ld1       {v26.4s}, [x7], x6        // load from temp buffer 0
261
262    saddl2    v6.4s, v6.8h, v16.8h
263
264    sqrshrun  v26.8b, v26.8h, #5
265
266    smlal     v18.4s, v30.4h, v22.4h
267    smlsl     v18.4s, v28.4h, v24.4h
268    smlal2    v6.4s, v30.8h, v22.8h
269    smlsl2    v6.4s, v28.8h, v24.8h
270    uaddl     v2.8h, v2.8b, v3.8b
271    ext       v4.8b, v0.8b , v1.8b , #4
272    mla       v20.8h, v2.8h , v22.8h
273    sqrshrun  v18.4h, v18.4s, #10
274    ext       v1.8b, v0.8b , v1.8b , #1
275    sqrshrun  v19.4h, v6.4s, #10
276    add       v28.8h, v10.8h , v16.8h
277    uaddl     v2.8h, v1.8b, v4.8b
278    add       v30.8h, v12.8h , v14.8h
279    mls       v20.8h, v2.8h , v24.8h
280
281    uqxtn     v18.8b, v18.8h
282    uqxtn     v19.8b, v19.8h
283    mov       v18.s[1], v19.s[0]
284
285    ld1       {v0.2s, v1.2s}, [x0], x2  // row 5 load for horizontal filter
286
287    urhadd    v26.8b, v18.8b , v26.8b
288
289    ext       v5.8b, v0.8b , v1.8b , #5
290    ext       v2.8b, v0.8b , v1.8b , #2
291
292    st1       {v20.4s}, [x9], x6        // store temp buffer x6
293
294    saddl     v18.4s, v8.4h, v20.4h
295
296    saddl2    v6.4s, v8.8h, v20.8h
297
298    ld1       {v8.4s}, [x7], x6         //load from temp buffer 1
299
300
301    st1       {v26.2s}, [x1], x3        // store row 0
302
303    smlal     v18.4s, v30.4h, v22.4h
304    smlsl     v18.4s, v28.4h, v24.4h
305    smlal2    v6.4s, v30.8h, v22.8h
306    smlsl2    v6.4s, v28.8h, v24.8h
307
308    sqrshrun  v28.8b, v8.8h, #5
309    ext       v3.8b, v0.8b , v1.8b , #3
310    uaddl     v8.8h, v0.8b, v5.8b
311    uaddl     v2.8h, v2.8b, v3.8b
312    sqrshrun  v18.4h, v18.4s, #10
313    ext       v4.8b, v0.8b , v1.8b , #4
314    sqrshrun  v19.4h, v6.4s, #10
315    mla       v8.8h, v2.8h , v22.8h
316    ext       v1.8b, v0.8b , v1.8b , #1
317    add       v26.8h, v12.8h , v20.8h
318    uaddl     v2.8h, v1.8b, v4.8b
319    uqxtn     v18.8b, v18.8h
320    uqxtn     v19.8b, v19.8h
321    mov       v18.s[1], v19.s[0]
322    add       v30.8h, v14.8h , v16.8h
323    mls       v8.8h, v2.8h , v24.8h
324    ld1       {v0.2s, v1.2s}, [x0], x2  // row 6 load for horizontal filter
325
326    urhadd    v28.8b, v28.8b , v18.8b
327
328    ext       v5.8b, v0.8b , v1.8b , #5
329    ext       v2.8b, v0.8b , v1.8b , #2
330    ext       v3.8b, v0.8b , v1.8b , #3
331
332    st1       {v28.2s}, [x1], x3        // store row 1
333
334    uaddl     v28.8h, v0.8b, v5.8b
335
336    st1       {v8.4s}, [x9], x6         // store temp buffer x7
337
338    saddl     v18.4s, v10.4h, v8.4h
339    saddl2    v6.4s, v10.8h, v8.8h
340
341    ld1       {v10.4s}, [x7], x6        // load from temp buffer 2
342
343    smlal     v18.4s, v30.4h, v22.4h
344    smlsl     v18.4s, v26.4h, v24.4h
345
346    smlal2    v6.4s, v30.8h, v22.8h
347    smlsl2    v6.4s, v26.8h, v24.8h
348
349    sqrshrun  v26.8b, v10.8h, #5
350
351    uaddl     v2.8h, v2.8b, v3.8b
352    ext       v4.8b, v0.8b , v1.8b , #4
353    mla       v28.8h, v2.8h , v22.8h
354    sqrshrun  v18.4h, v18.4s, #10
355    ext       v1.8b, v0.8b , v1.8b , #1
356    sqrshrun  v19.4h, v6.4s, #10
357    add       v10.8h, v14.8h , v8.8h
358    uaddl     v2.8h, v1.8b, v4.8b
359    add       v30.8h, v16.8h , v20.8h
360    mls       v28.8h, v2.8h , v24.8h
361    uqxtn     v27.8b, v18.8h
362    uqxtn     v19.8b, v19.8h
363    mov       v27.s[1], v19.s[0]
364    saddl     v18.4s, v12.4h, v28.4h
365    saddl2    v6.4s, v12.8h, v28.8h
366
367    urhadd    v26.8b, v26.8b , v27.8b
368
369    smlal     v18.4s, v30.4h, v22.4h
370    smlsl     v18.4s, v10.4h, v24.4h
371    smlal2    v6.4s, v30.8h, v22.8h
372    smlsl2    v6.4s, v10.8h, v24.8h
373
374    st1       {v26.2s}, [x1], x3        // store row 2
375
376    st1       {v28.2s, v29.2s}, [x9]
377
378
379    sqrshrun  v18.4h, v18.4s, #10
380
381    mov       v10.16b, v20.16b
382    mov       v11.16b, v21.16b
383    ld1       {v30.4s}, [x7], x6        // load from temp buffer 3
384
385    sqrshrun  v19.4h, v6.4s, #10
386    subs      x4, x4, #4
387
388    sqrshrun  v30.8b, v30.8h, #5
389
390    uqxtn     v18.8b, v18.8h
391    uqxtn     v19.8b, v19.8h
392    mov       v18.s[1], v19.s[0]
393
394    mov       v12.16b, v8.16b
395    mov       v13.16b, v9.16b
396    mov       v6.16b, v14.16b
397    mov       v7.16b, v15.16b
398
399    urhadd    v30.8b, v18.8b , v30.8b
400
401    mov       v8.16b, v16.16b
402    mov       v9.16b, v17.16b
403    mov       v14.16b, v28.16b
404    mov       v15.16b, v29.16b
405
406    st1       {v30.2s}, [x1], x3        // store row 3
407
408    bgt       loop_16_lowhalf           // looping if height =16
409
410
411loop_16_highhalf_start:
412    ld1       {v0.2s, v1.2s}, [x8], x2
413    ext       v5.8b, v0.8b , v1.8b , #5
414    uaddl     v6.8h, v0.8b, v5.8b
415    ext       v2.8b, v0.8b , v1.8b , #2
416    ext       v3.8b, v0.8b , v1.8b , #3
417    uaddl     v8.8h, v2.8b, v3.8b
418    ext       v4.8b, v0.8b , v1.8b , #4
419    mla       v6.8h, v8.8h , v22.8h
420    ext       v1.8b, v0.8b , v1.8b , #1
421    uaddl     v8.8h, v1.8b, v4.8b
422    ld1       {v0.2s, v1.2s}, [x8], x2
423    mls       v6.8h, v8.8h , v24.8h
424    ext       v5.8b, v0.8b , v1.8b , #5
425    uaddl     v8.8h, v0.8b, v5.8b
426    ext       v2.8b, v0.8b , v1.8b , #2
427    ext       v3.8b, v0.8b , v1.8b , #3
428    uaddl     v10.8h, v2.8b, v3.8b
429
430    st1       {v6.4s}, [x10], x6
431
432    ext       v4.8b, v0.8b , v1.8b , #4
433    mla       v8.8h, v10.8h , v22.8h
434    ext       v1.8b, v0.8b , v1.8b , #1
435    uaddl     v10.8h, v1.8b, v4.8b
436    ld1       {v0.2s, v1.2s}, [x8], x2
437    mls       v8.8h, v10.8h , v24.8h
438    ext       v5.8b, v0.8b , v1.8b , #5
439    uaddl     v10.8h, v0.8b, v5.8b
440    ext       v2.8b, v0.8b , v1.8b , #2
441    ext       v3.8b, v0.8b , v1.8b , #3
442    uaddl     v12.8h, v2.8b, v3.8b
443
444    st1       {v8.4s}, [x10], x6
445
446    ext       v4.8b, v0.8b , v1.8b , #4
447    mla       v10.8h, v12.8h , v22.8h
448    ext       v1.8b, v0.8b , v1.8b , #1
449    uaddl     v12.8h, v1.8b, v4.8b
450    ld1       {v0.2s, v1.2s}, [x8], x2
451    mls       v10.8h, v12.8h , v24.8h
452    ext       v5.8b, v0.8b , v1.8b , #5
453    uaddl     v12.8h, v0.8b, v5.8b
454    ext       v2.8b, v0.8b , v1.8b , #2
455    ext       v3.8b, v0.8b , v1.8b , #3
456    uaddl     v14.8h, v2.8b, v3.8b
457
458    st1       {v10.4s}, [x10], x6
459
460    ext       v4.8b, v0.8b , v1.8b , #4
461    mla       v12.8h, v14.8h , v22.8h
462    ext       v1.8b, v0.8b , v1.8b , #1
463    uaddl     v14.8h, v1.8b, v4.8b
464    ld1       {v0.2s, v1.2s}, [x8], x2
465    mls       v12.8h, v14.8h , v24.8h
466    ext       v5.8b, v0.8b , v1.8b , #5
467    uaddl     v14.8h, v0.8b, v5.8b
468    ext       v2.8b, v0.8b , v1.8b , #2
469    ext       v3.8b, v0.8b , v1.8b , #3
470    uaddl     v16.8h, v2.8b, v3.8b
471
472    st1       {v12.4s}, [x10], x6
473
474    ext       v4.8b, v0.8b , v1.8b , #4
475    mla       v14.8h, v16.8h , v22.8h
476    ext       v1.8b, v0.8b , v1.8b , #1
477    uaddl     v16.8h, v1.8b, v4.8b
478
479    mls       v14.8h, v16.8h , v24.8h
480
481loop_16_highhalf:
482
483    ld1       {v0.2s, v1.2s}, [x8], x2
484    ext       v5.8b, v0.8b , v1.8b , #5
485    ext       v2.8b, v0.8b , v1.8b , #2
486    ext       v3.8b, v0.8b , v1.8b , #3
487    uaddl     v16.8h, v0.8b, v5.8b
488
489    st1       {v14.4s}, [x10], x6
490
491    uaddl     v18.8h, v2.8b, v3.8b
492    ext       v4.8b, v0.8b , v1.8b , #4
493    mla       v16.8h, v18.8h , v22.8h
494    ext       v1.8b, v0.8b , v1.8b , #1
495    add       v28.8h, v8.8h , v14.8h
496    uaddl     v18.8h, v1.8b, v4.8b
497    add       v30.8h, v10.8h , v12.8h
498    mls       v16.8h, v18.8h , v24.8h
499    ld1       {v0.2s, v1.2s}, [x8], x2
500    ext       v5.8b, v0.8b , v1.8b , #5
501    ext       v2.8b, v0.8b , v1.8b , #2
502    ext       v3.8b, v0.8b , v1.8b , #3
503    uaddl     v20.8h, v0.8b, v5.8b
504
505    st1       {v16.4s}, [x10], x6
506
507    saddl     v18.4s, v6.4h, v16.4h
508
509    ld1       {v26.4s}, [x11], x6
510
511    saddl2    v6.4s, v6.8h, v16.8h
512
513    sqrshrun  v26.8b, v26.8h, #5
514
515    smlal     v18.4s, v30.4h, v22.4h
516    smlsl     v18.4s, v28.4h, v24.4h
517    smlal2    v6.4s, v30.8h, v22.8h
518    smlsl2    v6.4s, v28.8h, v24.8h
519    uaddl     v2.8h, v2.8b, v3.8b
520    ext       v4.8b, v0.8b , v1.8b , #4
521    mla       v20.8h, v2.8h , v22.8h
522    sqrshrun  v18.4h, v18.4s, #10
523    ext       v1.8b, v0.8b , v1.8b , #1
524    sqrshrun  v19.4h, v6.4s, #10
525    add       v28.8h, v10.8h , v16.8h
526    uaddl     v2.8h, v1.8b, v4.8b
527    add       v30.8h, v12.8h , v14.8h
528    mls       v20.8h, v2.8h , v24.8h
529    uqxtn     v18.8b, v18.8h
530    uqxtn     v19.8b, v19.8h
531    mov       v18.s[1], v19.s[0]
532    ld1       {v0.2s, v1.2s}, [x8], x2
533
534    urhadd    v26.8b, v18.8b , v26.8b
535
536    ext       v5.8b, v0.8b , v1.8b , #5
537    ext       v2.8b, v0.8b , v1.8b , #2
538
539    st1       {v20.4s}, [x10], x6
540
541    saddl     v18.4s, v8.4h, v20.4h
542    saddl2    v6.4s, v8.8h, v20.8h
543
544    ld1       {v8.4s}, [x11], x6
545
546
547    st1       {v26.2s}, [x14], x3       //store row 0
548
549    smlal     v18.4s, v30.4h, v22.4h
550    smlsl     v18.4s, v28.4h, v24.4h
551    smlal2    v6.4s, v30.8h, v22.8h
552    smlsl2    v6.4s, v28.8h, v24.8h
553    sqrshrun  v28.8b, v8.8h, #5
554    ext       v3.8b, v0.8b , v1.8b , #3
555    uaddl     v8.8h, v0.8b, v5.8b
556    uaddl     v2.8h, v2.8b, v3.8b
557    sqrshrun  v18.4h, v18.4s, #10
558    ext       v4.8b, v0.8b , v1.8b , #4
559    sqrshrun  v19.4h, v6.4s, #10
560    mla       v8.8h, v2.8h , v22.8h
561    ext       v1.8b, v0.8b , v1.8b , #1
562    add       v26.8h, v12.8h , v20.8h
563    uaddl     v2.8h, v1.8b, v4.8b
564    uqxtn     v18.8b, v18.8h
565    uqxtn     v19.8b, v19.8h
566    mov       v18.s[1], v19.s[0]
567    add       v30.8h, v14.8h , v16.8h
568    mls       v8.8h, v2.8h , v24.8h
569    ld1       {v0.2s, v1.2s}, [x8], x2
570
571    urhadd    v28.8b, v28.8b , v18.8b
572
573    ext       v5.8b, v0.8b , v1.8b , #5
574    ext       v2.8b, v0.8b , v1.8b , #2
575    ext       v3.8b, v0.8b , v1.8b , #3
576
577    st1       {v28.2s}, [x14], x3       //store row 1
578
579    uaddl     v28.8h, v0.8b, v5.8b
580
581    st1       {v8.4s}, [x10], x6
582
583    saddl     v18.4s, v10.4h, v8.4h
584    saddl2    v6.4s, v10.8h, v8.8h
585
586    ld1       {v10.4s}, [x11], x6
587
588    smlal     v18.4s, v30.4h, v22.4h
589    smlsl     v18.4s, v26.4h, v24.4h
590    smlal2    v6.4s, v30.8h, v22.8h
591    smlsl2    v6.4s, v26.8h, v24.8h
592
593    sqrshrun  v26.8b, v10.8h, #5
594    uaddl     v2.8h, v2.8b, v3.8b
595    ext       v4.8b, v0.8b , v1.8b , #4
596    mla       v28.8h, v2.8h , v22.8h
597    sqrshrun  v18.4h, v18.4s, #10
598    ext       v1.8b, v0.8b , v1.8b , #1
599    sqrshrun  v19.4h, v6.4s, #10
600    add       v10.8h, v14.8h , v8.8h
601    uaddl     v2.8h, v1.8b, v4.8b
602    add       v30.8h, v16.8h , v20.8h
603    mls       v28.8h, v2.8h , v24.8h
604    uqxtn     v27.8b, v18.8h
605    uqxtn     v19.8b, v19.8h
606    mov       v27.s[1], v19.s[0]
607
608
609    saddl     v18.4s, v12.4h, v28.4h
610    saddl2    v6.4s, v12.8h, v28.8h
611
612    urhadd    v26.8b, v26.8b , v27.8b
613
614    smlal     v18.4s, v30.4h, v22.4h
615    smlsl     v18.4s, v10.4h, v24.4h
616    smlal2    v6.4s, v30.8h, v22.8h
617    smlsl2    v6.4s, v10.8h, v24.8h
618
619    st1       {v26.2s}, [x14], x3       // store row 2
620
621    st1       {v28.4s}, [x10]
622
623    sqrshrun  v18.4h, v18.4s, #10
624    mov       v10.16b, v20.16b
625    mov       v11.16b, v21.16b
626    ld1       {v30.4s}, [x11], x6
627
628    sqrshrun  v19.4h, v6.4s, #10
629    subs      x12, x12, #4
630
631    sqrshrun  v30.8b, v30.8h, #5
632
633    uqxtn     v18.8b, v18.8h
634    uqxtn     v19.8b, v19.8h
635    mov       v18.s[1], v19.s[0]
636
637    mov       v12.16b, v8.16b
638    mov       v13.16b, v9.16b
639    mov       v6.16b, v14.16b
640    mov       v7.16b, v15.16b
641    urhadd    v30.8b, v18.8b , v30.8b
642
643    mov       v8.16b, v16.16b
644    mov       v9.16b, v17.16b
645    mov       v14.16b, v28.16b
646    mov       v15.16b, v29.16b
647    st1       {v30.2s}, [x14], x3       // store row 3
648
649    bgt       loop_16_highhalf          // looping if height = 8 or 16
650    b         end_func
651
652loop_8_start:
653
654    movi      v22.8h, #0x14             // Filter coeff 20 into Q11
655    movi      v24.8h, #5                // Filter coeff 5  into Q12
656    ld1       {v0.2s, v1.2s}, [x0], x2  // row -2 load for horizontal filter
657    ext       v5.8b, v0.8b , v1.8b , #5
658    uaddl     v6.8h, v0.8b, v5.8b
659
660    ext       v2.8b, v0.8b , v1.8b , #2
661    ext       v3.8b, v0.8b , v1.8b , #3
662    uaddl     v8.8h, v2.8b, v3.8b
663    ext       v4.8b, v0.8b , v1.8b , #4
664    mla       v6.8h, v8.8h , v22.8h
665    ext       v1.8b, v0.8b , v1.8b , #1
666    uaddl     v8.8h, v1.8b, v4.8b
667    ld1       {v0.2s, v1.2s}, [x0], x2  // row -1 load for horizontal filter
668    mls       v6.8h, v8.8h , v24.8h
669    ext       v5.8b, v0.8b , v1.8b , #5
670    uaddl     v8.8h, v0.8b, v5.8b
671    ext       v2.8b, v0.8b , v1.8b , #2
672    ext       v3.8b, v0.8b , v1.8b , #3
673    uaddl     v10.8h, v2.8b, v3.8b
674
675    st1       {v6.4s}, [x9], x6         // store temp buffer 0
676
677    ext       v4.8b, v0.8b , v1.8b , #4
678    mla       v8.8h, v10.8h , v22.8h
679    ext       v1.8b, v0.8b , v1.8b , #1
680    uaddl     v10.8h, v1.8b, v4.8b
681    ld1       {v0.2s, v1.2s}, [x0], x2  // row 0 load for horizontal filter
682    mls       v8.8h, v10.8h , v24.8h
683    ext       v5.8b, v0.8b , v1.8b , #5
684    uaddl     v10.8h, v0.8b, v5.8b
685    ext       v2.8b, v0.8b , v1.8b , #2
686    ext       v3.8b, v0.8b , v1.8b , #3
687    uaddl     v12.8h, v2.8b, v3.8b
688
689    st1       {v8.4s}, [x9], x6         // store temp buffer 1
690
691    ext       v4.8b, v0.8b , v1.8b , #4
692    mla       v10.8h, v12.8h , v22.8h
693    ext       v1.8b, v0.8b , v1.8b , #1
694    uaddl     v12.8h, v1.8b, v4.8b
695    ld1       {v0.2s, v1.2s}, [x0], x2  // row 1 load for horizontal filter
696    mls       v10.8h, v12.8h , v24.8h
697    ext       v5.8b, v0.8b , v1.8b , #5
698    uaddl     v12.8h, v0.8b, v5.8b
699    ext       v2.8b, v0.8b , v1.8b , #2
700    ext       v3.8b, v0.8b , v1.8b , #3
701    uaddl     v14.8h, v2.8b, v3.8b
702
703    st1       {v10.4s}, [x9], x6        // store temp buffer 2
704
705    ext       v4.8b, v0.8b , v1.8b , #4
706    mla       v12.8h, v14.8h , v22.8h
707    ext       v1.8b, v0.8b , v1.8b , #1
708    uaddl     v14.8h, v1.8b, v4.8b
709    ld1       {v0.2s, v1.2s}, [x0], x2  // row 2 load for horizontal filter
710    mls       v12.8h, v14.8h , v24.8h
711    ext       v5.8b, v0.8b , v1.8b , #5
712    uaddl     v14.8h, v0.8b, v5.8b
713    ext       v2.8b, v0.8b , v1.8b , #2
714    ext       v3.8b, v0.8b , v1.8b , #3
715    uaddl     v16.8h, v2.8b, v3.8b
716
717    st1       {v12.4s}, [x9], x6        // store temp buffer 3
718
719    ext       v4.8b, v0.8b , v1.8b , #4
720    mla       v14.8h, v16.8h , v22.8h
721    ext       v1.8b, v0.8b , v1.8b , #1
722    uaddl     v16.8h, v1.8b, v4.8b
723
724    mls       v14.8h, v16.8h , v24.8h
725loop_8:
726
727    ld1       {v0.2s, v1.2s}, [x0], x2  // row 3 load for horizontal filter
728    ext       v5.8b, v0.8b , v1.8b , #5
729    ext       v2.8b, v0.8b , v1.8b , #2
730    ext       v3.8b, v0.8b , v1.8b , #3
731    uaddl     v16.8h, v0.8b, v5.8b
732
733    st1       {v14.4s}, [x9], x6        // store temp buffer 4
734
735    uaddl     v18.8h, v2.8b, v3.8b
736    ext       v4.8b, v0.8b , v1.8b , #4
737    mla       v16.8h, v18.8h , v22.8h
738    ext       v1.8b, v0.8b , v1.8b , #1
739    add       v28.8h, v8.8h , v14.8h
740    uaddl     v18.8h, v1.8b, v4.8b
741    add       v30.8h, v10.8h , v12.8h
742    mls       v16.8h, v18.8h , v24.8h
743    ld1       {v0.2s, v1.2s}     , [x0], x2 // row 4 load for hoorizontal filter
744    ext       v5.8b, v0.8b , v1.8b , #5
745    ext       v2.8b, v0.8b , v1.8b , #2
746    ext       v3.8b, v0.8b , v1.8b , #3
747    uaddl     v20.8h, v0.8b, v5.8b
748
749    st1       {v16.4s}, [x9], x6        // store temp buffer x5
750
751    saddl     v18.4s, v6.4h, v16.4h
752
753    ld1       {v26.4s}, [x7], x6        // load from temp buffer 0
754
755    saddl2    v6.4s, v6.8h, v16.8h
756
757    sqrshrun  v26.8b, v26.8h, #5
758
759    smlal     v18.4s, v30.4h, v22.4h
760    smlsl     v18.4s, v28.4h, v24.4h
761    smlal2    v6.4s, v30.8h, v22.8h
762    smlsl2    v6.4s, v28.8h, v24.8h
763    uaddl     v2.8h, v2.8b, v3.8b
764    ext       v4.8b, v0.8b , v1.8b , #4
765    mla       v20.8h, v2.8h , v22.8h
766    sqrshrun  v18.4h, v18.4s, #10
767    ext       v1.8b, v0.8b , v1.8b , #1
768    sqrshrun  v19.4h, v6.4s, #10
769    add       v28.8h, v10.8h , v16.8h
770    uaddl     v2.8h, v1.8b, v4.8b
771    add       v30.8h, v12.8h , v14.8h
772    mls       v20.8h, v2.8h , v24.8h
773
774    uqxtn     v18.8b, v18.8h
775    uqxtn     v19.8b, v19.8h
776    mov       v18.s[1], v19.s[0]
777
778    ld1       {v0.2s, v1.2s}, [x0], x2  // row 5 load for horizontal filter
779
780    urhadd    v26.8b, v18.8b , v26.8b
781
782    ext       v5.8b, v0.8b , v1.8b , #5
783    ext       v2.8b, v0.8b , v1.8b , #2
784
785    st1       {v20.4s}, [x9], x6        // store temp buffer x6
786
787    saddl     v18.4s, v8.4h, v20.4h
788
789    saddl2    v6.4s, v8.8h, v20.8h
790
791    ld1       {v8.4s}, [x7], x6         //load from temp buffer 1
792
793
794    st1       {v26.2s}, [x1], x3        // store row 0
795
796    smlal     v18.4s, v30.4h, v22.4h
797    smlsl     v18.4s, v28.4h, v24.4h
798
799
800
801    smlal2    v6.4s, v30.8h, v22.8h
802    smlsl2    v6.4s, v28.8h, v24.8h
803
804    sqrshrun  v28.8b, v8.8h, #5
805
806    ext       v3.8b, v0.8b , v1.8b , #3
807    uaddl     v8.8h, v0.8b, v5.8b
808    uaddl     v2.8h, v2.8b, v3.8b
809    sqrshrun  v18.4h, v18.4s, #10
810    ext       v4.8b, v0.8b , v1.8b , #4
811    sqrshrun  v19.4h, v6.4s, #10
812    mla       v8.8h, v2.8h , v22.8h
813    ext       v1.8b, v0.8b , v1.8b , #1
814    add       v26.8h, v12.8h , v20.8h
815    uaddl     v2.8h, v1.8b, v4.8b
816
817
818    uqxtn     v18.8b, v18.8h
819    uqxtn     v19.8b, v19.8h
820    mov       v18.s[1], v19.s[0]
821
822    add       v30.8h, v14.8h , v16.8h
823    mls       v8.8h, v2.8h , v24.8h
824    ld1       {v0.2s, v1.2s}, [x0], x2  // row 6 load for horizontal filter
825
826    urhadd    v28.8b, v28.8b , v18.8b
827
828    ext       v5.8b, v0.8b , v1.8b , #5
829    ext       v2.8b, v0.8b , v1.8b , #2
830    ext       v3.8b, v0.8b , v1.8b , #3
831
832    st1       {v28.2s}, [x1], x3        // store row 1
833
834    uaddl     v28.8h, v0.8b, v5.8b
835
836    st1       {v8.4s}, [x9], x6         // store temp buffer x7
837
838    saddl     v18.4s, v10.4h, v8.4h
839    saddl2    v6.4s, v10.8h, v8.8h
840
841    ld1       {v10.4s}, [x7], x6        // load from temp buffer 2
842
843    smlal     v18.4s, v30.4h, v22.4h
844    smlsl     v18.4s, v26.4h, v24.4h
845    smlal2    v6.4s, v30.8h, v22.8h
846    smlsl2    v6.4s, v26.8h, v24.8h
847
848    sqrshrun  v26.8b, v10.8h, #5
849    uaddl     v2.8h, v2.8b, v3.8b
850    ext       v4.8b, v0.8b , v1.8b , #4
851    mla       v28.8h, v2.8h , v22.8h
852    sqrshrun  v18.4h, v18.4s, #10
853    ext       v1.8b, v0.8b , v1.8b , #1
854    sqrshrun  v19.4h, v6.4s, #10
855    add       v10.8h, v14.8h , v8.8h
856    uaddl     v2.8h, v1.8b, v4.8b
857    add       v30.8h, v16.8h , v20.8h
858    mls       v28.8h, v2.8h , v24.8h
859
860    uqxtn     v27.8b, v18.8h
861    uqxtn     v19.8b, v19.8h
862
863    mov       v27.s[1], v19.s[0]
864
865    saddl     v18.4s, v12.4h, v28.4h
866    saddl2    v6.4s, v12.8h, v28.8h
867
868    urhadd    v26.8b, v26.8b , v27.8b
869
870    smlal     v18.4s, v30.4h, v22.4h
871    smlsl     v18.4s, v10.4h, v24.4h
872    smlal2    v6.4s, v30.8h, v22.8h
873    smlsl2    v6.4s, v10.8h, v24.8h
874
875    st1       {v26.2s}, [x1], x3        // store row 2
876
877    st1       {v28.2s, v29.2s}, [x9]
878
879
880    sqrshrun  v18.4h, v18.4s, #10
881    mov       v10.16b, v20.16b
882    mov       v11.16b, v21.16b
883    ld1       {v30.4s}, [x7], x6        // load from temp buffer 3
884
885    sqrshrun  v19.4h, v6.4s, #10
886    subs      x4, x4, #4
887
888    sqrshrun  v30.8b, v30.8h, #5
889
890
891    uqxtn     v18.8b, v18.8h
892    uqxtn     v19.8b, v19.8h
893    mov       v18.s[1], v19.s[0]
894
895
896    mov       v12.16b, v8.16b
897    mov       v13.16b, v9.16b
898    mov       v6.16b, v14.16b
899    mov       v7.16b, v15.16b
900
901    urhadd    v30.8b, v18.8b , v30.8b
902    mov       v8.16b, v16.16b
903    mov       v9.16b, v17.16b
904    mov       v14.16b, v28.16b
905    mov       v15.16b, v29.16b
906    st1       {v30.2s}, [x1], x3        // store row 3
907
908    bgt       loop_8                    //if height =8 or 16  loop
909    b         end_func
910
911loop_4_start:
    // Prologue of the 4-pixel-wide path: primes the first five rows of
    // horizontally filtered 16-bit sums before entering loop_4.
    //
    // Register roles, as used by the instructions in this block:
    //   x0 / x2  : source read pointer / post-increment applied after each row load
    //   x9 / x6  : write pointer for the 16-bit intermediate rows / its post-increment
    //   v22, v23 : the constants 20 and 5 replicated in every 16-bit lane
    //
    // For every row, 16 source bytes are loaded into v0:v1 and the 6-tap sum
    //     (p[0] + p[5]) + 20 * (p[2] + p[3]) - 5 * (p[1] + p[4])
    // is formed, unrounded, for 4 adjacent positions (the .4h lanes).
    // Results: row -2 -> v6, row -1 -> v8, row 0 -> v10, row 1 -> v12,
    // row 2 -> v14. Rows -2..1 are also written through x9 in this block;
    // row 2 (v14) is written at the top of loop_4.
    //
    // The code is software-pipelined: the load of the next row is issued
    // before the final mls of the current row, so statement order matters.
912    movi      v22.8h, #20               // v22 = filter coefficient 20 in all 16-bit lanes
913    movi      v23.8h, #5                // v23 = filter coefficient 5 in all 16-bit lanes
914
915    ld1       {v0.2s, v1.2s}, [x0], x2  //row -2 load
916    ext       v5.8b, v0.8b , v1.8b , #5 // v5 = source bytes 5..12
917    uaddl     v6.8h, v0.8b, v5.8b       // v6 = p[0] + p[5], widened to 16 bit
918    ext       v2.8b, v0.8b , v1.8b , #2
919    ext       v3.8b, v0.8b , v1.8b , #3
920    uaddl     v8.8h, v2.8b, v3.8b       // v8 = p[2] + p[3]
921    ext       v4.8b, v0.8b , v1.8b , #4
922    mla       v6.4h, v8.4h , v22.4h     // v6 += 20 * (p[2] + p[3]), low 4 lanes only
923    ext       v1.8b, v0.8b , v1.8b , #1
924    uaddl     v8.8h, v1.8b, v4.8b       // v8 = p[1] + p[4]
925    ld1       {v0.2s, v1.2s}, [x0], x2  // row -1 load
926    mls       v6.4h, v8.4h , v23.4h     // v6 -= 5 * (p[1] + p[4])  -> row -2 sums complete
927    ext       v5.8b, v0.8b , v1.8b , #5
928    uaddl     v8.8h, v0.8b, v5.8b
929    ext       v2.8b, v0.8b , v1.8b , #2
930    ext       v3.8b, v0.8b , v1.8b , #3
931    uaddl     v10.8h, v2.8b, v3.8b
932
933    st1       {v6.2s}, [x9], x6         // store temp buffer 0
934
935    ext       v4.8b, v0.8b , v1.8b , #4
936    mla       v8.4h, v10.4h , v22.4h
937    ext       v1.8b, v0.8b , v1.8b , #1
938    uaddl     v10.8h, v1.8b, v4.8b
939    ld1       {v0.2s, v1.2s}, [x0], x2  // row 0 load
940    mls       v8.4h, v10.4h , v23.4h    // v8 = row -1 sums complete
941    ext       v5.8b, v0.8b , v1.8b , #5
942    uaddl     v10.8h, v0.8b, v5.8b
943    ext       v2.8b, v0.8b , v1.8b , #2
944    ext       v3.8b, v0.8b , v1.8b , #3
945    uaddl     v12.8h, v2.8b, v3.8b
946
947    st1       {v8.2s}, [x9], x6         // store temp buffer 1
948
949    ext       v4.8b, v0.8b , v1.8b , #4
950    mla       v10.4h, v12.4h , v22.4h
951    ext       v1.8b, v0.8b , v1.8b , #1
952    uaddl     v12.8h, v1.8b, v4.8b
953    ld1       {v0.2s, v1.2s}, [x0], x2  // row 1 load
954    mls       v10.4h, v12.4h , v23.4h   // v10 = row 0 sums complete
955    ext       v5.8b, v0.8b , v1.8b , #5
956    uaddl     v12.8h, v0.8b, v5.8b
957    ext       v2.8b, v0.8b , v1.8b , #2
958    ext       v3.8b, v0.8b , v1.8b , #3
959    uaddl     v14.8h, v2.8b, v3.8b
960
961    st1       {v10.2s}, [x9], x6        // store temp buffer 2
962
963    ext       v4.8b, v0.8b , v1.8b , #4
964    mla       v12.4h, v14.4h , v22.4h
965    ext       v1.8b, v0.8b , v1.8b , #1
966    uaddl     v14.8h, v1.8b, v4.8b
967    ld1       {v0.2s, v1.2s}, [x0], x2  // row 2 load
968    mls       v12.4h, v14.4h , v23.4h   // v12 = row 1 sums complete
969    ext       v5.8b, v0.8b , v1.8b , #5
970    uaddl     v14.8h, v0.8b, v5.8b
971    ext       v2.8b, v0.8b , v1.8b , #2
972    ext       v3.8b, v0.8b , v1.8b , #3
973    uaddl     v16.8h, v2.8b, v3.8b
974    ext       v4.8b, v0.8b , v1.8b , #4
975    mla       v14.4h, v16.4h , v22.4h
976    ext       v1.8b, v0.8b , v1.8b , #1
977    uaddl     v16.8h, v1.8b, v4.8b
978
979    st1       {v12.2s}, [x9], x6        // store temp buffer 3
980
981    mls       v14.4h, v16.4h , v23.4h   // v14 = row 2 sums complete (stored inside loop_4)
982
983loop_4:
    // One iteration produces 4 output rows of 4 pixels each.
    //
    // State on entry (set by loop_4_start or by the moves at the bottom of
    // the previous iteration): v6, v8, v10, v12, v14 hold the horizontally
    // filtered 16-bit sums of five consecutive rows (labelled -2..2 below).
    //
    // Per iteration:
    //   * four more source rows (labelled 3..6) are loaded through x0 and
    //     filtered horizontally into v16, v26, v28, v30 with the same
    //     (p0+p5) + 20*(p2+p3) - 5*(p1+p4) sum; each is written through x9;
    //   * for each output row the same 6-tap pattern is applied vertically
    //     to six consecutive sum rows in 32-bit lanes, then rounded-shifted
    //     right by 10 and saturated down to unsigned 8 bit;
    //   * four 16-bit rows are read back through x7, rounded-shifted right by
    //     5 and saturated to unsigned 8 bit; x7 is set up before this chunk,
    //     presumably pointing into the buffer written via x9 -- confirm;
    //   * the two 8-bit results are combined with a rounding average (urhadd)
    //     and stored through x1 with post-increment x3.
    //
    // x4 is decremented by 4 per iteration; the flags set by that subs are
    // consumed by the bgt at the bottom (no instruction in between writes
    // the condition flags).
984
985    ld1       {v0.2s, v1.2s}, [x0], x2  // row 3 load
986    ext       v5.8b, v0.8b , v1.8b , #5
987    uaddl     v16.8h, v0.8b, v5.8b
988    ext       v2.8b, v0.8b , v1.8b , #2
989    ext       v3.8b, v0.8b , v1.8b , #3
990    uaddl     v18.8h, v2.8b, v3.8b
991    st1       {v14.2s}, [x9], x6        // store temp buffer 4
992    ext       v4.8b, v0.8b , v1.8b , #4
993    mla       v16.4h, v18.4h , v22.4h
994    ext       v1.8b, v0.8b , v1.8b , #1
995    uaddl     v18.8h, v1.8b, v4.8b
996    add       v2.4h, v10.4h , v12.4h    // output row 0: centre pair (rows 0, 1), weight 20
997    mls       v16.4h, v18.4h , v23.4h   // v16 = row 3 sums complete
998    add       v3.4h, v8.4h , v14.4h     // output row 0: inner pair (rows -1, 2), weight -5
999    ld1       {v18.2s, v19.2s}, [x0], x2 // row 4 load
1000    ext       v25.8b, v18.8b , v19.8b , #5
1001    uaddl     v26.8h, v18.8b, v25.8b
1002    ext       v20.8b, v18.8b , v19.8b , #2
1003
1004    st1       {v16.2s}, [x9], x6        // store temp buffer 5
1005
1006    saddl     v0.4s, v6.4h, v16.4h      // output row 0: outer pair (rows -2, 3), widened to 32 bit
1007    smlal     v0.4s, v2.4h, v22.4h      // += 20 * centre pair
1008    ext       v21.8b, v18.8b , v19.8b , #3
1009    uaddl     v28.8h, v20.8b, v21.8b
1010    ext       v24.8b, v18.8b , v19.8b , #4
1011    smlsl     v0.4s, v3.4h, v23.4h      // -= 5 * inner pair
1012    mla       v26.4h, v28.4h , v22.4h
1013    ext       v19.8b, v18.8b , v19.8b , #1
1014    uaddl     v28.8h, v19.8b, v24.8b
1015    add       v2.4h, v12.4h , v14.4h    // output row 1: centre pair (rows 1, 2)
1016    mls       v26.4h, v28.4h , v23.4h   // v26 = row 4 sums complete
1017    sqrshrun  v0.4h, v0.4s, #0xa        // rounding shift right by 10, saturate to unsigned 16 bit
1018    add       v3.4h, v10.4h , v16.4h    // output row 1: inner pair (rows 0, 3)
1019    ld1       {v18.2s, v19.2s}, [x0], x2 // row 5 load
1020    ext       v25.8b, v18.8b , v19.8b , #5
1021    uqxtn     v11.8b, v0.8h             // v11 = output row 0 of the two-stage filter, 8 bit
1022    uaddl     v28.8h, v18.8b, v25.8b
1023
1024    st1       {v26.2s}, [x9], x6        // store temp buffer 6
1025
1026    // v6 and v7 are free for reuse from here (v6 was last read by the saddl above)
1027    ld1       {v6.2s}, [x7], x6         // load from temp buffer 0
1028    ld1       {v7.2s}, [x7], x6         // load from temp buffer 1
1029
1030    sqrshrun  v9.8b, v6.8h, #5          // rounding shift right by 5, saturate to unsigned 8 bit
1031    sqrshrun  v7.8b, v7.8h, #5
1032    mov       v9.s[1], v7.s[0]          // v9 = two 4-pixel rows of horizontally filtered pixels
1033
1034    ext       v20.8b, v18.8b , v19.8b , #2
1035
1036    saddl     v0.4s, v8.4h, v26.4h      // output row 1: outer pair (rows -1, 4)
1037    smlal     v0.4s, v2.4h, v22.4h
1038    ext       v21.8b, v18.8b , v19.8b , #3
1039    uaddl     v6.8h, v20.8b, v21.8b     // v6 reused as scratch for the row 5 horizontal filter
1040    ext       v24.8b, v18.8b , v19.8b , #4
1041    smlsl     v0.4s, v3.4h, v23.4h
1042    mla       v28.4h, v6.4h , v22.4h
1043    ext       v19.8b, v18.8b , v19.8b , #1
1044    uaddl     v6.8h, v19.8b, v24.8b
1045    add       v2.4h, v14.4h , v16.4h    // output row 2: centre pair (rows 2, 3)
1046    mls       v28.4h, v6.4h , v23.4h    // v28 = row 5 sums complete
1047    sqrshrun  v0.4h, v0.4s, #0xa
1048    add       v3.4h, v12.4h , v26.4h    // output row 2: inner pair (rows 1, 4)
1049    ld1       {v18.2s, v19.2s}, [x0], x2 // row 6 load
1050    ext       v25.8b, v18.8b , v19.8b , #5
1051    uqxtn     v13.8b, v0.8h             // v13 = output row 1 of the two-stage filter, 8 bit
1052
1053    trn1      v11.2s, v11.2s, v13.2s    // v11 = { output row 0, output row 1 }
1054    trn2      v13.2s, v11.2s, v13.2s    // reads the v11 just rewritten; v13 is not read again in this loop
1055    saddl     v0.4s, v10.4h, v28.4h     // output row 2: outer pair (rows 0, 5)
1056    urhadd    v9.8b, v9.8b , v11.8b     // rounding average of the two predictions, rows 0 and 1
1057
1058    st1       {v28.2s}, [x9], x6        // store temp buffer 7
1059
1060    smlal     v0.4s, v2.4h, v22.4h
1061    uaddl     v30.8h, v18.8b, v25.8b
1062
1063    st1       {v9.s}[0], [x1], x3       // store row 0
1064
1065    ext       v20.8b, v18.8b , v19.8b , #2
1066
1067    st1       {v9.s}[1], [x1], x3       // store row 1
1068
1069    ext       v21.8b, v18.8b , v19.8b , #3
1070    smlsl     v0.4s, v3.4h, v23.4h
1071    uaddl     v8.8h, v20.8b, v21.8b     // v8 reused as scratch for the row 6 horizontal filter
1072    ext       v24.8b, v18.8b , v19.8b , #4
1073    mla       v30.4h, v8.4h , v22.4h
1074    ext       v19.8b, v18.8b , v19.8b , #1
1075    uaddl     v8.8h, v19.8b, v24.8b
1076    sqrshrun  v0.4h, v0.4s, #0xa
1077    add       v2.4h, v16.4h , v26.4h    // output row 3: centre pair (rows 3, 4)
1078    mls       v30.4h, v8.4h , v23.4h    // v30 = row 6 sums complete
1079    uqxtn     v4.8b, v0.8h              // v4 = output row 2 of the two-stage filter, 8 bit
1080
1081    add       v3.4h, v14.4h , v28.4h    // output row 3: inner pair (rows 2, 5)
1082
1083
1084    saddl     v0.4s, v12.4h, v30.4h     // output row 3: outer pair (rows 1, 6)
1085
1086    st1       {v30.2s}, [x9]            // no post-increment: the next iteration's first st1 rewrites this slot from v14
1087
1088    smlal     v0.4s, v2.4h, v22.4h
1089
1090    ld1       {v8.2s}, [x7], x6         // load from temp buffer 2
1091    ld1       {v9.2s}, [x7], x6         // load from temp buffer 3
1092    smlsl     v0.4s, v3.4h, v23.4h
1093    subs      x4, x4, #4                // 4 output rows done; flags are consumed by the bgt below
1094
1095    sqrshrun  v10.8b, v8.8h, #5         // rounding shift right by 5, saturate to unsigned 8 bit
1096    sqrshrun  v9.8b, v9.8h, #5
1097    mov       v10.s[1], v9.s[0]         // v10 = two 4-pixel rows of horizontally filtered pixels
1098
1099    mov       v12.8b, v28.8b            // rotate history for the next iteration: row 5 -> v12
1100
1101    sqrshrun  v0.4h, v0.4s, #0xa
1102    mov       v6.8b, v14.8b             // row 2 -> v6
1103    mov       v8.8b, v16.8b             // row 3 -> v8
1104
1105    uqxtn     v5.8b, v0.8h              // v5 = output row 3 of the two-stage filter, 8 bit
1106
1107    trn1      v4.2s, v4.2s, v5.2s       // v4 = { output row 2, output row 3 }
1108    trn2      v5.2s, v4.2s, v5.2s       // v5 is not read again in this loop
1109    urhadd    v4.8b, v4.8b , v10.8b     // rounding average of the two predictions, rows 2 and 3
1110    mov       v10.8b, v26.8b            // row 4 -> v10 (after the urhadd has consumed the old v10)
1111    mov       v14.8b, v30.8b            // row 6 -> v14
1112
1113    st1       {v4.s}[0], [x1], x3       // store row 2
1114    st1       {v4.s}[1], [x1], x3       // store row 3
1115
1116    bgt       loop_4                    // repeat while x4 is still greater than zero
1117
1118end_func:
    // Function exit: the loop_8 path branches here and loop_4 falls through
    // to it once its row counter is exhausted.
1119    // Restore the registers that were saved on the stack
1120    ldp       x19, x20, [sp], #16       // reload x19/x20 and release their 16-byte stack slot
1121    pop_v_regs                          // macro defined outside this chunk; presumably restores the SIMD registers saved at entry -- confirm
1122    ret
1123
1124
1125
1126