ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s revision 25e8adb631df325607216ad6f3d6638442d9f453
1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21//******************************************************************************
22//* @file
23//*  ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
24//*
25//* @brief
26//*  Contains function definitions for inter prediction  interpolation.
27//*
28//* @author
29//*  Mohit
30//*
31//* @par List of Functions:
32//*
33//*  - ih264_inter_pred_luma_horz_hpel_vert_hpel_av8()
34//*
35//* @remarks
36//*  None
37//*
38//*******************************************************************************
39//*/
40
41
42
43//void ih264_inter_pred_luma_horz_hpel_vert_hpel_av8(UWORD8 *pu1_src,
44//                                UWORD8 *pu1_dst,
45//                                WORD32 src_strd,
46//                                WORD32 dst_strd,
47//                                WORD32 ht,
48//                                WORD32 wd,
49//                                UWORD8 *pu1_tmp,
50//                                UWORD32 dydx)
51
52//**************Variables Vs Registers*****************************************
53//    x0 => *pu1_src
54//    x1 => *pu1_dst
55//    x2 =>  src_strd
56//    x3 =>  dst_strd
57//    x4 =>  ht
58//    x5 =>  wd
59//    x6 => *pu1_tmp, x7 => dydx (neither is read by the instructions in this file)
60
61.text
62.p2align 2
63.include "ih264_neon_macros.s"
64
65
66
67    .global ih264_inter_pred_luma_horz_hpel_vert_hpel_av8
68
// Entry and set-up.
//   Saves registers, moves the source pointer two rows up and two bytes left,
//   loads the filter constants 20 and 5 into 16-bit lanes, then dispatches on
//   wd (x5): 4 -> loop_4_start, 8 -> loop_8_start, any other value falls through
//   to the 16-wide path, whose first five source rows are preloaded here.
69ih264_inter_pred_luma_horz_hpel_vert_hpel_av8:
70
71    // Save registers. push_v_regs is a macro, presumably from the included ih264_neon_macros.s (body not visible here)
72    push_v_regs
73    stp       x19, x20, [sp, #-16]!     // NOTE(review): x19/x20 are saved here and restored at end_func but not otherwise referenced in this file
74
75    sub       x0, x0, x2, lsl #1        // x0 = pu1_src - 2*src_strd (two rows above)
76    sub       x0, x0, #2                // x0 -= 2 (two bytes to the left)
77
78    movi      v26.8h, #0x14             // v26 = 20 in each 16-bit lane (used by loop_8 / loop_4)
79    movi      v24.8h, #0x5              // v24 = 5 in each 16-bit lane (used by loop_8 / loop_4)
80    movi      v27.8h, #0x14             // v27 = 20; NOTE(review): every later use of v27 in this file writes it before reading it
81    movi      v25.8h, #0x5              // v25 = 5;  NOTE(review): every later use of v25 in this file writes it before reading it
82    mov       x7, #0x20                 // NOTE(review): x7 is not referenced again in this file
83    mov       x8, #0x30                 // NOTE(review): x8 is not referenced again in this file
84    subs      x12, x5, #4               // wd == 4 ? (only the flags are used; x12 is not read afterwards)
85    beq       loop_4_start
86
87    subs      x12, x5, #8               // wd == 8 ?
88    beq       loop_8_start
89
90    // wd is neither 4 nor 8: 16-wide path
91    movi      v28.8h, #0x14             // v28 = 20 in each 16-bit lane (loop_16 reuses v24/v26 as scratch)
92    movi      v30.8h, #0x5              // v30 = 5 in each 16-bit lane
93    sub       x2, x2, #16               // x2 = src_strd - 16: each row is read as 16 + 8 bytes, so 16 + x2 is one full stride
94    ld1       {v0.2s, v1.2s}, [x0], #16 // row 0, bytes 0-15 -> v0, v1; x0 += 16
95    ld1       {v12.2s}, [x0], x2        // row 0, bytes 16-23 -> v12; x0 moves to the start of row 1
96    ld1       {v2.2s, v3.2s}, [x0], #16 // row 1, bytes 0-15 -> v2, v3
97    ld1       {v13.2s}, [x0], x2        // row 1, bytes 16-23 -> v13
98    ld1       {v4.2s, v5.2s}, [x0], #16 // row 2, bytes 0-15 -> v4, v5
99    ld1       {v14.2s}, [x0], x2        // row 2, bytes 16-23 -> v14
100    ld1       {v6.2s, v7.2s}, [x0], #16 // row 3, bytes 0-15 -> v6, v7
101    ld1       {v15.2s}, [x0], x2        // row 3, bytes 16-23 -> v15
102    ld1       {v8.2s, v9.2s}, [x0], #16 // row 4, bytes 0-15 -> v8, v9
103    ld1       {v16.2s}, [x0], x2        // row 4, bytes 16-23 -> v16
// loop_16: 16-pixel-wide path. Each pass produces four output rows.
// On entry five source rows are held as bytes 0-7 / 8-15 / 16-23 in
//   r0: v0/v1/v12   r1: v2/v3/v13   r2: v4/v5/v14   r3: v6/v7/v15   r4: v8/v9/v16
// and rows r5..r8 are loaded as the pass proceeds. v28 = 20 and v30 = 5 in every
// 16-bit lane. For each output row the code first filters vertically into 16-bit
// column sums s[] (v18 = columns 0-7, v20 = 8-15, v22 = 16-23), then filters those
// sums horizontally in 32-bit lanes, and finally applies a rounding shift right by
// 10 with saturation down to 8 bits.
104loop_16:
105
106    ld1       {v10.2s, v11.2s}, [x0], #16 // r5, bytes 0-15 -> v10, v11
107    ld1       {v17.2s}, [x0], x2        // r5, bytes 16-23 -> v17; x0 moves to the next row
108
109    // vertical filter for output row 0: (r0 + r5) + 20*(r2 + r3) - 5*(r1 + r4)
110    uaddl     v20.8h, v4.8b, v6.8b      // r2 + r3, columns 0-7
111    uaddl     v18.8h, v0.8b, v10.8b     // r0 + r5, columns 0-7
112    uaddl     v22.8h, v2.8b, v8.8b      // r1 + r4, columns 0-7
113    mla       v18.8h, v20.8h , v28.8h   // += 20 * (r2 + r3)
114    uaddl     v24.8h, v5.8b, v7.8b      // r2 + r3, columns 8-15
115    uaddl     v20.8h, v1.8b, v11.8b     // r0 + r5, columns 8-15
116    uaddl     v26.8h, v3.8b, v9.8b      // r1 + r4, columns 8-15
117    mla       v20.8h, v24.8h , v28.8h
118    uaddl     v24.8h, v14.8b, v15.8b    // r2 + r3, columns 16-23
119    mls       v18.8h, v22.8h , v30.8h   // -= 5 * (r1 + r4): v18 = s[0..7]
120    uaddl     v22.8h, v12.8b, v17.8b    // r0 + r5, columns 16-23
121    mls       v20.8h, v26.8h , v30.8h   // v20 = s[8..15]
122    uaddl     v26.8h, v13.8b, v16.8b    // r1 + r4, columns 16-23
123    mla       v22.8h, v24.8h , v28.8h
124    mls       v22.8h, v26.8h , v30.8h   // v22 = s[16..23]
125    // horizontal filter, output pixels 0-7: s[x] + s[x+5] + 20*(s[x+2] + s[x+3]) - 5*(s[x+1] + s[x+4])
126    ext       v24.16b, v18.16b , v20.16b , #4  // s[x+2]
127    ext       v26.16b, v18.16b , v20.16b , #6  // s[x+3]
128
129    ext       v23.16b, v18.16b , v20.16b , #10 // s[x+5]
130    add       v0.8h, v24.8h , v26.8h    // s[x+2] + s[x+3] (v0 is scratch here; it is reloaded with r6 below)
131    ext       v24.16b, v18.16b , v20.16b , #2  // s[x+1]
132    ext       v26.16b, v18.16b , v20.16b , #8  // s[x+4]
133    add       v24.8h, v24.8h , v26.8h   // s[x+1] + s[x+4]
134
135    saddl     v26.4s, v18.4h, v23.4h    // pixels 0-3: s[x] + s[x+5], widened to 32 bits
136    smlal     v26.4s, v0.4h, v28.4h     // += 20 * (s[x+2] + s[x+3])
137    smlsl     v26.4s, v24.4h, v30.4h    // -= 5 * (s[x+1] + s[x+4])
138
139    saddl2    v23.4s, v18.8h, v23.8h    // pixels 4-7: same computation on the upper halves
140    smlal2    v23.4s, v0.8h, v28.8h
141    smlsl2    v23.4s, v24.8h, v30.8h
142
143    sqrshrun  v18.4h, v26.4s, #10       // rounding shift right by 10, saturated to unsigned 16 bits
144    sqrshrun  v19.4h, v23.4s, #10
145
146
147    uqxtn     v18.8b, v18.8h            // saturate to 8 bits
148    uqxtn     v19.8b, v19.8h
149    mov       v18.2s[1], v19.2s[0]      // v18 (low 8 bytes) = output pixels 0-7
150    // horizontal filter, output pixels 8-15, using v20:v22
151    ext       v24.16b, v20.16b , v22.16b , #4
152    ext       v26.16b, v20.16b , v22.16b , #6
153    ext       v0.16b, v20.16b , v22.16b , #10
154
155    add       v25.8h, v24.8h , v26.8h
156    ext       v24.16b, v20.16b , v22.16b , #2
157    ext       v26.16b, v20.16b , v22.16b , #8
158    add       v24.8h, v24.8h , v26.8h
159
160    saddl     v26.4s, v0.4h, v20.4h
161    smlal     v26.4s, v25.4h, v28.4h
162    smlsl     v26.4s, v24.4h, v30.4h
163
164    saddl2    v22.4s, v0.8h, v20.8h
165    smlal2    v22.4s, v25.8h, v28.8h
166    smlsl2    v22.4s, v24.8h, v30.8h
167
168    sqrshrun  v19.4h, v26.4s, #10
169    sqrshrun  v25.4h, v22.4s, #10
170
171    uaddl     v24.8h, v7.8b, v9.8b      // for the next output row: r3 + r4, columns 8-15
172
173
174
175    uqxtn     v19.8b, v19.8h
176    uqxtn     v25.8b, v25.8h
177    mov       v19.2s[1], v25.2s[0]      // v19 (low 8 bytes) = output pixels 8-15
178
179    uaddl     v22.8h, v4.8b, v10.8b     // for the next output row: r2 + r5, columns 0-7
180    ld1       {v0.2s, v1.2s}, [x0], #16 // r6, bytes 0-15 -> v0, v1
181
182
183    ld1       {v12.2s}, [x0], x2        // r6, bytes 16-23 -> v12
184    uaddl     v20.8h, v6.8b, v8.8b      // for the next output row: r3 + r4, columns 0-7
185    uaddl     v26.8h, v5.8b, v11.8b     // for the next output row: r2 + r5, columns 8-15
186    st1       {v18.2s, v19.2s}, [x1], x3 // store output row 0 (16 bytes); x1 += dst_strd
187
188
189//ROW_2: second output row of the pass, (r1 + r6) + 20*(r3 + r4) - 5*(r2 + r5)
190
191
192    uaddl     v18.8h, v2.8b, v0.8b      // r1 + r6, columns 0-7
193
194    mla       v18.8h, v20.8h , v28.8h
195
196    uaddl     v20.8h, v3.8b, v1.8b      // r1 + r6, columns 8-15
197
198    mla       v20.8h, v24.8h , v28.8h
199    uaddl     v24.8h, v15.8b, v16.8b    // r3 + r4, columns 16-23
200    mls       v18.8h, v22.8h , v30.8h
201    uaddl     v22.8h, v13.8b, v12.8b    // r1 + r6, columns 16-23
202    mls       v20.8h, v26.8h , v30.8h
203    uaddl     v26.8h, v14.8b, v17.8b    // r2 + r5, columns 16-23
204    mla       v22.8h, v24.8h , v28.8h
205    mls       v22.8h, v26.8h , v30.8h
206    // horizontal filter, output pixels 0-7 (v2 is scratch here; it is reloaded with r7 below)
207    ext       v24.16b, v18.16b , v20.16b , #4
208    ext       v26.16b, v18.16b , v20.16b , #6
209
210    ext       v23.16b, v18.16b , v20.16b , #10
211    add       v2.8h, v24.8h , v26.8h
212    ext       v24.16b, v18.16b , v20.16b , #2
213    ext       v26.16b, v18.16b , v20.16b , #8
214    add       v24.8h, v24.8h , v26.8h
215
216    saddl     v26.4s, v18.4h, v23.4h
217    smlal     v26.4s, v2.4h, v28.4h
218    smlsl     v26.4s, v24.4h, v30.4h
219
220    saddl2    v23.4s, v18.8h, v23.8h
221    smlal2    v23.4s, v2.8h, v28.8h
222    smlsl2    v23.4s, v24.8h, v30.8h
223
224    sqrshrun  v18.4h, v26.4s, #10
225    sqrshrun  v19.4h, v23.4s, #10
226
227
228
229    uqxtn     v18.8b, v18.8h
230    uqxtn     v19.8b, v19.8h
231    mov       v18.2s[1], v19.2s[0]      // v18 = output pixels 0-7
232    // horizontal filter, output pixels 8-15
233    ext       v24.16b, v20.16b , v22.16b , #4
234    ext       v26.16b, v20.16b , v22.16b , #6
235    ext       v2.16b, v20.16b , v22.16b , #10
236
237    add       v25.8h, v24.8h , v26.8h
238    ext       v24.16b, v20.16b , v22.16b , #2
239    ext       v26.16b, v20.16b , v22.16b , #8
240    add       v24.8h, v24.8h , v26.8h
241
242    saddl     v26.4s, v2.4h, v20.4h
243    smlal     v26.4s, v25.4h, v28.4h
244    smlsl     v26.4s, v24.4h, v30.4h
245
246    saddl2    v22.4s, v2.8h, v20.8h
247    smlal2    v22.4s, v25.8h, v28.8h
248    smlsl2    v22.4s, v24.8h, v30.8h
249
250    sqrshrun  v19.4h, v26.4s, #10
251    sqrshrun  v25.4h, v22.4s, #10
252    uaddl     v24.8h, v9.8b, v11.8b     // for the next output row: r4 + r5, columns 8-15
253
254    uqxtn     v19.8b, v19.8h
255    uqxtn     v25.8b, v25.8h
256    mov       v19.2s[1], v25.2s[0]      // v19 = output pixels 8-15
257
258
259    uaddl     v22.8h, v6.8b, v0.8b      // for the next output row: r3 + r6, columns 0-7
260    ld1       {v2.2s, v3.2s}, [x0], #16 // r7, bytes 0-15 -> v2, v3
261
262
263    ld1       {v13.2s}, [x0], x2        // r7, bytes 16-23 -> v13
264    uaddl     v20.8h, v8.8b, v10.8b     // for the next output row: r4 + r5, columns 0-7
265    uaddl     v26.8h, v7.8b, v1.8b      // for the next output row: r3 + r6, columns 8-15
266    st1       {v18.2s, v19.2s}, [x1], x3 // store output row 1; x1 += dst_strd
267
268//ROW_3: third output row of the pass, (r2 + r7) + 20*(r4 + r5) - 5*(r3 + r6)
269
270
271    uaddl     v18.8h, v4.8b, v2.8b      // r2 + r7, columns 0-7
272
273    mla       v18.8h, v20.8h , v28.8h
274
275    uaddl     v20.8h, v5.8b, v3.8b      // r2 + r7, columns 8-15
276
277    mla       v20.8h, v24.8h , v28.8h
278    uaddl     v24.8h, v16.8b, v17.8b    // r4 + r5, columns 16-23
279    mls       v18.8h, v22.8h , v30.8h
280    uaddl     v22.8h, v14.8b, v13.8b    // r2 + r7, columns 16-23
281    mls       v20.8h, v26.8h , v30.8h
282    uaddl     v26.8h, v15.8b, v12.8b    // r3 + r6, columns 16-23
283    mla       v22.8h, v24.8h , v28.8h
284    mls       v22.8h, v26.8h , v30.8h
285    // horizontal filter, output pixels 0-7 (v4 is scratch here; it is reloaded with r8 below)
286    ext       v24.16b, v18.16b , v20.16b , #4
287    ext       v26.16b, v18.16b , v20.16b , #6
288
289    ext       v23.16b, v18.16b , v20.16b , #10
290    add       v4.8h, v24.8h , v26.8h
291    ext       v24.16b, v18.16b , v20.16b , #2
292    ext       v26.16b, v18.16b , v20.16b , #8
293    add       v24.8h, v24.8h , v26.8h
294
295    saddl     v26.4s, v18.4h, v23.4h
296    smlal     v26.4s, v4.4h, v28.4h
297    smlsl     v26.4s, v24.4h, v30.4h
298
299    saddl2    v23.4s, v18.8h, v23.8h
300    smlal2    v23.4s, v4.8h, v28.8h
301    smlsl2    v23.4s, v24.8h, v30.8h
302
303    sqrshrun  v18.4h, v26.4s, #10
304    sqrshrun  v19.4h, v23.4s, #10
305
306
307    uqxtn     v18.8b, v18.8h
308    uqxtn     v19.8b, v19.8h
309    mov       v18.2s[1], v19.2s[0]      // v18 = output pixels 0-7
310
311    // horizontal filter, output pixels 8-15
312    ext       v24.16b, v20.16b , v22.16b , #4
313    ext       v26.16b, v20.16b , v22.16b , #6
314    ext       v4.16b, v20.16b , v22.16b , #10
315
316    add       v25.8h, v24.8h , v26.8h
317    ext       v24.16b, v20.16b , v22.16b , #2
318    ext       v26.16b, v20.16b , v22.16b , #8
319    add       v24.8h, v24.8h , v26.8h
320
321    saddl     v26.4s, v4.4h, v20.4h
322    smlal     v26.4s, v25.4h, v28.4h
323    smlsl     v26.4s, v24.4h, v30.4h
324
325    saddl2    v22.4s, v4.8h, v20.8h
326    smlal2    v22.4s, v25.8h, v28.8h
327    smlsl2    v22.4s, v24.8h, v30.8h
328
329    sqrshrun  v19.4h, v26.4s, #10
330    sqrshrun  v25.4h, v22.4s, #10
331
332    uaddl     v24.8h, v11.8b, v1.8b     // for the next output row: r5 + r6, columns 8-15
333
334
335    uqxtn     v19.8b, v19.8h
336    uqxtn     v25.8b, v25.8h
337    mov       v19.2s[1], v25.2s[0]      // v19 = output pixels 8-15
338
339
340
341    uaddl     v22.8h, v8.8b, v2.8b      // for the next output row: r4 + r7, columns 0-7
342    ld1       {v4.2s, v5.2s}, [x0], #16 // r8, bytes 0-15 -> v4, v5
343
344
345    ld1       {v14.2s}, [x0], x2        // r8, bytes 16-23 -> v14
346    uaddl     v20.8h, v10.8b, v0.8b     // for the next output row: r5 + r6, columns 0-7
347    uaddl     v26.8h, v9.8b, v3.8b      // for the next output row: r4 + r7, columns 8-15
348    st1       {v18.2s, v19.2s}, [x1], x3 // store output row 2; x1 += dst_strd
349
350
351//ROW_4: fourth output row of the pass, (r3 + r8) + 20*(r5 + r6) - 5*(r4 + r7)
352
353    uaddl     v18.8h, v6.8b, v4.8b      // r3 + r8, columns 0-7
354
355    mla       v18.8h, v20.8h , v28.8h
356
357    uaddl     v20.8h, v7.8b, v5.8b      // r3 + r8, columns 8-15
358
359    mla       v20.8h, v24.8h , v28.8h
360    uaddl     v24.8h, v17.8b, v12.8b    // r5 + r6, columns 16-23
361    mls       v18.8h, v22.8h , v30.8h
362    uaddl     v22.8h, v15.8b, v14.8b    // r3 + r8, columns 16-23
363    mls       v20.8h, v26.8h , v30.8h
364    uaddl     v26.8h, v16.8b, v13.8b    // r4 + r7, columns 16-23
365    mla       v22.8h, v24.8h , v28.8h
366    mls       v22.8h, v26.8h , v30.8h
367    // horizontal filter, output pixels 0-7 (v6 is scratch here; it is overwritten by the rotation below)
368    ext       v24.16b, v18.16b , v20.16b , #4
369    ext       v26.16b, v18.16b , v20.16b , #6
370
371    ext       v23.16b, v18.16b , v20.16b , #10
372    add       v6.8h, v24.8h , v26.8h
373    ext       v24.16b, v18.16b , v20.16b , #2
374    ext       v26.16b, v18.16b , v20.16b , #8
375    add       v24.8h, v24.8h , v26.8h
376
377    saddl     v26.4s, v18.4h, v23.4h
378    smlal     v26.4s, v6.4h, v28.4h
379    smlsl     v26.4s, v24.4h, v30.4h
380
381    saddl2    v23.4s, v18.8h, v23.8h
382    smlal2    v23.4s, v6.8h, v28.8h
383    smlsl2    v23.4s, v24.8h, v30.8h
384
385    sqrshrun  v18.4h, v26.4s, #10
386    sqrshrun  v19.4h, v23.4s, #10
387
388    uqxtn     v18.8b, v18.8h
389    uqxtn     v19.8b, v19.8h
390    mov       v18.2s[1], v19.2s[0]      // v18 = output pixels 0-7
391
392    // horizontal filter, output pixels 8-15
393    ext       v24.16b, v20.16b , v22.16b , #4
394    ext       v26.16b, v20.16b , v22.16b , #6
395    ext       v6.16b, v20.16b , v22.16b , #10
396
397    add       v25.8h, v24.8h , v26.8h
398    ext       v24.16b, v20.16b , v22.16b , #2
399    ext       v26.16b, v20.16b , v22.16b , #8
400    add       v24.8h, v24.8h , v26.8h
401
402    saddl     v26.4s, v6.4h, v20.4h
403    smlal     v26.4s, v25.4h, v28.4h
404    smlsl     v26.4s, v24.4h, v30.4h
405
406    saddl2    v22.4s, v6.8h, v20.8h
407    smlal2    v22.4s, v25.8h, v28.8h
408    smlsl2    v22.4s, v24.8h, v30.8h
409    // rotate the row registers so that rows r4..r8 become r0..r4 of the next pass (interleaved with the last narrowing steps)
410    mov       v6.16b, v2.16b            // r7 -> v6/v7
411    mov       v7.16b, v3.16b
412
413    mov       v2.16b, v10.16b           // r5 -> v2/v3
414    mov       v3.16b, v11.16b
415
416    subs      x4, x4, #4                // four rows done; the flags are consumed by the bgt at the end
417    sqrshrun  v19.4h, v26.4s, #10
418    sqrshrun  v25.4h, v22.4s, #10
419    mov       v10.16b, v0.16b           // r6 parked in v10/v11 until v4/v5 are free
420    mov       v11.16b, v1.16b
421
422    mov       v24.8b, v14.8b            // r8 bytes 16-23 parked in v24
423
424    mov       v14.16b, v12.16b          // r6 bytes 16-23 -> v14
425    mov       v15.16b, v13.16b          // r7 bytes 16-23 -> v15
426
427
428    uqxtn     v19.8b, v19.8h
429    uqxtn     v25.8b, v25.8h
430    mov       v19.2s[1], v25.2s[0]      // v19 = output pixels 8-15
431
432
433
434    mov       v0.16b, v8.16b            // r4 -> v0/v1
435    mov       v1.16b, v9.16b
436
437    mov       v8.16b, v4.16b            // r8 -> v8/v9
438    mov       v9.16b, v5.16b
439
440    mov       v12.16b, v16.16b          // r4 bytes 16-23 -> v12
441    mov       v13.16b, v17.16b          // r5 bytes 16-23 -> v13
442
443    mov       v4.16b, v10.16b           // r6 -> v4/v5
444    mov       v5.16b, v11.16b
445
446    mov       v16.8b, v24.8b            // r8 bytes 16-23 -> v16
447    st1       {v18.2s, v19.2s}, [x1], x3 // store output row 3; x1 += dst_strd
448
449    bgt       loop_16                   // repeat while the remaining row count in x4 is > 0
450    b         end_func
451
// loop_8_start / loop_8: 8-pixel-wide path. Each pass produces four output rows.
// Every source row is read as 16 bytes (bytes 0-7 / 8-15) and x0 advances by x2
// (src_strd, unmodified on this path). On entry to loop_8 the rows are held in
//   r0: v0/v1   r1: v2/v3   r2: v4/v5   r3: v6/v7   r4: v8/v9
// and r5..r8 are loaded during the pass. v26 = 20 and v24 = 5 in every 16-bit lane.
// The vertical filter of the next output row is interleaved with the horizontal
// filter of the current one, so statement order matters throughout.
452loop_8_start:
453    ld1       {v0.2s, v1.2s}, [x0], x2  // r0 -> v0 (bytes 0-7), v1 (bytes 8-15); x0 += src_strd
454    ld1       {v2.2s, v3.2s}, [x0], x2  // r1 -> v2, v3
455    ld1       {v4.2s, v5.2s}, [x0], x2  // r2 -> v4, v5
456    ld1       {v6.2s, v7.2s}, [x0], x2  // r3 -> v6, v7
457    ld1       {v8.2s, v9.2s}, [x0], x2  // r4 -> v8, v9
458
459loop_8:
460    // output row A, vertical filter: (r0 + r5) + 20*(r2 + r3) - 5*(r1 + r4)
461    ld1       {v10.2s, v11.2s}, [x0], x2 // r5 -> v10, v11
462    uaddl     v14.8h, v4.8b, v6.8b      // r2 + r3, columns 0-7
463    uaddl     v12.8h, v0.8b, v10.8b     // r0 + r5, columns 0-7
464    uaddl     v16.8h, v2.8b, v8.8b      // r1 + r4, columns 0-7
465    mla       v12.8h, v14.8h , v26.8h   // += 20 * (r2 + r3)
466    uaddl     v18.8h, v5.8b, v7.8b      // r2 + r3, columns 8-15
467    uaddl     v14.8h, v1.8b, v11.8b     // r0 + r5, columns 8-15
468    uaddl     v22.8h, v3.8b, v9.8b      // r1 + r4, columns 8-15
469    mla       v14.8h, v18.8h , v26.8h
470    mls       v12.8h, v16.8h , v24.8h   // -= 5 * (r1 + r4): v12 = column sums s[0..7] of row A
471    ld1       {v0.2s, v1.2s}, [x0], x2  // r6 -> v0, v1 (r0 is no longer needed)
472    uaddl     v16.8h, v6.8b, v8.8b      // row B: r3 + r4, columns 0-7
473    mls       v14.8h, v22.8h , v24.8h   // v14 = column sums s[8..15] of row A
474    uaddl     v28.8h, v2.8b, v0.8b      // row B: r1 + r6, columns 0-7
475    // row A, horizontal filter: s[x] + s[x+5] + 20*(s[x+2] + s[x+3]) - 5*(s[x+1] + s[x+4])
476    ext       v22.16b, v12.16b , v14.16b , #10 // s[x+5]
477    uaddl     v18.8h, v4.8b, v10.8b     // row B: r2 + r5, columns 0-7
478    mla       v28.8h, v16.8h , v26.8h
479    saddl     v30.4s, v12.4h, v22.4h    // pixels 0-3: s[x] + s[x+5], widened to 32 bits
480
481    saddl2    v22.4s, v12.8h, v22.8h    // pixels 4-7: same on the upper halves
482    ext       v16.16b, v12.16b , v14.16b , #4  // s[x+2]
483    mls       v28.8h, v18.8h , v24.8h   // v28 = column sums s[0..7] of row B
484    ext       v18.16b, v12.16b , v14.16b , #6  // s[x+3]
485    ext       v20.16b, v12.16b , v14.16b , #8  // s[x+4]
486    ext       v14.16b, v12.16b , v14.16b , #2  // s[x+1] (overwrites v14; it was read as a source for the last time here)
487    add       v16.8h, v16.8h , v18.8h   // s[x+2] + s[x+3]
488    add       v18.8h, v14.8h , v20.8h   // s[x+1] + s[x+4]
489    uaddl     v20.8h, v7.8b, v9.8b      // row B: r3 + r4, columns 8-15
490    smlal     v30.4s, v16.4h, v26.4h    // += 20 * (s[x+2] + s[x+3])
491    smlsl     v30.4s, v18.4h, v24.4h    // -= 5 * (s[x+1] + s[x+4])
492    smlal2    v22.4s, v16.8h, v26.8h
493    smlsl2    v22.4s, v18.8h, v24.8h
494    uaddl     v14.8h, v3.8b, v1.8b      // row B: r1 + r6, columns 8-15
495
496    mla       v14.8h, v20.8h , v26.8h
497    sqrshrun  v12.4h, v30.4s, #10       // rounding shift right by 10, saturated to unsigned 16 bits
498    uaddl     v16.8h, v5.8b, v11.8b     // row B: r2 + r5, columns 8-15
499    sqrshrun  v13.4h, v22.4s, #10
500    mls       v14.8h, v16.8h , v24.8h   // v14 = column sums s[8..15] of row B
501    ld1       {v2.2s, v3.2s}, [x0], x2  // r7 -> v2, v3 (r1 is no longer needed)
502    uqxtn     v25.8b, v12.8h            // saturate to 8 bits; v25 is used as scratch for row A's result
503    uqxtn     v13.8b, v13.8h
504    mov       v25.2s[1], v13.2s[0]      // v25 (low 8 bytes) = row A output pixels 0-7
505    uaddl     v16.8h, v8.8b, v10.8b     // row C: r4 + r5, columns 0-7
506
507    // row B, horizontal filter on v28:v14
508    ext       v22.16b, v28.16b , v14.16b , #10
509    uaddl     v20.8h, v4.8b, v2.8b      // row C: r2 + r7, columns 0-7
510    saddl     v30.4s, v28.4h, v22.4h
511    mla       v20.8h, v16.8h , v26.8h
512
513    saddl2    v22.4s, v28.8h, v22.8h
514    ext       v16.16b, v28.16b , v14.16b , #4
515    ext       v18.16b, v28.16b , v14.16b , #6
516    ext       v12.16b, v28.16b , v14.16b , #8
517    ext       v14.16b, v28.16b , v14.16b , #2
518    add       v16.8h, v16.8h , v18.8h
519    add       v18.8h, v12.8h , v14.8h
520
521    smlal     v30.4s, v16.4h, v26.4h
522    smlsl     v30.4s, v18.4h, v24.4h
523    smlal2    v22.4s, v16.8h, v26.8h
524    smlsl2    v22.4s, v18.8h, v24.8h
525
526
527    uaddl     v18.8h, v6.8b, v0.8b      // row C: r3 + r6, columns 0-7
528    sqrshrun  v16.4h, v30.4s, #10
529
530    sqrshrun  v17.4h, v22.4s, #10
531
532    mov       v12.8b, v25.8b            // v12 = row A output, ready to store
533    mov       v25.8b, v24.8b            // put the constant 5 back into v25 (the value the prologue gave it)
534
535    uaddl     v28.8h, v9.8b, v11.8b     // row C: r4 + r5, columns 8-15
536    uqxtn     v13.8b, v16.8h
537    uqxtn     v17.8b, v17.8h
538    mov       v13.2s[1], v17.2s[0]      // v13 (low 8 bytes) = row B output pixels 0-7
539
540
541    uaddl     v14.8h, v5.8b, v3.8b      // row C: r2 + r7, columns 8-15
542    uaddl     v22.8h, v7.8b, v1.8b      // row C: r3 + r6, columns 8-15
543    mls       v20.8h, v18.8h , v24.8h   // v20 = column sums s[0..7] of row C
544    st1       {v12.2s}, [x1], x3        // store output row A (8 bytes); x1 += dst_strd
545    mla       v14.8h, v28.8h , v26.8h
546    ld1       {v4.2s, v5.2s}, [x0], x2  // r8 -> v4, v5 (r2 is no longer needed)
547    uaddl     v30.8h, v10.8b, v0.8b     // row D: r5 + r6, columns 0-7
548    uaddl     v28.8h, v6.8b, v4.8b      // row D: r3 + r8, columns 0-7
549    mls       v14.8h, v22.8h , v24.8h   // v14 = column sums s[8..15] of row C
550    st1       {v13.2s}, [x1], x3        // store output row B; x1 += dst_strd
551    mla       v28.8h, v30.8h , v26.8h
552    // row C, horizontal filter on v20:v14
553    ext       v22.16b, v20.16b , v14.16b , #10
554    saddl     v30.4s, v20.4h, v22.4h
555
556    saddl2    v22.4s, v20.8h, v22.8h
557    ext       v16.16b, v20.16b , v14.16b , #4
558    ext       v18.16b, v20.16b , v14.16b , #6
559    ext       v12.16b, v20.16b , v14.16b , #8
560    ext       v14.16b, v20.16b , v14.16b , #2
561    add       v16.8h, v16.8h , v18.8h
562    add       v18.8h, v14.8h , v12.8h
563    uaddl     v20.8h, v8.8b, v2.8b      // row D: r4 + r7, columns 0-7
564    smlal     v30.4s, v16.4h, v26.4h
565    smlsl     v30.4s, v18.4h, v24.4h
566    smlal2    v22.4s, v16.8h, v26.8h
567    smlsl2    v22.4s, v18.8h, v24.8h
568    uaddl     v18.8h, v11.8b, v1.8b     // row D: r5 + r6, columns 8-15
569    uaddl     v16.8h, v7.8b, v5.8b      // row D: r3 + r8, columns 8-15
570    sqrshrun  v12.4h, v30.4s, #10
571    uaddl     v30.8h, v9.8b, v3.8b      // row D: r4 + r7, columns 8-15
572    mla       v16.8h, v18.8h , v26.8h
573    sqrshrun  v13.4h, v22.4s, #10
574    mls       v28.8h, v20.8h , v24.8h   // v28 = column sums s[0..7] of row D
575
576    mls       v16.8h, v30.8h , v24.8h   // v16 = column sums s[8..15] of row D
577    uqxtn     v27.8b, v12.8h            // v27 is used as scratch for row C's result
578    uqxtn     v13.8b, v13.8h
579    mov       v27.2s[1], v13.2s[0]      // v27 (low 8 bytes) = row C output pixels 0-7
580
581    // row D, horizontal filter on v28:v16
582    ext       v22.16b, v28.16b , v16.16b , #10
583
584    saddl     v30.4s, v28.4h, v22.4h
585
586    saddl2    v22.4s, v28.8h, v22.8h
587    ext       v12.16b, v28.16b , v16.16b , #4
588    ext       v18.16b, v28.16b , v16.16b , #6
589    ext       v20.16b, v28.16b , v16.16b , #8
590    ext       v28.16b, v28.16b , v16.16b , #2
591    add       v12.8h, v12.8h , v18.8h
592    add       v18.8h, v28.8h , v20.8h
593
594    smlal     v30.4s, v12.4h, v26.4h
595    smlsl     v30.4s, v18.4h, v24.4h
596    smlal2    v22.4s, v12.8h, v26.8h
597    smlsl2    v22.4s, v18.8h, v24.8h
598
599
600    mov       v12.8b, v27.8b            // v12 = row C output, ready to store
601    mov       v27.8b, v26.8b            // put the constant 20 back into v27 (the value the prologue gave it)
602
603    sqrshrun  v16.4h, v30.4s, #10
604    // rotate the row registers so that rows r4..r8 become r0..r4 of the next pass
605    mov       v6.16b, v2.16b            // r7 -> v6/v7
606    mov       v7.16b, v3.16b
607
608    sqrshrun  v17.4h, v22.4s, #10
609
610    mov       v2.16b, v10.16b           // r5 -> v2/v3
611    mov       v3.16b, v11.16b
612
613    mov       v10.16b, v0.16b           // r6 parked in v10/v11 until v4/v5 are free
614    mov       v11.16b, v1.16b
615
616    subs      x4, x4, #4                // four rows done; the flags are consumed by the bgt below
617    uqxtn     v13.8b, v16.8h
618    uqxtn     v17.8b, v17.8h
619    mov       v13.2s[1], v17.2s[0]      // v13 (low 8 bytes) = row D output pixels 0-7
620
621
622    mov       v0.16b, v8.16b            // r4 -> v0/v1
623    mov       v1.16b, v9.16b
624
625    mov       v8.16b, v4.16b            // r8 -> v8/v9
626    mov       v9.16b, v5.16b
627
628    mov       v4.16b, v10.16b           // r6 -> v4/v5
629    mov       v5.16b, v11.16b
630
631    st1       {v12.2s}, [x1], x3        // store output row C; x1 += dst_strd
632    st1       {v13.2s}, [x1], x3        // store output row D; x1 += dst_strd
633
634    bgt       loop_8                    // repeat while the remaining row count in x4 is > 0
635    b         end_func
636
// loop_4_start / loop_4: 4-pixel-wide path. Each pass produces four output rows
// of 4 pixels each. Every source row is read as 16 bytes (bytes 0-7 / 8-15) and
// x0 advances by x2 (src_strd). On entry to loop_4 the rows are held in
//   r0: v0/v1   r1: v2/v3   r2: v4/v5   r3: v6/v7   r4: v8/v9
// and r5..r8 are loaded during the pass. v26 = 20 and v24 = 5 in every 16-bit lane.
// Per output row: the vertical filter (rX + rX+5) + 20*(rX+2 + rX+3) - 5*(rX+1 + rX+4)
// gives 16-bit column sums s[]; the horizontal filter
// s[x] + s[x+5] + 20*(s[x+2] + s[x+3]) - 5*(s[x+1] + s[x+4]) is evaluated in 32 bits,
// then rounded, shifted right by 10 and saturated to 8 bits.
// Only four output pixels are stored per row, so only lanes 0-3 of the horizontal
// accumulator (v30) are computed; no upper-half accumulation is performed.
// The vertical filter of the next row is interleaved with the horizontal filter of
// the current one, so statement order matters throughout.
637loop_4_start:
638    ld1       {v0.2s, v1.2s}, [x0], x2  // r0 -> v0 (bytes 0-7), v1 (bytes 8-15); x0 += src_strd
639    ld1       {v2.2s, v3.2s}, [x0], x2  // r1 -> v2, v3
640    ld1       {v4.2s, v5.2s}, [x0], x2  // r2 -> v4, v5
641    ld1       {v6.2s, v7.2s}, [x0], x2  // r3 -> v6, v7
642    ld1       {v8.2s, v9.2s}, [x0], x2  // r4 -> v8, v9
643
644loop_4:
645    ld1       {v10.2s, v11.2s}, [x0], x2 // r5 -> v10, v11
646    uaddl     v14.8h, v4.8b, v6.8b      // row A: r2 + r3, columns 0-7
647    uaddl     v12.8h, v0.8b, v10.8b     // row A: r0 + r5, columns 0-7
648    uaddl     v16.8h, v2.8b, v8.8b      // row A: r1 + r4, columns 0-7
649    mla       v12.8h, v14.8h , v26.8h   // += 20 * (r2 + r3)
650    uaddl     v18.8h, v5.8b, v7.8b      // row A: r2 + r3, columns 8-15
651    uaddl     v14.8h, v1.8b, v11.8b     // row A: r0 + r5, columns 8-15
652    uaddl     v22.8h, v3.8b, v9.8b      // row A: r1 + r4, columns 8-15
653    mla       v14.8h, v18.8h , v26.8h   // += 20 * (r2 + r3)
654    mls       v12.8h, v16.8h , v24.8h   // -= 5 * (r1 + r4): v12 = s[0..7] of row A
655    ld1       {v0.2s, v1.2s}, [x0], x2  // r6 -> v0, v1 (r0 is no longer needed)
656    uaddl     v16.8h, v6.8b, v8.8b      // row B: r3 + r4, columns 0-7
657    mls       v14.8h, v22.8h , v24.8h   // -= 5 * (r1 + r4): v14 = s[8..15] of row A
658    // v12 and v14 now hold the vertically filtered sums of row A
659    uaddl     v28.8h, v2.8b, v0.8b      // row B: r1 + r6, columns 0-7
660
661    ext       v22.16b, v12.16b , v14.16b , #10 // s[x+5]
662    uaddl     v18.8h, v4.8b, v10.8b     // row B: r2 + r5, columns 0-7
663    mla       v28.8h, v16.8h , v26.8h
664    saddl     v30.4s, v12.4h, v22.4h    // pixels 0-3: s[x] + s[x+5], widened to 32 bits
665
667    ext       v16.16b, v12.16b , v14.16b , #4  // s[x+2]
668    mls       v28.8h, v18.8h , v24.8h   // v28 = s[0..7] of row B
669    ext       v18.16b, v12.16b , v14.16b , #6  // s[x+3]
670    ext       v20.16b, v12.16b , v14.16b , #8  // s[x+4]
671    ext       v14.16b, v12.16b , v14.16b , #2  // s[x+1]
672    add       v16.8h, v16.8h , v18.8h   // s[x+2] + s[x+3]
673    add       v18.8h, v14.8h , v20.8h   // s[x+1] + s[x+4]
674    uaddl     v20.8h, v7.8b, v9.8b      // row B: r3 + r4, columns 8-15
675    smlal     v30.4s, v16.4h, v26.4h    // += 20 * (s[x+2] + s[x+3])
676    smlsl     v30.4s, v18.4h, v24.4h    // -= 5 * (s[x+1] + s[x+4])
679    uaddl     v14.8h, v3.8b, v1.8b      // row B: r1 + r6, columns 8-15
680
681    mla       v14.8h, v20.8h , v26.8h
682    sqrshrun  v12.4h, v30.4s, #10       // rounding shift right by 10, saturated to unsigned 16 bits
683    uaddl     v16.8h, v5.8b, v11.8b     // row B: r2 + r5, columns 8-15
685    mls       v14.8h, v16.8h , v24.8h   // v14 = s[8..15] of row B
686    ld1       {v2.2s, v3.2s}, [x0], x2  // r7 -> v2, v3 (r1 is no longer needed)
687    uqxtn     v25.8b, v12.8h            // saturate to 8 bits; v25 (bytes 0-3) = row A output
688    uaddl     v16.8h, v8.8b, v10.8b     // row C: r4 + r5, columns 0-7
689
690    ext       v22.16b, v28.16b , v14.16b , #10 // row B, horizontal filter on v28:v14
691    uaddl     v20.8h, v4.8b, v2.8b      // row C: r2 + r7, columns 0-7
692    saddl     v30.4s, v28.4h, v22.4h
693    mla       v20.8h, v16.8h , v26.8h
694
696    ext       v16.16b, v28.16b , v14.16b , #4
697    ext       v18.16b, v28.16b , v14.16b , #6
698    ext       v12.16b, v28.16b , v14.16b , #8
699    ext       v14.16b, v28.16b , v14.16b , #2
700    add       v16.8h, v16.8h , v18.8h
701    add       v18.8h, v12.8h , v14.8h
702
703    smlal     v30.4s, v16.4h, v26.4h
704    smlsl     v30.4s, v18.4h, v24.4h
707
708
709    uaddl     v18.8h, v6.8b, v0.8b      // row C: r3 + r6, columns 0-7
710    sqrshrun  v16.4h, v30.4s, #10
711
713
714    mov       v12.8b, v25.8b            // v12 = row A output, ready to store
715    mov       v25.8b, v24.8b            // put the constant 5 back into v25 (the value the prologue gave it)
716
717    uaddl     v28.8h, v9.8b, v11.8b     // row C: r4 + r5, columns 8-15
718    uqxtn     v13.8b, v16.8h            // v13 (bytes 0-3) = row B output
719
720
721
722    uaddl     v14.8h, v5.8b, v3.8b      // row C: r2 + r7, columns 8-15
723    uaddl     v22.8h, v7.8b, v1.8b      // row C: r3 + r6, columns 8-15
724    mls       v20.8h, v18.8h , v24.8h   // v20 = s[0..7] of row C
725    st1       {v12.s}[0], [x1], x3      // store output row A (4 bytes); x1 += dst_strd
726    mla       v14.8h, v28.8h , v26.8h
727    ld1       {v4.2s, v5.2s}, [x0], x2  // r8 -> v4, v5 (r2 is no longer needed)
728    uaddl     v30.8h, v10.8b, v0.8b     // row D: r5 + r6, columns 0-7
729    uaddl     v28.8h, v6.8b, v4.8b      // row D: r3 + r8, columns 0-7
730    mls       v14.8h, v22.8h , v24.8h   // v14 = s[8..15] of row C
731    st1       {v13.s}[0], [x1], x3      // store output row B (4 bytes); x1 += dst_strd
732    mla       v28.8h, v30.8h , v26.8h
733
734    ext       v22.16b, v20.16b , v14.16b , #10 // row C, horizontal filter on v20:v14
735    saddl     v30.4s, v20.4h, v22.4h
736
738    ext       v16.16b, v20.16b , v14.16b , #4
739    ext       v18.16b, v20.16b , v14.16b , #6
740    ext       v12.16b, v20.16b , v14.16b , #8
741    ext       v14.16b, v20.16b , v14.16b , #2
742    add       v16.8h, v16.8h , v18.8h
743    add       v18.8h, v14.8h , v12.8h
744    uaddl     v20.8h, v8.8b, v2.8b      // row D: r4 + r7, columns 0-7
745    smlal     v30.4s, v16.4h, v26.4h
746    smlsl     v30.4s, v18.4h, v24.4h
749    uaddl     v18.8h, v11.8b, v1.8b     // row D: r5 + r6, columns 8-15
750    uaddl     v16.8h, v7.8b, v5.8b      // row D: r3 + r8, columns 8-15
751    sqrshrun  v12.4h, v30.4s, #10
752    uaddl     v30.8h, v9.8b, v3.8b      // row D: r4 + r7, columns 8-15
753    mla       v16.8h, v18.8h , v26.8h
755    mls       v28.8h, v20.8h , v24.8h   // v28 = s[0..7] of row D
756
757    mls       v16.8h, v30.8h , v24.8h   // v16 = s[8..15] of row D
758    uqxtn     v27.8b, v12.8h            // v27 (bytes 0-3) = row C output
759
760    ext       v22.16b, v28.16b , v16.16b , #10 // row D, horizontal filter on v28:v16
761
762    saddl     v30.4s, v28.4h, v22.4h
763
765    ext       v12.16b, v28.16b , v16.16b , #4
766    ext       v18.16b, v28.16b , v16.16b , #6
767    ext       v20.16b, v28.16b , v16.16b , #8
768    ext       v28.16b, v28.16b , v16.16b , #2
769    add       v12.8h, v12.8h , v18.8h
770    add       v18.8h, v28.8h , v20.8h
771
772    smlal     v30.4s, v12.4h, v26.4h
773    smlsl     v30.4s, v18.4h, v24.4h
776
777
778    mov       v12.8b, v27.8b            // v12 = row C output, ready to store
779    mov       v27.8b, v26.8b            // put the constant 20 back into v27 (the value the prologue gave it)
780
781    sqrshrun  v16.4h, v30.4s, #10
782    // rotate the row registers so that rows r4..r8 become r0..r4 of the next pass
783    mov       v6.16b, v2.16b            // r7 -> v6/v7
784    mov       v7.16b, v3.16b
785
787
788    mov       v2.16b, v10.16b           // r5 -> v2/v3
789    mov       v3.16b, v11.16b
790
791    mov       v10.16b, v0.16b           // r6 parked in v10/v11 until v4/v5 are free
792    mov       v11.16b, v1.16b
793
794    subs      x4, x4, #4                // four rows done; the flags are consumed by the bgt below
795    uqxtn     v13.8b, v16.8h            // v13 (bytes 0-3) = row D output
796
797    mov       v0.16b, v8.16b            // r4 -> v0/v1
798    mov       v1.16b, v9.16b
799
800    mov       v8.16b, v4.16b            // r8 -> v8/v9
801    mov       v9.16b, v5.16b
802
803
804    mov       v4.16b, v10.16b           // r6 -> v4/v5
805    mov       v5.16b, v11.16b
806
807
808    st1       {v12.s}[0], [x1], x3      // store output row C (4 bytes); x1 += dst_strd
809    st1       {v13.s}[0], [x1], x3      // store output row D (4 bytes); x1 += dst_strd
810
811    bgt       loop_4                    // repeat while the remaining row count in x4 is > 0; otherwise fall through to end_func
812
// end_func: common exit. loop_16 and loop_8 branch here; loop_4 falls through.
// Undoes the prologue in reverse order and returns.
813end_func:
814    // Restore registers in the reverse order of the prologue
815    ldp       x19, x20, [sp], #16       // reload x19/x20 and release their 16-byte stack slot
816    pop_v_regs                          // macro counterpart of push_v_regs (body not visible in this file)
817    ret
818
819
820
821