1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21//******************************************************************************
22//* @file
23//*  ih264_inter_pred_luma_horz_av8.s
24//*
25//* @brief
26//*  Contains function definitions for inter prediction interpolation.
27//*
28//* @author
29//*  Ittiam
30//*
31//* @par List of Functions:
32//*
33//*  - ih264_inter_pred_luma_horz_av8()
34//*
35//* @remarks
36//*  None
37//*
38//*******************************************************************************
39//*/
40
41///* All the functions here are replicated from ih264_inter_pred_filters.c
42//*/
43
44///**
45///**
46//*******************************************************************************
47//*
48//* @brief
49//*     Inter prediction luma filter for horizontal input
50//*
51//* @par Description:
52//* Applies a 6-tap horizontal filter. The output is clipped to 8 bits.
53//* See sec 8.4.2.2.1, titled "Luma sample interpolation process".
54//*
55//* @param[in] pu1_src
56//*  UWORD8 pointer to the source
57//*
58//* @param[out] pu1_dst
59//*  UWORD8 pointer to the destination
60//*
61//* @param[in] src_strd
62//*  integer source stride
63//*
64//* @param[in] dst_strd
65//*  integer destination stride
66//*
67//* @param[in] ht
68//*  integer height of the array
69//*
70//* @param[in] wd
71//*  integer width of the array
72//*
73//* @returns None
74//*
75//* @remarks
76//*  None
77//*
78//*******************************************************************************
79//*/
80
81//void ih264_inter_pred_luma_horz_av8 (
82//                            UWORD8 *pu1_src,
83//                            UWORD8 *pu1_dst,
84//                            WORD32 src_strd,
85//                            WORD32 dst_strd,
86//                            WORD32 ht,
87//                            WORD32 wd   )
88
89//**************Variables Vs Registers*****************************************
90//    x0 => *pu1_src
91//    x1 => *pu1_dst
92//    w2 =>  src_strd
93//    w3 =>  dst_strd
94//    w4 =>  ht
95//    w5 =>  wd
96
97.text
98.p2align 2
99
100.include "ih264_neon_macros.s"
101
102
103
104    .global ih264_inter_pred_luma_horz_av8
105
106ih264_inter_pred_luma_horz_av8:
107
108
109
110
111    // STMFD sp!, {x4-x12, x14}          //store register values to stack
112    push_v_regs
113    stp       x19, x20, [sp, #-16]!
114    sxtw      x2, w2
115    sxtw      x3, w3
116    sxtw      x4, w4
117    sxtw      x5, w5
118    sub       x0, x0, #2                //pu1_src-2
119    sub       x14, x4, #16
120    movi      v0.8b, #5                 //filter coeff
121    subs      x12, x5, #8               //if wd=8 branch to loop_8
122    movi      v1.8b, #20                //filter coeff
123    beq       loop_8
124
125    subs      x12, x5, #4               //if wd=4 branch to loop_4
126    beq       loop_4
127
128loop_16:                                //when  wd=16
129    //// Processing row0 and row1
130    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row0
131    add       x14, x14, #1              //for checking loop
132    ext       v31.8b, v2.8b , v3.8b, #5 ////extract a[5]                            (column1,row0)
133    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row1
134    ext       v30.8b, v3.8b , v4.8b, #5 ////extract a[5]                            (column2,row0)
135    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row0)
136    ext       v28.8b, v5.8b , v6.8b, #5 ////extract a[5]                            (column1,row1)
137    uaddl     v10.8h, v30.8b, v3.8b     //// a0 + a5                             (column2,row0)
138    ext       v27.8b, v6.8b , v7.8b, #5 ////extract a[5]                            (column2,row1)
139    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row1)
140    ext       v31.8b, v2.8b , v3.8b, #2 ////extract a[2]                            (column1,row0)
141    uaddl     v16.8h, v27.8b, v6.8b     //// a0 + a5                             (column2,row1)
142    ext       v30.8b, v3.8b , v4.8b, #2 ////extract a[2]                            (column2,row0)
143    umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row0)
144    ext       v28.8b, v5.8b , v6.8b, #2 ////extract a[2]                            (column1,row1)
145    umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row0)
146    ext       v27.8b, v6.8b , v7.8b, #2 ////extract a[2]                            (column2,row1)
147    umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row1)
148    ext       v31.8b, v2.8b , v3.8b, #3 ////extract a[3]                            (column1,row0)
149    umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row1)
150    ext       v30.8b, v3.8b , v4.8b, #3 ////extract a[3]                            (column2,row0)
151    umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
152    ext       v28.8b, v5.8b , v6.8b, #3 ////extract a[3]                            (column1,row1)
153    umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row0)
154    ext       v27.8b, v6.8b , v7.8b, #3 ////extract a[3]                            (column2,row1)
155    umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row1)
156    ext       v31.8b, v2.8b , v3.8b, #1 ////extract a[1]                            (column1,row0)
157    umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row1)
158    ext       v30.8b, v3.8b , v4.8b, #1 ////extract a[1]                            (column2,row0)
159    umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
160    ext       v28.8b, v5.8b , v6.8b, #1 ////extract a[1]                            (column1,row1)
161    umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row0)
162    ext       v27.8b, v6.8b , v7.8b, #1 ////extract a[1]                            (column2,row1)
163    umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row1)
164    ext       v31.8b, v2.8b , v3.8b, #4 ////extract a[4]                            (column1,row0)
165    umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row1)
166    ext       v30.8b, v3.8b , v4.8b, #4 ////extract a[4]                            (column2,row0)
167    umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
168    ext       v28.8b, v5.8b , v6.8b, #4 ////extract a[4]                            (column1,row1)
169    umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row0)
170    ext       v27.8b, v6.8b , v7.8b, #4 ////extract a[4]                            (column2,row1)
171    umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row1)
172    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row2
173    umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row1)
174
175    sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
176    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row3
177    sqrshrun  v21.8b, v10.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
178    ext       v31.8b, v2.8b , v3.8b, #5 ////extract a[5]                            (column1,row2)
179    st1       {v20.8b, v21.8b}, [x1], x3 ////Store dest row0
180    sqrshrun  v23.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row1)
181    ext       v30.8b, v3.8b , v4.8b, #5 ////extract a[5]                            (column2,row2)
182    sqrshrun  v24.8b, v16.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row1)
183
184
185
186//// Processing row2 and row3
187    ext       v28.8b, v5.8b , v6.8b, #5 ////extract a[5]                            (column1,row3)
188    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row2)
189    st1       {v23.8b, v24.8b}, [x1], x3 ////Store dest row1
190    uaddl     v10.8h, v30.8b, v3.8b     //// a0 + a5                             (column2,row2)
191    ext       v27.8b, v6.8b , v7.8b, #5 ////extract a[5]                            (column2,row3)
192    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row3)
193    ext       v31.8b, v2.8b , v3.8b, #2 ////extract a[2]                            (column1,row2)
194    uaddl     v16.8h, v27.8b, v6.8b     //// a0 + a5                             (column2,row3)
195    ext       v30.8b, v3.8b , v4.8b, #2 ////extract a[2]                            (column2,row2)
196    umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row2)
197    ext       v27.8b, v6.8b , v7.8b, #2 ////extract a[2]                            (column2,row3)
198    umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row2)
199    ext       v28.8b, v5.8b , v6.8b, #2 ////extract a[2]                            (column1,row3)
200    umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row3)
201    ext       v31.8b, v2.8b , v3.8b, #3 ////extract a[3]                            (column1,row2)
202    umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row3)
203    ext       v30.8b, v3.8b , v4.8b, #3 ////extract a[3]                            (column2,row2)
204    umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row2)
205    ext       v28.8b, v5.8b , v6.8b, #3 ////extract a[3]                            (column1,row3)
206    umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row2)
207    ext       v27.8b, v6.8b , v7.8b, #3 ////extract a[3]                            (column2,row3)
208    umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row3)
209    ext       v31.8b, v2.8b , v3.8b, #1 ////extract a[1]                            (column1,row2)
210    umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row3)
211    ext       v30.8b, v3.8b , v4.8b, #1 ////extract a[1]                            (column2,row2)
212    umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row2)
213    ext       v28.8b, v5.8b , v6.8b, #1 ////extract a[1]                            (column1,row3)
214    umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row2)
215    ext       v27.8b, v6.8b , v7.8b, #1 ////extract a[1]                            (column2,row3)
216    umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row3)
217    ext       v31.8b, v2.8b , v3.8b, #4 ////extract a[4]                            (column1,row2)
218    umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row3)
219    ext       v30.8b, v3.8b , v4.8b, #4 ////extract a[4]                            (column2,row2)
220    umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row2)
221    ext       v28.8b, v5.8b , v6.8b, #4 ////extract a[4]                            (column1,row3)
222    umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row2)
223    ext       v27.8b, v6.8b , v7.8b, #4 ////extract a[4]                            (column2,row3)
224    umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row3)
225    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row4
226    umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row3)
227
228    sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row2)
229    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row5
230    sqrshrun  v21.8b, v10.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row2)
231    ext       v31.8b, v2.8b , v3.8b, #5 ////extract a[5]                            (column1,row4)
232    st1       {v20.8b, v21.8b}, [x1], x3 ////Store dest row2
233    sqrshrun  v23.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row3)
234    ext       v30.8b, v3.8b , v4.8b, #5 ////extract a[5]                            (column2,row4)
235    sqrshrun  v24.8b, v16.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row3)
236
237
238//// Processing row4 and row5
239    ext       v28.8b, v5.8b , v6.8b, #5 ////extract a[5]                            (column1,row5)
240    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row4)
241    st1       {v23.8b, v24.8b}, [x1], x3 ////Store dest row3
242    uaddl     v10.8h, v30.8b, v3.8b     //// a0 + a5                             (column2,row4)
243    ext       v27.8b, v6.8b , v7.8b, #5 ////extract a[5]                            (column2,row5)
244    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row5)
245    ext       v31.8b, v2.8b , v3.8b, #2 ////extract a[2]                            (column1,row4)
246    uaddl     v16.8h, v27.8b, v6.8b     //// a0 + a5                             (column2,row5)
247    ext       v30.8b, v3.8b , v4.8b, #2 ////extract a[2]                            (column2,row4)
248    umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row4)
249    ext       v27.8b, v6.8b , v7.8b, #2 ////extract a[2]                            (column2,row5)
250    umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row4)
251    ext       v28.8b, v5.8b , v6.8b, #2 ////extract a[2]                            (column1,row5)
252    umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row5)
253    ext       v31.8b, v2.8b , v3.8b, #3 ////extract a[3]                            (column1,row4)
254    umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row5)
255    ext       v30.8b, v3.8b , v4.8b, #3 ////extract a[3]                            (column2,row4)
256    umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row4)
257    ext       v28.8b, v5.8b , v6.8b, #3 ////extract a[3]                            (column1,row5)
258    umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row4)
259    ext       v27.8b, v6.8b , v7.8b, #3 ////extract a[3]                            (column2,row5)
260    umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row5)
261    ext       v31.8b, v2.8b , v3.8b, #1 ////extract a[1]                            (column1,row4)
262    umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row5)
263    ext       v30.8b, v3.8b , v4.8b, #1 ////extract a[1]                            (column2,row4)
264    umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row4)
265    ext       v28.8b, v5.8b , v6.8b, #1 ////extract a[1]                            (column1,row5)
266    umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row4)
267    ext       v27.8b, v6.8b , v7.8b, #1 ////extract a[1]                            (column2,row5)
268    umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row4)
269    ext       v31.8b, v2.8b , v3.8b, #4 ////extract a[4]                            (column1,row4)
270    umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row5)
271    ext       v30.8b, v3.8b , v4.8b, #4 ////extract a[4]                            (column2,row4)
272    umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row4)
273    ext       v28.8b, v5.8b , v6.8b, #4 ////extract a[4]                            (column1,row5)
274    umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row4)
275    ext       v27.8b, v6.8b , v7.8b, #4 ////extract a[4]                            (column2,row5)
276    umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row5)
277    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row6
278    umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row5)
279
280    sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row4)
281    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row7
282    sqrshrun  v21.8b, v10.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row4)
283    ext       v31.8b, v2.8b , v3.8b, #5 ////extract a[5]                            (column1,row6)
284    st1       {v20.8b, v21.8b}, [x1], x3 ////Store dest row2
285    sqrshrun  v23.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row5)
286    ext       v30.8b, v3.8b , v4.8b, #5 ////extract a[5]                            (column2,row6)
287    sqrshrun  v24.8b, v16.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row5)
288
289
290
291    //// Processing row6 and row7
292
293    ext       v28.8b, v5.8b , v6.8b, #5 ////extract a[5]                            (column1,row7)
294    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row6)
295    st1       {v23.8b, v24.8b}, [x1], x3 ////Store dest row5
296    uaddl     v10.8h, v30.8b, v3.8b     //// a0 + a5                             (column2,row6)
297    ext       v27.8b, v6.8b , v7.8b, #5 ////extract a[5]                            (column2,row7)
298    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row7)
299    ext       v31.8b, v2.8b , v3.8b, #2 ////extract a[2]                            (column1,row6)
300    uaddl     v16.8h, v27.8b, v6.8b     //// a0 + a5                             (column2,row7)
301    ext       v30.8b, v3.8b , v4.8b, #2 ////extract a[2]                            (column2,row6)
302    umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row6)
303    ext       v27.8b, v6.8b , v7.8b, #2 ////extract a[2]                            (column2,row7)
304    umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row6)
305    ext       v28.8b, v5.8b , v6.8b, #2 ////extract a[2]                            (column1,row7)
306    umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row7)
307    ext       v31.8b, v2.8b , v3.8b, #3 ////extract a[3]                            (column1,row6)
308    umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row7)
309    ext       v30.8b, v3.8b , v4.8b, #3 ////extract a[3]                            (column2,row6)
310    umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row6)
311    ext       v28.8b, v5.8b , v6.8b, #3 ////extract a[3]                            (column1,row7)
312    umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row6)
313    ext       v27.8b, v6.8b , v7.8b, #3 ////extract a[3]                            (column2,row7)
314    umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row7)
315    ext       v31.8b, v2.8b , v3.8b, #1 ////extract a[1]                            (column1,row6)
316    umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row7)
317    ext       v30.8b, v3.8b , v4.8b, #1 ////extract a[1]                            (column2,row6)
318    umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row6)
319    ext       v28.8b, v5.8b , v6.8b, #1 ////extract a[1]                            (column1,row7)
320    umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row6)
321    ext       v27.8b, v6.8b , v7.8b, #1 ////extract a[1]                            (column2,row7)
322    umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row6)
323    ext       v31.8b, v2.8b , v3.8b, #4 ////extract a[4]                            (column1,row6)
324    umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row7)
325    ext       v30.8b, v3.8b , v4.8b, #4 ////extract a[4]                            (column2,row6)
326    umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row6)
327    ext       v28.8b, v5.8b , v6.8b, #4 ////extract a[4]                            (column1,row7)
328    umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row6)
329    ext       v27.8b, v6.8b , v7.8b, #4 ////extract a[4]                            (column2,row6)
330
331    sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row6)
332    umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row7)
333    sqrshrun  v21.8b, v10.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row6)
334    umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row7)
335    sqrshrun  v23.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row7)
336    st1       {v20.8b, v21.8b}, [x1], x3 ////Store dest row6
337    sqrshrun  v24.8b, v16.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row7)
338    subs      x12, x14, #1              // if height==16  - looping
339    st1       {v23.8b, v24.8b}, [x1], x3 ////Store dest row7
340
341
342
343    beq       loop_16
344    b         end_func
345
346
347
348loop_8:
349//// Processing row0 and row1
350
351
352    ld1       {v5.8b, v6.8b}, [x0], x2  //// Load row1
353    add       x14, x14, #1              //for checking loop
354    ext       v28.8b, v5.8b , v6.8b, #5 ////extract a[5]                            (column1,row1)
355    ld1       {v2.8b, v3.8b}, [x0], x2  //// Load row0
356    ext       v25.8b, v5.8b , v6.8b, #2 ////extract a[2]                            (column1,row1)
357    ext       v31.8b, v2.8b , v3.8b, #5 ////extract a[5]                            (column1,row0)
358    ext       v24.8b, v5.8b , v6.8b, #3 ////extract a[3]                            (column1,row1)
359    ext       v23.8b, v5.8b , v6.8b, #1 ////extract a[1]                            (column1,row1)
360    ext       v22.8b, v5.8b , v6.8b, #4 ////extract a[4]                            (column1,row1)
361    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row1)
362    ext       v29.8b, v2.8b , v3.8b, #3 ////extract a[3]                            (column1,row0)
363    umlal     v14.8h, v25.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row1)
364    umlal     v14.8h, v24.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row1)
365    umlsl     v14.8h, v23.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row1)
366    umlsl     v14.8h, v22.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row1)
367    ext       v30.8b, v2.8b , v3.8b, #2 ////extract a[2]                            (column1,row0)
368    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row0)
369    ext       v27.8b, v2.8b , v3.8b, #1 ////extract a[1]                            (column1,row0)
370    ext       v26.8b, v2.8b , v3.8b, #4 ////extract a[4]                            (column1,row0)
371    ld1       {v2.8b, v3.8b}, [x0], x2  //// Load row2
372    umlal     v8.8h, v29.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
373    umlal     v8.8h, v30.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row0)
374    umlsl     v8.8h, v27.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
375    umlsl     v8.8h, v26.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
376    ld1       {v5.8b, v6.8b}, [x0], x2  //// Load row3
377    sqrshrun  v23.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
378
379    //// Processing row2 and row3
380    ext       v28.8b, v5.8b , v6.8b, #5 ////extract a[5]                            (column1,row3)
381    ext       v25.8b, v5.8b , v6.8b, #2 ////extract a[2]                            (column1,row3)
382    ext       v31.8b, v2.8b , v3.8b, #5 ////extract a[5]                            (column1,row2)
383    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row3)
384    st1       {v23.8b}, [x1], x3        ////Store dest row0
385    ext       v24.8b, v5.8b , v6.8b, #3 ////extract a[3]                            (column1,row2)
386    ext       v23.8b, v5.8b , v6.8b, #1 ////extract a[1]                            (column1,row3)
387    sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row1)
388    ext       v22.8b, v5.8b , v6.8b, #4 ////extract a[4]                            (column1,row3)
389    ext       v29.8b, v2.8b , v3.8b, #3 ////extract a[3]                            (column1,row2)
390    umlal     v14.8h, v25.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row3)
391    umlal     v14.8h, v24.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row3)
392    umlsl     v14.8h, v23.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row3)
393    umlsl     v14.8h, v22.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row3)
394    st1       {v20.8b}, [x1], x3        ////Store dest row1
395    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row2)
396    ext       v30.8b, v2.8b , v3.8b, #2 ////extract a[2]                            (column1,row2)
397    ext       v27.8b, v2.8b , v3.8b, #1 ////extract a[1]                            (column1,row2)
398    ext       v26.8b, v2.8b , v3.8b, #4 ////extract a[4]                            (column1,row2)
399    ld1       {v2.8b, v3.8b}, [x0], x2  //// Load row4
400    umlal     v8.8h, v29.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row2)
401    umlal     v8.8h, v30.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row2)
402    umlsl     v8.8h, v27.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row2)
403    umlsl     v8.8h, v26.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row2)
404    ld1       {v5.8b, v6.8b}, [x0], x2  //// Load row3
405    subs      x9, x4, #4
406    sqrshrun  v23.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row3)
407    ext       v28.8b, v5.8b , v6.8b, #5 ////extract a[5]                            (column1,row5)
408    ext       v25.8b, v5.8b , v6.8b, #2 ////extract a[2]                            (column1,row5)
409    ext       v31.8b, v2.8b , v3.8b, #5 ////extract a[5]                            (column1,row4)
410    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row5)
411    ext       v24.8b, v5.8b , v6.8b, #3 ////extract a[3]                            (column1,row5)
412    sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row2)
413    ext       v22.8b, v5.8b , v6.8b, #4 ////extract a[4]                            (column1,row5)
414    ext       v29.8b, v2.8b , v3.8b, #3 ////extract a[3]                            (column1,row4)
415    st1       {v20.8b}, [x1], x3        ////Store dest row2
416    ext       v30.8b, v2.8b , v3.8b, #2 ////extract a[2]                            (column1,row4)
417    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row4)
418    st1       {v23.8b}, [x1], x3        ////Store dest row3
419    beq       end_func                  // Branch if height==4
420
421//// Processing row4 and row5
422    ext       v23.8b, v5.8b , v6.8b, #1 ////extract a[1]                            (column1,row5)
423    umlal     v14.8h, v25.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row5)
424    umlal     v14.8h, v24.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row5)
425    umlsl     v14.8h, v23.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row5)
426    umlsl     v14.8h, v22.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row5)
427    ext       v27.8b, v2.8b , v3.8b, #1 ////extract a[1]                            (column1,row4)
428    ext       v26.8b, v2.8b , v3.8b, #4 ////extract a[4]                            (column1,row4)
429    ld1       {v2.8b, v3.8b}, [x0], x2  //// Load row6
430    umlal     v8.8h, v29.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row4)
431    umlal     v8.8h, v30.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row4)
432    umlsl     v8.8h, v27.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row4)
433    umlsl     v8.8h, v26.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row4)
434    sqrshrun  v23.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row5)
435    ld1       {v5.8b, v6.8b}, [x0], x2  //// Load row7
436    ext       v31.8b, v2.8b , v3.8b, #5 ////extract a[5]                            (column1,row6)
437    ext       v28.8b, v5.8b , v6.8b, #5 ////extract a[5]                            (column1,row7)
438    ext       v25.8b, v5.8b , v6.8b, #2 ////extract a[2]                            (column1,row7)
439    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row7)
440    ext       v24.8b, v5.8b , v6.8b, #3 ////extract a[3]                            (column1,row7)
441    ext       v22.8b, v5.8b , v6.8b, #4 ////extract a[4]                            (column1,row7)
442    sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row4)
443    ext       v29.8b, v2.8b , v3.8b, #3 ////extract a[3]                            (column1,row6)
444    ext       v30.8b, v2.8b , v3.8b, #2 ////extract a[2]                            (column1,row6)
445    st1       {v20.8b}, [x1], x3        ////Store dest row4
446    ext       v27.8b, v2.8b , v3.8b, #1 ////extract a[1]                            (column1,row6)
447    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row6)
448    ext       v26.8b, v2.8b , v3.8b, #4 ////extract a[4]                            (column1,row6)
449    umlal     v8.8h, v29.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row6)
450    umlal     v8.8h, v30.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row6)
451    umlsl     v8.8h, v27.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row6)
452    umlsl     v8.8h, v26.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row6)
453    //// Processing row6 and row7
454    st1       {v23.8b}, [x1], x3        ////Store dest row5
455    ext       v23.8b, v5.8b , v6.8b, #1 ////extract a[1]                            (column1,row7)
456    umlal     v14.8h, v25.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row7)
457    umlal     v14.8h, v24.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row7)
458    umlsl     v14.8h, v23.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row7)
459    umlsl     v14.8h, v22.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row7)
460    sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row6)
461    subs      x12, x14, #1
462    st1       {v20.8b}, [x1], x3        ////Store dest row6
463    sqrshrun  v23.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row7)
464    st1       {v23.8b}, [x1], x3        ////Store dest row7
465
466    beq       loop_8                    //looping if height ==16
467
468    b         end_func
// loop_4: 6-tap horizontal luma filter producing 4 output pixels per row,
// 4 rows per pass (presumably the wd == 4 path; the dispatch to this label
// is outside this excerpt -- TODO confirm).
//
// Registers as used below:
//   x0  source row pointer, post-incremented by x2 after every row load
//   x1  destination row pointer, post-incremented by x3 after every store
//   x4  loop control value (presumably the block height -- see the exit test)
//   v0  multiplier for the a1/a4 taps, v1 multiplier for the a2/a3 taps
//       (expected to hold 5 and 20 respectively; they are initialised
//       before this excerpt -- TODO confirm)
//
// Per row, with a[k] = source byte at offset k from the loaded address:
//   acc = a0 + a5 + 20*a2 + 20*a3 - 5*a1 - 5*a4      (16-bit lanes)
//   out = saturate_u8((acc + 16) >> 5)               (sqrshrun #5)
// All 8 lanes are computed, but only the low 32 bits (4 pixels) of each
// result are stored, so only source bytes 0..8 of a row affect the output
// even though 16 bytes are loaded.
//
// Rows are numbered here in load/store order: the first row loaded is row0.
// Two rows are in flight at a time (v14 and v8 accumulators) and the
// instructions of neighbouring rows are interleaved.
469loop_4:
470    ld1       {v5.8b, v6.8b}, [x0], x2  //// Load row0 (16 bytes) into v5:v6, x0 += x2
471    ext       v28.8b, v5.8b , v6.8b, #5 ////extract a[5]                            (column1,row0)
472    ld1       {v2.8b, v3.8b}, [x0], x2  //// Load row1 (16 bytes) into v2:v3, x0 += x2
473    ext       v25.8b, v5.8b , v6.8b, #2 ////extract a[2]                            (column1,row0)
474    ext       v31.8b, v2.8b , v3.8b, #5 ////extract a[5]                            (column1,row1)
475    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row0)
476    ext       v24.8b, v5.8b , v6.8b, #3 ////extract a[3]                            (column1,row0)
477    ext       v23.8b, v5.8b , v6.8b, #1 ////extract a[1]                            (column1,row0)
478    ext       v22.8b, v5.8b , v6.8b, #4 ////extract a[4]                            (column1,row0)
479    ext       v29.8b, v2.8b , v3.8b, #3 ////extract a[3]                            (column1,row1)
480    umlal     v14.8h, v25.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row0)
481    umlal     v14.8h, v24.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
482    umlsl     v14.8h, v23.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
483    umlsl     v14.8h, v22.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
484    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row1)
485    ext       v30.8b, v2.8b , v3.8b, #2 ////extract a[2]                            (column1,row1)
486    ext       v27.8b, v2.8b , v3.8b, #1 ////extract a[1]                            (column1,row1)
487    ext       v26.8b, v2.8b , v3.8b, #4 ////extract a[4]                            (column1,row1)
488    ld1       {v2.8b, v3.8b}, [x0], x2  //// Load row2; all row1 taps were extracted above, so v2:v3 is free
489    umlal     v8.8h, v29.8b, v1.8b      //// a0 + a5 + 20a3                         (column1,row1)
490    umlal     v8.8h, v30.8b, v1.8b      //// a0 + a5 + 20a3 + 20a2                  (column1,row1)
491    umlsl     v8.8h, v27.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row1)
492    umlsl     v8.8h, v26.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row1)
493    ld1       {v5.8b, v6.8b}, [x0], x2  //// Load row3; the row0 accumulator (v14) is complete, so v5:v6 is free
494    ext       v28.8b, v5.8b , v6.8b, #5 ////extract a[5]                            (column1,row3)
495    ext       v25.8b, v5.8b , v6.8b, #2 ////extract a[2]                            (column1,row3)
496    sqrshrun  v23.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5, saturated to u8   (column1,row0)
497    ext       v31.8b, v2.8b , v3.8b, #5 ////extract a[5]                            (column1,row2)
498    ext       v24.8b, v5.8b , v6.8b, #3 ////extract a[3]                            (column1,row3)
499    st1       {v23.s}[0], [x1], x3      ////Store low 4 bytes as dest row0, x1 += x3
500    ext       v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (v23 reused now that row0 is stored)  (column1,row3)
501    ext       v22.8b, v5.8b , v6.8b, #4 ////extract a[4]                            (column1,row3)
502    ext       v29.8b, v2.8b , v3.8b, #3 ////extract a[3]                            (column1,row2)
503    sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5, saturated to u8   (column1,row1)
504    ext       v30.8b, v2.8b , v3.8b, #2 ////extract a[2]                            (column1,row2)
505    ext       v27.8b, v2.8b , v3.8b, #1 ////extract a[1]                            (column1,row2)
506
507    //// Finish row1, then accumulate and store row2 and row3
508    st1       {v20.s}[0], [x1], x3      ////Store low 4 bytes as dest row1, x1 += x3
509    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row3)
510    ext       v26.8b, v2.8b , v3.8b, #4 ////extract a[4]                            (column1,row2)
511    umlal     v14.8h, v25.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row3)
512    umlal     v14.8h, v24.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row3)
513    umlsl     v14.8h, v23.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row3)
514    umlsl     v14.8h, v22.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row3)
515    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row2)
516    umlal     v8.8h, v29.8b, v1.8b      //// a0 + a5 + 20a3                         (column1,row2)
517    umlal     v8.8h, v30.8b, v1.8b      //// a0 + a5 + 20a3 + 20a2                  (column1,row2)
518    umlsl     v8.8h, v27.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row2)
519    umlsl     v8.8h, v26.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row2)
520    sqrshrun  v23.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5, saturated to u8   (column1,row3)
521    sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5, saturated to u8   (column1,row2)
522    st1       {v20.s}[0], [x1], x3      ////Store low 4 bytes as dest row2, x1 += x3
523    subs      x4, x4, #8                // x4 -= 8; Z is set only when x4 was exactly 8 before this pass
524    st1       {v23.s}[0], [x1], x3      ////Store low 4 bytes as dest row3, x1 += x3 (st1 leaves the flags untouched)
525    beq       loop_4                    // one extra 4-row pass only if x4 hit 0; on that pass x4 becomes -8 and control falls through to end_func
526
// end_func: exit path of the routine. Reached by the explicit branch that
// follows loop_8 and by fall-through from loop_4. Unwinds the stack and
// returns; no value is placed in a result register here.
527end_func:
528    // The ARMv7 original restored its registers with: LDMFD sp!,{x4-x12,PC}
529    ldp       x19, x20, [sp], #16       // reload x19/x20 from [sp], then sp += 16 (the matching store is outside this excerpt)
530    pop_v_regs                          // macro defined elsewhere; presumably restores the callee-saved SIMD registers (this routine writes v8 and v14) -- TODO confirm
531    ret                                 // return to the address in x30
532
533
534
535