@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@******************************************************************************
@* @file
@*  ihevc_inter_pred_filters_luma_vert.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded using neon intrinsics and can be compiled using
@*  rvct
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*
@*  - ihevc_inter_pred_luma_vert()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
@/* include reconstruction */



@/**
@*******************************************************************************
@*
@* @brief
@*     interprediction luma filter for vertical input
@*
@* @par description:
@*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@*    the elements pointed by 'pu1_src' and writes to the location pointed by
@*    'pu1_dst'. the output is downshifted by 6 and clipped to 8 bits.
@*    assumptions: the function is optimized assuming that width is a
@*    multiple of 4 or 8 and height is a multiple of 2.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_vert (
@                            uword8 *pu1_src,
@                            uword8 *pu1_dst,
@                            word32 src_strd,
@                            word32 dst_strd,
@                            word8 *pi1_coeff,
@                            word32 ht,
@                            word32 wd   )

@**************variables vs registers*****************************************
@   r0 => *pu1_src
@   r1 => *pu1_dst
@   r2 =>  src_strd
@   r6 =>  dst_strd
@   r12 => *pi1_coeff
@   r5 =>  ht
@   r3 =>  wd

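@/* for reference, a minimal c sketch of what the routine below computes:
@   an 8-tap vertical filter with a rounding downshift by 6 and clipping to
@   8 bits. variable names and the CLIP_U8() helper are illustrative only,
@   not taken from ihevc_inter_pred_filters.c.
@
@   for(row = 0; row < ht; row++)
@   {
@       for(col = 0; col < wd; col++)
@       {
@           word32 sum = 0;
@           for(i = 0; i < 8; i++)
@               sum += pi1_coeff[i] * pu1_src[col + (i - 3) * src_strd];
@           pu1_dst[col] = CLIP_U8((sum + 32) >> 6);    /* vqrshrun #6 */
@       }
@       pu1_src += src_strd;
@       pu1_dst += dst_strd;
@   }
@
@   the assembly works on the absolute values of the signed coefficients
@   (vabs.s8) and applies the coefficient signs through the vmlal/vmlsl
@   (accumulate/subtract) pattern of the hevc luma filter.
@*/
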
.equ    coeff_offset,   104
.equ    ht_offset,      108
.equ    wd_offset,      112

.text
.align 4
.syntax unified



.globl ihevc_inter_pred_luma_vert_a9q

.type ihevc_inter_pred_luma_vert_a9q, %function

ihevc_inter_pred_luma_vert_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush        {d8 - d15}

    ldr         r12,[sp,#coeff_offset]                @load pi1_coeff
    mov         r6,r3
    ldr         r5,[sp,#wd_offset]                 @load wd
    vld1.u8     {d0},[r12]                  @coeff = vld1_s8(pi1_coeff)
    sub         r12,r2,r2,lsl #2            @r12 = -3*src_strd
    vabs.s8     d0,d0                       @vabs_s8(coeff)
    add         r0,r0,r12                   @pu1_src -= 3*src_strd
    ldr         r3,[sp,#ht_offset]                 @load ht
    subs        r7,r3,#0                    @r7 = ht, set flags
    @ble        end_loops           @end loop jump
    vdup.u8     d22,d0[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
    cmp         r5,#8
    vdup.u8     d23,d0[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
    vdup.u8     d24,d0[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
    vdup.u8     d25,d0[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
    vdup.u8     d26,d0[4]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
    vdup.u8     d27,d0[5]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
    vdup.u8     d28,d0[6]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
    vdup.u8     d29,d0[7]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
    blt         core_loop_wd_4              @core loop wd 4 jump
    str         r0, [sp, #-4]!
    str         r1, [sp, #-4]!

    bic         r4,r5,#7                    @r5 ->wd
    rsb         r9,r4,r6,lsl #2             @r6->dst_strd   r5  ->wd
    rsb         r8,r4,r2,lsl #2             @r2->src_strd
    mov         r3, r5, lsr #3              @divide wd by 8
    mul         r7, r3                      @r7 = ht * (wd/8), total 8-wide row segments
    sub         r7, #4                      @subtract one iteration (4 row segments) for the epilog

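@ the wd >= 8 path below is software pipelined: 'prolog' primes the loads and
@ the first set of accumulations, each 'kernel_8' iteration then produces four
@ rows of one 8-pixel wide column strip (stepping across the width and then
@ down by four rows), and 'epilog'/'epilog_end' drain the results that are
@ still in flight when the counters run out.
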
prolog:

    and         r10, r0, #31
    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vld1.u8     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    subs        r4,r4,#8
    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@


    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    addle       r0,r0,r8
    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    bicle       r4,r5,#7                    @r5 ->wd
    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    pld         [r3]
    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    pld         [r3, r2]
    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    pld         [r3, r2, lsl #1]
    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@

    add         r3, r3, r2
    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@

    pld         [r3, r2, lsl #1]
    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8     {d1},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q6,d3,d23
    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q6,d2,d22
    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q6,d4,d24
    vmlal.u8    q6,d5,d25
    vmlal.u8    q6,d6,d26
    vmlsl.u8    q6,d7,d27
    vmlal.u8    q6,d16,d28
    vmlsl.u8    q6,d17,d29
    add         r14,r1,r6
    vst1.8      {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@
    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@
    addle       r1,r1,r9

    vmull.u8    q7,d4,d23
    subs        r7,r7,#4
    vmlsl.u8    q7,d3,d22
    vmlsl.u8    q7,d5,d24
    vmlal.u8    q7,d6,d25
    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d7,d26
    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d16,d27
    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d17,d28
    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d18,d29
    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vst1.8      {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@
    vqrshrun.s16 d12,q6,#6


    blt         epilog_end                  @jumps to epilog_end
    beq         epilog                      @jumps to epilog

kernel_8:

    subs        r4,r4,#8
    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@

    addle       r0,r0,r8
    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@

    bicle       r4,r5,#7                    @r5 ->wd
    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@

    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@

    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@

    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@

    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@

    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.8      {d12},[r14],r6

@   and         r11, r0, #31
    vqrshrun.s16 d14,q7,#6

    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    vld1.u8     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@

    vst1.8      {d14},[r14],r6
    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@

    add         r14,r1,#0
    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@

    add         r1, r1, #8
    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@

    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    addle       r1,r1,r9
    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@

@   cmp         r11, r10
    vmull.u8    q6,d3,d23

    add         r10, r3, r2, lsl #3         @ 10*strd - 8+2
    vmlsl.u8    q6,d2,d22

    add         r10, r10, r2                @ 11*strd
    vmlsl.u8    q6,d4,d24

    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q6,d5,d25

    vmlal.u8    q6,d6,d26
    vst1.8      {d8},[r14],r6               @vst1_u8(pu1_dst,sto_res)@

    pld         [r10]                       @11+ 0
    vmlsl.u8    q6,d7,d27

    pld         [r10, r2]                   @11+ 1*strd
    vmlal.u8    q6,d16,d28

    pld         [r10, r2, lsl #1]           @11+ 2*strd
    vmlsl.u8    q6,d17,d29

    add         r10, r10, r2                @12*strd
    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@

    pld         [r10, r2, lsl #1]           @11+ 3*strd
    vmull.u8    q7,d4,d23

@   mov         r10, r11
    vmlsl.u8    q7,d3,d22

    subs        r7,r7,#4
    vmlsl.u8    q7,d5,d24

    vmlal.u8    q7,d6,d25
    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d7,d26
    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d16,d27
    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d17,d28
    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d18,d29
    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vqrshrun.s16 d12,q6,#6
    vst1.8      {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@



    bgt         kernel_8                    @jumps to kernel_8

epilog:

    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.8      {d12},[r14],r6

    vqrshrun.s16 d14,q7,#6

    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
    vst1.8      {d14},[r14],r6

    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q6,d3,d23
    vmlsl.u8    q6,d2,d22
    vmlsl.u8    q6,d4,d24
    vmlal.u8    q6,d5,d25
    vmlal.u8    q6,d6,d26
    vmlsl.u8    q6,d7,d27
    vmlal.u8    q6,d16,d28
    vmlsl.u8    q6,d17,d29
    add         r14,r1,r6
    vst1.8      {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@
    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q7,d4,d23
    vmlsl.u8    q7,d3,d22
    vmlsl.u8    q7,d5,d24
    vmlal.u8    q7,d6,d25
    vmlal.u8    q7,d7,d26
    vmlsl.u8    q7,d16,d27
    vmlal.u8    q7,d17,d28
    vmlsl.u8    q7,d18,d29

    vst1.8      {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@
    vqrshrun.s16 d12,q6,#6

epilog_end:
    vst1.8      {d12},[r14],r6
    vqrshrun.s16 d14,q7,#6

    vst1.8      {d14},[r14],r6


end_loops:
    tst         r5,#7
    ldr         r1, [sp], #4
    ldr         r0, [sp], #4

    beq         end1

    mov         r5, #4
    add         r0, r0, #8
    add         r1, r1, #8
    mov         r7, #16
    @

core_loop_wd_4:
    rsb         r9,r5,r6,lsl #2             @r6->dst_strd   r5  ->wd
    rsb         r8,r5,r2,lsl #2             @r2->src_strd
    vmov.i8     d4,#0

outer_loop_wd_4:
    subs        r12,r5,#0
    ble         end_inner_loop_wd_4         @outer loop jump

inner_loop_wd_4:
    add         r3,r0,r2
    vld1.u32    {d4[1]},[r3],r2             @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    subs        r12,r12,#4
    vdup.u32    d5,d4[1]                    @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vld1.u32    {d5[1]},[r3],r2             @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vld1.u32    {d4[0]},[r0]                @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)@
    vmull.u8    q0,d5,d23                   @mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)@

    vdup.u32    d6,d5[1]                    @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    add         r0,r0,#4
    vld1.u32    {d6[1]},[r3],r2             @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlsl.u8    q0,d4,d22                   @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)@

    vdup.u32    d7,d6[1]                    @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vld1.u32    {d7[1]},[r3],r2             @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8    q0,d6,d24                   @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)@

    vmull.u8    q4,d7,d23
    vdup.u32    d4,d7[1]                    @src_tmp1 = vdup_lane_u32(src_tmp4, 1)@
    vmull.u8    q1,d7,d25                   @mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)@
    vld1.u32    {d4[1]},[r3],r2             @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    vmlsl.u8    q4,d6,d22
    vmlal.u8    q0,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)@

    vdup.u32    d5,d4[1]                    @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vmlsl.u8    q4,d4,d24
    vld1.u32    {d5[1]},[r3],r2             @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vmlsl.u8    q1,d5,d27                   @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)@

    vdup.u32    d6,d5[1]                    @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    vmlal.u8    q4,d5,d25
    vld1.u32    {d6[1]},[r3],r2             @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlal.u8    q0,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)@

    vdup.u32    d7,d6[1]                    @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vmlal.u8    q4,d6,d26
    vld1.u32    {d7[1]},[r3],r2             @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8    q1,d7,d29                   @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)@

    vdup.u32    d4,d7[1]
    vadd.i16    q0,q0,q1                    @mul_res1 = vaddq_u16(mul_res1, mul_res2)@

    vmlsl.u8    q4,d7,d27
    vld1.u32    {d4[1]},[r3],r2
    vmlal.u8    q4,d4,d28
    vdup.u32    d5,d4[1]
    vqrshrun.s16 d0,q0,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u32    {d5[1]},[r3]
    add         r3,r1,r6
    vst1.32     {d0[0]},[r1]                @vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)@

    vmlsl.u8    q4,d5,d29
    vst1.32     {d0[1]},[r3],r6             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)@
    vqrshrun.s16 d8,q4,#6

    vst1.32     {d8[0]},[r3],r6
    add         r1,r1,#4
    vst1.32     {d8[1]},[r3]
    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        r7,r7,#4
    add         r1,r1,r9
    add         r0,r0,r8
    bgt         outer_loop_wd_4

end1:
    vpop         {d8 - d15}
    ldmfd       sp!, {r4-r12, r15}          @reload the registers from sp



@/**
@*******************************************************************************
@*
@* @brief
@*     interprediction luma filter for vertical 16bit output
@*
@* @par description:
@*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@*    the elements pointed by 'pu1_src' and writes to the location pointed by
@*    'pi2_dst'. no downshifting or clipping is done and the output is used as
@*    an input for weighted prediction.
@*    assumptions: the function is optimized assuming that width is a
@*    multiple of 4 or 8 and height is a multiple of 2.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_vert_w16out(uword8 *pu1_src,
@                                    word16 *pi2_dst,
@                                    word32 src_strd,
@                                    word32 dst_strd,
@                                    word8 *pi1_coeff,
@                                    word32 ht,
@                                    word32 wd   )

@**************variables vs registers*****************************************
@   r0 => *pu1_src
@   r1 => *pi2_dst
@   r2 =>  src_strd
@   r6 =>  dst_strd
@   r12 => *pi1_coeff
@   r5 =>  ht
@   r3 =>  wd


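@/* for reference, a minimal c sketch of what the routine below computes:
@   the same 8-tap vertical filter, but the 16-bit intermediate result is
@   stored as-is (no rounding, downshift or clipping), to be consumed by
@   weighted prediction. variable names are illustrative only.
@
@   for(row = 0; row < ht; row++)
@   {
@       for(col = 0; col < wd; col++)
@       {
@           word32 sum = 0;
@           for(i = 0; i < 8; i++)
@               sum += pi1_coeff[i] * pu1_src[col + (i - 3) * src_strd];
@           pi2_dst[col] = (word16)sum;
@       }
@       pu1_src += src_strd;
@       pi2_dst += dst_strd;
@   }
@*/
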

.globl ihevc_inter_pred_luma_vert_w16out_a9q

.type ihevc_inter_pred_luma_vert_w16out_a9q, %function

ihevc_inter_pred_luma_vert_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush        {d8 - d15}

    ldr         r12,[sp,#coeff_offset]                @load pi1_coeff
    mov         r6,r3
    ldr         r5,[sp,#wd_offset]                 @load wd
    vld1.u8     {d0},[r12]                  @coeff = vld1_s8(pi1_coeff)
    sub         r12,r2,r2,lsl #2            @r12 = -3*src_strd
    vabs.s8     d0,d0                       @vabs_s8(coeff)
    add         r0,r0,r12                   @pu1_src -= 3*src_strd
    ldr         r3,[sp,#ht_offset]                 @load ht
    subs        r7,r3,#0                    @r7 = ht, set flags
    @ble        end_loops_16out         @end loop jump
    vdup.u8     d22,d0[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
    cmp         r5,#8
    vdup.u8     d23,d0[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
    vdup.u8     d24,d0[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
    vdup.u8     d25,d0[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
    vdup.u8     d26,d0[4]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
    vdup.u8     d27,d0[5]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
    vdup.u8     d28,d0[6]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
    vdup.u8     d29,d0[7]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
    blt         core_loop_wd_4_16out        @core loop wd 4 jump
    str         r0, [sp, #-4]!
    str         r1, [sp, #-4]!

    bic         r4,r5,#7                    @r5 ->wd
    rsb         r9,r4,r6,lsl #2             @r6->dst_strd   r5  ->wd
    rsb         r8,r4,r2,lsl #2             @r2->src_strd
    mov         r6, r6, lsl #1              @dst stride in bytes (16-bit output)
    mov         r3, r5, lsr #3              @divide wd by 8
    mul         r7, r3                      @r7 = ht * (wd/8), total 8-wide row segments
    sub         r7, #4                      @subtract one iteration (4 row segments) for the epilog

prolog_16out:

    and         r10, r0, #31
    add         r3,r0,r2                    @pu1_src_tmp += src_strd@

    vld1.u8     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    subs        r4,r4,#8
    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@


    addle       r0,r0,r8
    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    bicle       r4,r5,#7                    @r5 ->wd
    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    pld         [r3]
    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    pld         [r3, r2]
    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    pld         [r3, r2, lsl #1]
    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
    add         r3, r3, r2
    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    pld         [r3, r2, lsl #1]
    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vmull.u8    q6,d3,d23
    vld1.u8     {d1},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q6,d2,d22
    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q6,d4,d24
    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q6,d5,d25
    vmlal.u8    q6,d6,d26
    vmlsl.u8    q6,d7,d27
    vmlal.u8    q6,d16,d28
    vmlsl.u8    q6,d17,d29
    add         r14,r1,r6
    vst1.8      {d8, d9},[r1]!              @vst1_u8(pu1_dst,sto_res)@
    @vqrshrun.s16 d10,q5,#6         @sto_res = vqmovun_s16(sto_res_tmp)@
    addle       r1,r1,r9,lsl #1

    vmull.u8    q7,d4,d23
    subs        r7,r7,#4
    vmlsl.u8    q7,d3,d22
    vmlsl.u8    q7,d5,d24
    vmlal.u8    q7,d6,d25
    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d7,d26
    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d16,d27
    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d17,d28
    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d18,d29
    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vst1.8      {d10, d11},[r14],r6         @vst1_u8(pu1_dst_tmp,sto_res)@
    @vqrshrun.s16 d12,q6,#6


    blt         epilog_end_16out
    beq         epilog_16out                @jumps to epilog

kernel_8_16out:

    subs        r4,r4,#8
    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@

    addle       r0,r0,r8
    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@

    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@

    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@

    bicle       r4,r5,#7                    @r5 ->wd
    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@

    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@

    vst1.8      {d12,d13},[r14],r6
    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@

    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@


@   and         r11, r0, #31
    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    vst1.8      {d14,d15},[r14],r6
    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    add         r14,r1,r6
    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@

    vld1.u8     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@

    vst1.8      {d8,d9},[r1]!               @vst1_u8(pu1_dst,sto_res)@
    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@

    addle       r1,r1,r9,lsl #1
    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@

@   cmp         r11, r10
    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    add         r10, r3, r2, lsl #3         @ 10*strd - 8+2
    vmull.u8    q6,d3,d23

    add         r10, r10, r2                @ 11*strd
    vmlsl.u8    q6,d2,d22

    pld         [r10]                       @11+ 0
    vmlsl.u8    q6,d4,d24

    pld         [r10, r2]                   @11+ 1*strd
    vmlal.u8    q6,d5,d25

    pld         [r10, r2, lsl #1]           @11+ 2*strd
    vmlal.u8    q6,d6,d26

    add         r10, r10, r2                @12*strd
    vmlsl.u8    q6,d7,d27

    pld         [r10, r2, lsl #1]           @11+ 3*strd
    vmlal.u8    q6,d16,d28

@   mov         r10, r11
    vmlsl.u8    q6,d17,d29

    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q7,d4,d23

    subs        r7,r7,#4
    vmlsl.u8    q7,d3,d22

    vst1.8      {d10, d11},[r14],r6         @vst1_u8(pu1_dst_tmp,sto_res)@
    vmlsl.u8    q7,d5,d24

    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d6,d25

    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d7,d26

    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d16,d27

    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d17,d28

    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d18,d29


    bgt         kernel_8_16out              @jumps to kernel_8

epilog_16out:

    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.8      {d12,d13},[r14],r6

    @vqrshrun.s16 d14,q7,#6

    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
    vst1.8      {d14,d15},[r14],r6

    @vqrshrun.s16 d8,q4,#6          @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q6,d3,d23
    vmlsl.u8    q6,d2,d22
    vmlsl.u8    q6,d4,d24
    vmlal.u8    q6,d5,d25
    vmlal.u8    q6,d6,d26
    vmlsl.u8    q6,d7,d27
    vmlal.u8    q6,d16,d28
    vmlsl.u8    q6,d17,d29
    add         r14,r1,r6
    vst1.8      {d8,d9},[r1]!               @vst1_u8(pu1_dst,sto_res)@
    @vqrshrun.s16 d10,q5,#6         @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q7,d4,d23
    vmlsl.u8    q7,d3,d22
    vmlsl.u8    q7,d5,d24
    vmlal.u8    q7,d6,d25
    vmlal.u8    q7,d7,d26
    vmlsl.u8    q7,d16,d27
    vmlal.u8    q7,d17,d28
    vmlsl.u8    q7,d18,d29

    vst1.8      {d10,d11},[r14],r6          @vst1_u8(pu1_dst_tmp,sto_res)@
    @vqrshrun.s16 d12,q6,#6

epilog_end_16out:
    vst1.8      {d12,d13},[r14],r6
    @vqrshrun.s16 d14,q7,#6

    vst1.8      {d14,d15},[r14],r6


end_loops_16out:
    tst         r5,#7
    ldr         r1, [sp], #4
    ldr         r0, [sp], #4

    beq         end2

    mov         r5, #4
    add         r0, r0, #8
    add         r1, r1, #16
    mov         r7, #16
    mov         r6, r6, lsr #1

    @

core_loop_wd_4_16out:
    rsb         r9,r5,r6,lsl #2             @r6->dst_strd   r5  ->wd
    rsb         r8,r5,r2,lsl #2             @r2->src_strd
    vmov.i8     d4,#0
    mov         r6, r6, lsl #1

outer_loop_wd_4_16out:
    subs        r12,r5,#0
    ble         end_inner_loop_wd_4_16out   @outer loop jump

inner_loop_wd_4_16out:
    add         r3,r0,r2
    vld1.u32    {d4[1]},[r3],r2             @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    subs        r12,r12,#4
    vdup.u32    d5,d4[1]                    @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vld1.u32    {d5[1]},[r3],r2             @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vld1.u32    {d4[0]},[r0]                @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)@
    vmull.u8    q0,d5,d23                   @mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)@

    vdup.u32    d6,d5[1]                    @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    add         r0,r0,#4
    vld1.u32    {d6[1]},[r3],r2             @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlsl.u8    q0,d4,d22                   @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)@

    vdup.u32    d7,d6[1]                    @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vld1.u32    {d7[1]},[r3],r2             @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8    q0,d6,d24                   @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)@

    vmull.u8    q4,d7,d23
    vdup.u32    d4,d7[1]                    @src_tmp1 = vdup_lane_u32(src_tmp4, 1)@
    vmull.u8    q1,d7,d25                   @mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)@
    vld1.u32    {d4[1]},[r3],r2             @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    vmlsl.u8    q4,d6,d22
    vmlal.u8    q0,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)@

    vdup.u32    d5,d4[1]                    @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vmlsl.u8    q4,d4,d24
    vld1.u32    {d5[1]},[r3],r2             @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vmlsl.u8    q1,d5,d27                   @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)@

    vdup.u32    d6,d5[1]                    @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    vmlal.u8    q4,d5,d25
    vld1.u32    {d6[1]},[r3],r2             @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlal.u8    q0,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)@

    vdup.u32    d7,d6[1]                    @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vmlal.u8    q4,d6,d26
    vld1.u32    {d7[1]},[r3],r2             @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8    q1,d7,d29                   @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)@

    vdup.u32    d4,d7[1]
    vadd.i16    q0,q0,q1                    @mul_res1 = vaddq_u16(mul_res1, mul_res2)@

    vmlsl.u8    q4,d7,d27
    vld1.u32    {d4[1]},[r3],r2
    vmlal.u8    q4,d4,d28
    vdup.u32    d5,d4[1]
    @vqrshrun.s16 d0,q0,#6          @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u32    {d5[1]},[r3]
    add         r3,r1,r6
    vst1.32     {d0},[r1]!                  @store 4 x 16-bit results for row 0

    vmlsl.u8    q4,d5,d29
    vst1.32     {d1},[r3],r6                @store 4 x 16-bit results for row 1
    @vqrshrun.s16 d8,q4,#6

    vst1.32     {d8},[r3],r6
    @add        r1,r1,#4
    vst1.32     {d9},[r3]
    bgt         inner_loop_wd_4_16out

end_inner_loop_wd_4_16out:
    subs        r7,r7,#4
    add         r1,r1,r9,lsl #1
    add         r0,r0,r8
    bgt         outer_loop_wd_4_16out
end2:
    vpop         {d8 - d15}
    ldmfd       sp!, {r4-r12, r15}          @reload the registers from sp