@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@******************************************************************************
@* @file
@*  ihevc_inter_pred_luma_horz_w16out.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*
@*  - ihevc_inter_pred_luma_horz_w16out()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*   interprediction luma filter for horizontal 16bit output
@*
@* @par description:
@*     applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
@*     to the elements pointed by 'pu1_src' and writes to the location pointed
@*     by 'pu1_dst'. no downshifting or clipping is done and the output is used
@*     as an input for vertical filtering or weighted prediction. assumptions:
@*     the function is optimized considering the fact width is a multiple of 4
@*     or 8. if width is a multiple of 4 then height should be a multiple of 2;
@*     width 8 is optimized further.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_horz_w16out(uword8 *pu1_src,
@                                word16 *pi2_dst,
@                                word32 src_strd,
@                                word32 dst_strd,
@                                word8 *pi1_coeff,
@                                word32 ht,
@                                word32 wd)

@ register allocation used throughout this function:
@r0 - free
@r1 - dst_ptr
@r2 - src_strd
@r3 - dst_strd
@r4 - src_ptr2
@r5 - inner loop counter
@r6 - dst_ptr2
@r7 - free
@r8 - dst_strd2
@r9 - src_strd1
@r10 - wd
@r11 - #1
@r12 - src_ptr1
@r14 - loop_counter
.text
.align 4
.syntax unified




.globl ihevc_inter_pred_luma_horz_w16out_a9q

.type ihevc_inter_pred_luma_horz_w16out_a9q, %function
ihevc_inter_pred_luma_horz_w16out_a9q:

    @ entry: load the 8-tap filter, replicate |coeff[i]| into d24..d31,
    @ then dispatch on (wd, ht&1) to the specialised loops below.
    bic         r14, #1                     @clear bit[0] of lr so the return address stays in ARM state
    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    ldr         r4,[sp,#40]                 @loads pi1_coeff
    ldr         r7,[sp,#44]                 @loads ht


    vld1.8      {d0},[r4]                   @coeff = vld1_s8(pi1_coeff)
    sub         r14,r7,#0                   @checks for ht == 0
    vabs.s8     d2,d0                       @vabs_s8(coeff) - signs are re-applied via vmlal/vmlsl below
    mov         r11,#1                      @r11 = 1, post-increment used by the byte-wise vector loads
    @ble       end_loops
    ldr         r10,[sp,#48]                @loads wd
    vdup.8      d24,d2[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub         r12,r0,#3                   @pu1_src - 3 (filter is centred, 3 taps to the left)
    vdup.8      d25,d2[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add         r4,r12,r2                   @pu1_src_tmp2_8 = pu1_src + src_strd
    vdup.8      d26,d2[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
    rsb         r9,r10,r2,lsl #1            @2*src_strd - wd
    vdup.8      d27,d2[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
    rsb         r8,r10,r3                   @dst_strd - wd
    vdup.8      d28,d2[4]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)

    vdup.8      d29,d2[5]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)
    and         r7,r14,#1                   @calculating ht_residue ht_residue = (ht & 1)
    vdup.8      d30,d2[6]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)
    sub         r14,r14,r7                  @decrement height by ht_residue(residue row is handled separately)
    vdup.8      d31,d2[7]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)

    cmp         r7,#1
    beq         odd_height_decision

even_height_decision:
    @ even height: pick the widest loop the width allows.
    mov         r7,r1                       @remember dst base for the residual passes
    cmp         r10,#4
    ble         outer_loop_4

    cmp         r10,#24                     @wd == 24 -> 16-wide loop + 8-wide residual
    moveq       r10,#16
    addeq       r8,#8
    addeq       r9,#8

    cmp         r10,#16
    bge         outer_loop_16_branch

    cmp         r10,#12                     @wd == 12 -> 8-wide loop + 4-wide residual
    addeq       r8,#4
    addeq       r9,#4
outer_loop_8_branch:
    b           outer_loop_8

outer_loop_16_branch:
    b           outer_loop_16


odd_height_decision:
    @ odd height with composite widths: run the narrower loop first so the
    @ final single row is handled by height_residue_4.
    cmp         r10,#24
    beq         outer_loop_8_branch
    cmp         r10,#12
    beq         outer_loop_4
    b           even_height_decision
183
outer_loop4_residual:
    @ entered after the 16-wide loop when wd == 12: redo the rightmost
    @ 4 columns (offset +8) with the 4-wide loop.
    sub         r12,r0,#3                   @pu1_src - 3
    mov         r1,r7
    add         r1,#16                      @dst advances 8 pixels * 2 bytes
    mov         r10,#4
    add         r12,#8
    mov         r14,#16
    add         r8,#4
    add         r9,#4

outer_loop_4:
    @ processes two rows per iteration, 4 output samples per inner iteration.
    add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
    add         r4,r12,r2                   @pu1_src + src_strd

    subs        r5,r10,#0                   @checks wd
    ble         end_inner_loop_4

inner_loop_4:
    @ eight single-byte-stepped loads give the 8 shifted source vectors
    @ (replacing the vext extracts kept below for reference).
    vld1.u32    {d0},[r12],r11              @vector load pu1_src
    vld1.u32    {d1},[r12],r11
    vld1.u32    {d2},[r12],r11
    vld1.u32    {d3},[r12],r11
    vld1.u32    {d4},[r12],r11
    vld1.u32    {d5},[r12],r11
    vld1.u32    {d6},[r12],r11
    vld1.u32    {d7},[r12],r11
    @add       r12,r12,#4                      @increment the input pointer
    sub         r12,r12,#4                  @net advance of +4 source pixels
    @vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
    @vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
    @vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]

    @vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
    @vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
    @vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
    @vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
    vld1.u32    {d12},[r4],r11              @vector load pu1_src + src_strd
    vld1.u32    {d13},[r4],r11
    vzip.32     d0,d12                      @vector zip the i iteration and ii iteration in a single register
    vld1.u32    {d14},[r4],r11
    vzip.32     d1,d13
    vld1.u32    {d15},[r4],r11
    vzip.32     d2,d14
    vld1.u32    {d16},[r4],r11
    vzip.32     d3,d15
    vld1.u32    {d17},[r4],r11
    vzip.32     d4,d16
    vld1.u32    {d18},[r4],r11
    vzip.32     d5,d17
    vld1.u32    {d19},[r4],r11
    sub         r4,r4,#4                    @net advance of +4 source pixels on row ii
    @ add       r4,r4,#4                        @increment the input pointer
    @ vext.u8   d14,d12,d13,#2                  @vector extract of src[0_2]
    @ vext.u8   d15,d12,d13,#3                  @vector extract of src[0_3]
    @ vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
    @ vext.u8   d17,d12,d13,#5                  @vector extract of src[0_5]
    @ vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
    @ vext.u8   d19,d12,d13,#7                  @vector extract of src[0_7]
    @vext.u8   d13,d12,d13,#1                  @vector extract of src[0_1]

    vzip.32     d6,d18
    vzip.32     d7,d19

    @ 8-tap MAC: both rows are filtered at once thanks to the zip above.
    vmull.u8    q4,d1,d25                   @arithmetic operations for both iterations at the same time
    vmlsl.u8    q4,d0,d24
    vmlsl.u8    q4,d2,d26
    vmlal.u8    q4,d3,d27
    vmlal.u8    q4,d4,d28
    vmlsl.u8    q4,d5,d29
    vmlal.u8    q4,d6,d30
    vmlsl.u8    q4,d7,d31

    @ vqrshrun.s16 d8,q4,#6                     @narrow right shift and saturating the result
    vst1.64     {d8},[r1]!                  @store the i iteration result which is in the lower half of q4
    vst1.64     {d9},[r6]!                  @store the ii iteration result which is in the upper half of q4
    subs        r5,r5,#4                    @decrement the wd by 4
    bgt         inner_loop_4

end_inner_loop_4:
    subs        r14,r14,#2                  @decrement the ht by 2 (two rows per outer iteration)
    add         r12,r12,r9                  @increment the input pointer 2*src_strd-wd
    add         r1,r6,r8,lsl #1             @increment the output pointer 2*dst_strd-wd
    bgt         outer_loop_4
274
height_residue_4:
    @ handles the final single row when ht is odd, 4 samples per iteration.

    ldr         r7,[sp,#44]                 @loads ht
    and         r7,r7,#1                    @calculating ht_residue ht_residue = (ht & 1)
    cmp         r7,#0
    @beq        end_loops
    ldmfdeq     sp!,{r4-r12,r15}            @no residue row: restore registers and return (pops pc)

outer_loop_height_residue_4:


    subs        r5,r10,#0                   @checks wd
    ble         end_inner_loop_height_residue_4

inner_loop_height_residue_4:
    @ same single-byte-stepped load scheme as inner_loop_4, one row only.
    vld1.u32    {d0},[r12],r11              @vector load pu1_src
    vld1.u32    {d1},[r12],r11

    @ vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
    @ vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
    @ vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]

    @add        r12,r12,#4                      @increment the input pointer
    @ vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
    @ vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
    @ vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
    @ vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
    vld1.u32    {d2},[r12],r11
    vmull.u8    q4,d1,d25                   @mul_res  = src[0_1] * coeffabs_1
    vld1.u32    {d3},[r12],r11
    vmlsl.u8    q4,d0,d24                   @mul_res -= src[0_0] * coeffabs_0
    vld1.u32    {d4},[r12],r11
    vmlsl.u8    q4,d2,d26                   @mul_res -= src[0_2] * coeffabs_2
    vld1.u32    {d5},[r12],r11
    vmlal.u8    q4,d3,d27                   @mul_res += src[0_3] * coeffabs_3
    vld1.u32    {d6},[r12],r11
    vmlal.u8    q4,d4,d28                   @mul_res += src[0_4] * coeffabs_4
    vld1.u32    {d7},[r12],r11
    vmlsl.u8    q4,d5,d29                   @mul_res -= src[0_5] * coeffabs_5
    sub         r12,r12,#4                  @net advance of +4 source pixels
    vmlal.u8    q4,d6,d30                   @mul_res += src[0_6] * coeffabs_6
    vmlsl.u8    q4,d7,d31                   @mul_res -= src[0_7] * coeffabs_7
    subs        r5,r5,#4                    @decrement the wd by 4
    vst1.64     {d8},[r1]!                  @store 4 filtered 16-bit samples
    bgt         inner_loop_height_residue_4

end_inner_loop_height_residue_4:
    subs        r7,r7,#1                    @decrement the ht by 1 (single residue row)
    rsb         r9,r10,r2
    add         r12,r12,r9                  @increment the input pointer src_strd-wd
    add         r1,r1,r8                    @increment the output pointer dst_strd-wd
    bgt         outer_loop_height_residue_4

    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp and return (pops pc)
336
outer_loop8_residual:
    @ entered after the 16-wide loop when wd == 24: redo the rightmost
    @ 8 columns (offset +16) with the 8-wide loop.
    sub         r12,r0,#3                   @pu1_src - 3
    mov         r1,r7
    mov         r14,#32
    add         r1,#32                      @dst advances 16 pixels * 2 bytes
    add         r12,#16
    mov         r10,#8
    add         r8,#8
    add         r9,#8
346
outer_loop_8:
    @ processes two rows per iteration, 8 output samples per inner iteration;
    @ row i accumulates in q4, row ii in q5, loads interleaved with the MACs.

    add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
    add         r4,r12,r2                   @pu1_src + src_strd
    subs        r5,r10,#0                   @checks wd

    ble         end_inner_loop_8

inner_loop_8:
    vld1.u32    {d0},[r12],r11              @vector load pu1_src
    vld1.u32    {d1},[r12],r11
    vld1.u32    {d2},[r12],r11
    vld1.u32    {d3},[r12],r11




    @ shifted source vectors come from the byte-stepped loads above/below
    @ (the equivalent vext sequence is kept for reference):
    @ vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
    @ vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
    @ vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
    @ vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
    @ vext.u8   d6,d0,d1,#6                     @vector extract of src [0_6]
    @ vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
    @ vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
    @ vext.u8   d14,d12,d13,#2
    @vext.u8    d15,d12,d13,#3                  @vector extract of src[0_3]
    @ vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
    @ vext.u8   d17,d12,d13,#5                  @vector extract of src[0_5]
    @vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
    @vext.u8    d19,d12,d13,#7                  @vector extract of src[0_7]
    @vext.u8   d13,d12,d13,#1                  @vector extract of src[0_1]
    vld1.u32    {d4},[r12],r11
    vmull.u8    q4,d1,d25                   @mul_res = vmull_u8(src[0_1], coeffabs_1)@
    vld1.u32    {d5},[r12],r11
    vmlal.u8    q4,d3,d27                   @mul_res = vmlal_u8(src[0_3], coeffabs_3)@
    vld1.u32    {d6},[r12],r11
    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    vld1.u32    {d7},[r12],r11
    vmlsl.u8    q4,d2,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    vld1.u32    {d12},[r4],r11              @vector load pu1_src + src_strd
    vmlal.u8    q4,d4,d28                   @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
    vld1.u32    {d13},[r4],r11
    vmlsl.u8    q4,d5,d29                   @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
    vld1.u32    {d14},[r4],r11
    vmlal.u8    q4,d6,d30                   @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
    vld1.u32    {d15},[r4],r11
    vmlsl.u8    q4,d7,d31                   @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
    vld1.u32    {d16},[r4],r11              @vector load pu1_src + src_strd

    vmull.u8    q5,d15,d27                  @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    vld1.u32    {d17},[r4],r11
    vmlsl.u8    q5,d14,d26                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    vld1.u32    {d18},[r4],r11
    vmlal.u8    q5,d16,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
    vld1.u32    {d19},[r4],r11              @vector load pu1_src + src_strd
    vmlsl.u8    q5,d17,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
    @ vqrshrun.s16  d20,q4,#6                       @right shift and saturating narrow result 1
    vmlal.u8    q5,d18,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
    vmlsl.u8    q5,d19,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
    vst1.16     {q4},[r1]!                  @store the result pu1_dst (row i, 8 x 16-bit)
    vmlsl.u8    q5,d12,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    vmlal.u8    q5,d13,d25                  @mul_res = vmlal_u8(src[0_1], coeffabs_1)@



    @ vqrshrun.s16 d8,q5,#6                     @right shift and saturating narrow result 2
    subs        r5,r5,#8                    @decrement the wd loop
    vst1.16     {q5},[r6]!                  @store the result pu1_dst (row ii, 8 x 16-bit)
    cmp         r5,#4
    bgt         inner_loop_8

end_inner_loop_8:
    subs        r14,r14,#2                  @decrement the ht loop (two rows per outer iteration)
    add         r12,r12,r9                  @increment the src pointer by 2*src_strd-wd
    add         r1,r6,r8,lsl #1             @increment the dst pointer by 2*dst_strd-wd
    bgt         outer_loop_8




    @ 8-wide loop done: run the 4-wide residual for wd == 12, then the
    @ single-row residual for odd ht, else return.
    ldr         r10,[sp,#48]                @loads wd
    cmp         r10,#12

    beq         outer_loop4_residual

    ldr         r7,[sp,#44]                 @loads ht
    and         r7,r7,#1
    cmp         r7,#1
    beq         height_residue_4

@end_loops

    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp and return (pops pc)



447
outer_loop_16:
    @ processes two rows per iteration, 16 output samples per inner iteration.
    @ software-pipelined: the loads/MACs for the next pair of q-accumulators
    @ are issued while the previous pair is stored. r0/r7 are spilled because
    @ both are needed as scratch here.
    str         r0, [sp, #-4]!
    str         r7, [sp, #-4]!
    add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
    add         r4,r12,r2                   @pu1_src + src_strd
    and         r0, r12, #31
    sub         r5,r10,#0                   @checks wd
    @ble       end_loops1
    pld         [r12, r2, lsl #1]           @prefetch two strides ahead
    vld1.u32    {q0},[r12],r11              @vector load pu1_src
    pld         [r4, r2, lsl #1]
    vld1.u32    {q1},[r12],r11
    vld1.u32    {q2},[r12],r11
    vld1.u32    {q3},[r12],r11
    vld1.u32    {q6},[r12],r11
    vmull.u8    q4,d2,d25                   @mul_res = vmull_u8(src[0_1], coeffabs_1)@
    vld1.u32    {q7},[r12],r11
    vmlal.u8    q4,d6,d27                   @mul_res = vmlal_u8(src[0_3], coeffabs_3)@
    vld1.u32    {q8},[r12],r11
    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    vld1.u32    {q9},[r12],r11
    vmlsl.u8    q4,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    vmlal.u8    q4,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
    vmlsl.u8    q4,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
    vmlal.u8    q4,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
    vmlsl.u8    q4,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@


inner_loop_16:

    @ NOTE: the addeq/subeq below are conditional on the Z flag set by this
    @ subs (r5 == 0 <=> current row pair finished); no intervening
    @ instruction before them modifies the flags.
    subs        r5,r5,#16
    vmull.u8    q10,d3,d25                  @row i, upper 8 samples

    add         r12,#8
    vmlsl.u8    q10,d1,d24

    vld1.u32    {q0},[r4],r11               @vector load pu1_src (row ii)
    vmlal.u8    q10,d7,d27

    vld1.u32    {q1},[r4],r11
    vmlsl.u8    q10,d5,d26

    vld1.u32    {q2},[r4],r11
    vmlal.u8    q10,d13,d28

    vld1.u32    {q3},[r4],r11
    vmlal.u8    q10,d17,d30

    vld1.u32    {q6},[r4],r11
    vmlsl.u8    q10,d15,d29

    vld1.u32    {q7},[r4],r11
    vmlsl.u8    q10,d19,d31

    vld1.u32    {q8},[r4],r11
    vmull.u8    q5,d2,d25                   @row ii, lower 8: mul_res = vmull_u8(src[0_1], coeffabs_1)@

    vld1.u32    {q9},[r4],r11
    vmlal.u8    q5,d6,d27                   @mul_res = vmlal_u8(src[0_3], coeffabs_3)@

    add         r4,#8
    vmlsl.u8    q5,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    pld         [r12, r2, lsl #2]
    pld         [r4, r2, lsl #2]
    vst1.8      {q4},[r1]!                  @store the result pu1_dst (row i, lower 8)
    vmlsl.u8    q5,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@

    addeq       r12,r12,r9                  @row pair done: src pointer += 2*src_strd-wd
    vmlal.u8    q5,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@

    addeq       r4,r12,r2                   @row pair done: pu1_src + src_strd
    vmlsl.u8    q5,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@

@   and         r7, r12, #31
    vmlal.u8    q5,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@

    subeq       r14,r14,#2                  @row pair done: ht -= 2
    vmlsl.u8    q5,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@

    @cmp            r7, r0
    vmull.u8    q11,d3,d25                  @row ii, upper 8 samples

@   pld     [r12, r2, lsl #2]
    vmlsl.u8    q11,d1,d24

    vst1.16     {q10},[r1]!                 @store the result pu1_dst (row i, upper 8)
    vmlal.u8    q11,d7,d27

@   pld     [r4, r2, lsl #2]
    vmlsl.u8    q11,d5,d26

@   mov         r0, r7
    vmlal.u8    q11,d13,d28

    cmp         r14,#0
    vmlal.u8    q11,d17,d30

    vst1.16     {q5},[r6]!                  @store the result pu1_dst (row ii, lower 8)
    vmlsl.u8    q11,d15,d29

    vmlsl.u8    q11,d19,d31

    beq         epilog_16

    @ prologue for the next iteration: reload row i and start its first
    @ 8-sample accumulator (q4) while q11 is still pending.
    vld1.u32    {q0},[r12],r11              @vector load pu1_src
    vld1.u32    {q1},[r12],r11
    vld1.u32    {q2},[r12],r11
    vld1.u32    {q3},[r12],r11
    vld1.u32    {q6},[r12],r11
    vmull.u8    q4,d2,d25                   @mul_res = vmull_u8(src[0_1], coeffabs_1)@
    vld1.u32    {q7},[r12],r11
    vmlal.u8    q4,d6,d27                   @mul_res = vmlal_u8(src[0_3], coeffabs_3)@
    vld1.u32    {q8},[r12],r11
    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    vld1.u32    {q9},[r12],r11
    vmlsl.u8    q4,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    vmlal.u8    q4,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
    cmp         r5,#0
    vmlsl.u8    q4,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
    moveq       r5,r10                      @row pair done: reset the wd counter
    vmlal.u8    q4,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
    vst1.8      {q11},[r6]!                 @store the result pu1_dst (row ii, upper 8)
    vmlsl.u8    q4,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
    addeq       r1,r6,r8,lsl #1             @row pair done: advance dst pointers
    addeq       r6,r1,r3,lsl #1             @pu1_dst + dst_strd
    b           inner_loop_16
575
576
epilog_16:
    @ drain the last pending store, restore the spilled r7/r0, then run the
    @ residual passes (8-wide for wd == 24, single row for odd ht) or return.
@   vqrshrun.s16 d11,q11,#6
    vst1.8      {q11},[r6]!                 @store the result pu1_dst

    ldr         r7, [sp], #4                @pop in reverse push order (r7 was pushed last)
    ldr         r0, [sp], #4
    ldr         r10,[sp,#48]                @loads wd
    cmp         r10,#24
    beq         outer_loop8_residual
    add         r1,r6,r8,lsl #1
    ldr         r7,[sp,#44]                 @loads ht
    and         r7,r7,#1
    cmp         r7,#1
    beq         height_residue_4

end_loops1:

    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp and return (pops pc)



