1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@*******************************************************************************
20@* @file
21@*  ihevc_inter_pred_chroma_horz_neon.s
22@*
23@* @brief
24@*  contains function definitions for inter prediction  interpolation.
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
28@*
29@* @author
30@*  yogeswaran rs / akshaya mukund
31@*
32@* @par list of functions:
33@*
34@*
35@* @remarks
36@*  none
37@*
38@*******************************************************************************
39@*/
40@/**
41@*******************************************************************************
42@*
43@* @brief
@*       chroma inter prediction filter to store horizontal 16-bit output
45@*
46@* @par description:
47@*    applies a horizontal filter with coefficients pointed to  by 'pi1_coeff'
48@*    to the elements pointed by 'pu1_src' and  writes to the location pointed
49@*    by 'pu1_dst'  no downshifting or clipping is done and the output is  used
50@*    as an input for vertical filtering or weighted  prediction
51@*
52@* @param[in] pu1_src
53@*  uword8 pointer to the source
54@*
55@* @param[out] pi2_dst
56@*  word16 pointer to the destination
57@*
58@* @param[in] src_strd
59@*  integer source stride
60@*
61@* @param[in] dst_strd
62@*  integer destination stride
63@*
64@* @param[in] pi1_coeff
65@*  word8 pointer to the filter coefficients
66@*
67@* @param[in] ht
68@*  integer height of the array
69@*
70@* @param[in] wd
71@*  integer width of the array
72@*
73@* @returns
74@*
75@* @remarks
76@*  none
77@*
78@*******************************************************************************
79@*/
80@void ihevc_inter_pred_chroma_horz_w16out(uword8 *pu1_src,
81@                                          word16 *pi2_dst,
82@                                          word32 src_strd,
83@                                          word32 dst_strd,
84@                                          word8 *pi1_coeff,
85@                                          word32 ht,
86@                                          word32 wd)
87@**************variables vs registers*****************************************
88@r0 => *pu1_src
89@r1 => *pi2_dst
90@r2 =>  src_strd
91@r3 =>  dst_strd
92
93
.text
.align 4




.globl ihevc_inter_pred_chroma_horz_w16out_a9q


.type ihevc_inter_pred_chroma_horz_w16out_a9q, %function

@-----------------------------------------------------------------------
@ void ihevc_inter_pred_chroma_horz_w16out(uword8 *pu1_src, word16 *pi2_dst,
@                                          word32 src_strd, word32 dst_strd,
@                                          word8 *pi1_coeff, word32 ht, word32 wd)
@ 4-tap horizontal chroma interpolation; writes raw 16-bit accumulator
@ values (no downshift, no clip) for later vertical filtering / weighted
@ prediction.
@ Register args: r0 = pu1_src, r1 = pi2_dst, r2 = src_strd, r3 = dst_strd.
@ Stack args (after the 10-register push = 40 bytes):
@   [sp,#40] = pi1_coeff, [sp,#44] = ht, [sp,#48] = wd.
@ Coefficient magnitudes are splatted into d24..d27 (taps 0..3); the sign
@ pattern is hard-coded by the choice of vmlsl (taps 0 and 3 subtracted)
@ vs vmull/vmlal (taps 1 and 2 added), i.e. for each output position x:
@   acc = -c0*p[-1] + c1*p[0] + c2*p[1] - c3*p[2]
@ where p[i] are interleaved-UV chroma pels (2 bytes apart) around x.
@-----------------------------------------------------------------------
ihevc_inter_pred_chroma_horz_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}          @push callee-saved regs + lr (10 regs = 40 bytes)

    ldr         r4,[sp,#40]                 @loads pi1_coeff
    ldr         r6,[sp,#44]                 @loads ht
    ldr         r10,[sp,#48]                @loads wd

    vld1.8      {d0},[r4]                   @coeff = vld1_s8(pi1_coeff)
    subs        r14,r6,#0                   @r14 = ht; flags feed the ht <= 0 early-out below
    vabs.s8     d2,d0                       @coeffabs = vabs_s8(coeff); signs re-applied via vmlsl/vmlal

@******* added
    mov         r11, #2                     @per-load source step: one interleaved chroma pel = 2 bytes
@******* added ends

    ble         end_loops                   @nothing to do when ht <= 0

    vdup.8      d24,d2[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub         r12,r0,#2                   @r12 = pu1_src - 2 (one chroma pel left of the block)
    vdup.8      d25,d2[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add         r4,r12,r2                   @pu1_src_tmp2_8 = pu1_src + src_strd (second-row pointer)
    vdup.8      d26,d2[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)

    tst         r10,#3                      @checks wd for multiples of 4
    mov         r5,r10,lsl #1               @r5 = 2*wd = row width in bytes (UV interleaved)

    vdup.8      d27,d2[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)

    and         r7,r14,#1                   @added              @ht_residue = ht & 1 (odd last row done separately)
    sub         r14,r14,r7                  @added              @make the main-loop height even

    bne         outer_loop_4                @ this branching happens when the width is 2 or 6

    cmp         r10,#12
    beq         skip_16                     @wd == 12 is served by the narrower paths

    cmp         r10,#8
    bge         outer_loop_16               @wd >= 8 (and != 12): 16-byte-wide pipelined path

skip_16:
    tst         r6,#3

@******* removal
    @mov       r11,#8
@******* removal ends

    sub         r9,r0,#2                    @r9 = pu1_src - 2 (row base for the ht%4 == 0 path)
    beq         outer_loop_ht_4             @this branching happens when the height is a multiple of 4



@    cmp        r10,#12
@    beq    outer_loop_8
@    cmp        r10,#16
@    bge    outer_loop_16
    b           outer_loop_8
164
@ 16-outputs-per-row path, software pipelined over two rows at a time.
@ Per 16-wide group the four staggered loads sit at byte offsets
@ -2/0/+2/+4 (p[-1], p[0], p[1], p[2]); last load steps r9 = 10 so the
@ pointer nets +16 bytes per group.
outer_loop_16:
    add         r4,r12,r2                   @r4 = second-row source pointer


    and         r0, r12, #31                @r0 is scratch from here on (pu1_src no longer needed)
    pld         [r12, r2, lsl #1]           @prefetch two rows ahead







    vld1.u32    {q0},[r12],r11              @vector load pu1_src (row 1, p[-1])
    mov         r10,r5                      @2wd
    mul         r14,r14,r10                 @r14 = ht * 2wd: total output count for this path
    vld1.u32    {q1},[r12],r11              @vector load pu1_src (row 1, p[0])
    pld         [r4, r2, lsl #1]
    mov         r9,#10                      @final-load step: 2+2+2+10 = 16 bytes per group
    vld1.u32    {q2},[r12],r11              @vector load pu1_src (row 1, p[1])
    rsb         r6,r3,#8                    @r6 (doubled below) = 2*(8 - dst_strd): back to row 1, next 16 columns
    sub         r8,r3,#8                    @r8 (doubled below) = 2*(dst_strd - 8): down to row 2, same columns
    vld1.u32    {q3},[r12],r9               @vector load pu1_src (row 1, p[2])


    vmull.u8    q15,d2,d25                  @acc  = p[0]  * c1   (row 1, low 8 outputs)
    vld1.u32    {q4},[r4],r11               @vector load pu1_src (row 2, p[-1])
    vmlsl.u8    q15,d0,d24                  @acc -= p[-1] * c0
    vld1.u32    {q5},[r4],r11               @vector load pu1_src (row 2, p[0])
    vmlal.u8    q15,d4,d26                  @acc += p[1]  * c2
    vld1.u32    {q6},[r4],r11               @vector load pu1_src (row 2, p[1])
    vmlsl.u8    q15,d6,d27                  @acc -= p[2]  * c3
    vld1.u32    {q7},[r4],r9                @vector load pu1_src (row 2, p[2])
    vmull.u8    q14,d3,d25                  @same filter, row 1, high 8 outputs
    lsl         r6,#1                       @halfword outputs: convert r6 to bytes
    rsb         r3,r5,r3,lsl #1             @r3 = 2*dst_strd - 2wd (dst advance per row pair, in halfwords)
    vmlsl.u8    q14,d1,d24
    lsl         r8,#1                       @convert r8 to bytes as well
    rsb         r7,r5,r2,lsl #1             @r7 = 2*src_strd - 2wd (src advance per row pair)
    vmlal.u8    q14,d5,d26

    vmlsl.u8    q14,d7,d27
    cmp         r14,#32                     @only one row pair of 16? skip straight to the drain
    beq         epilog_end
    sub         r14,#64                     @reserve the pipelined prolog + epilog iterations

inner_loop_16:

    @ and           r7, r12, #31                    @decrement the wd loop
    @ cmp           r7, r0
    pld         [r12, r2, lsl #2]
    pld         [r4, r2, lsl #2]


    subs        r10,r10,#16                 @wd counter; Z flag drives the row-advance addeq/moveq below

    vmull.u8    q11,d10,d25                 @acc  = p[0] * c1   (row 2, low 8)



@    addeq      r12,r12,r2,lsl #1
@    subeq      r12,r12,r5
    addeq       r12,r12,r7                  @row pair done: src += 2*src_strd - 2wd
    addeq       r4,r12,r2                   @and re-derive the second-row pointer


    vst1.16     {q15}, [r1]!                @store row 1, first 8 results
    vmlsl.u8    q11,d8,d24                  @acc -= p[-1] * c0




    vld1.u32    {q0},[r12],r11              @vector load pu1_src (next row 1, p[-1])
    vmlal.u8    q11,d12,d26                 @acc += p[1] * c2




    vld1.u32    {q1},[r12],r11              @vector load pu1_src (next row 1, p[0])
    vmlsl.u8    q11,d14,d27                 @acc -= p[2] * c3


    vld1.u32    {q2},[r12],r11              @vector load pu1_src (next row 1, p[1])
    vmull.u8    q10,d11,d25                 @acc  = p[0] * c1   (row 2, high 8)

    vst1.16     {q14}, [r1],r8              @store row 1, next 8; then step down to row 2
    vmlsl.u8    q10,d9,d24                  @acc -= p[-1] * c0

    vld1.u32    {q3},[r12],r9               @vector load pu1_src (next row 1, p[2])
    vmlal.u8    q10,d13,d26                 @acc += p[1] * c2

    vld1.u32    {q4},[r4],r11               @vector load pu1_src (next row 2, p[-1])
    vmlsl.u8    q10,d15,d27                 @acc -= p[2] * c3


    vld1.u32    {q5},[r4],r11               @vector load pu1_src (next row 2, p[0])
    vmull.u8    q15,d2,d25                  @acc  = p[0] * c1   (next row 1, low 8)

    vld1.u32    {q6},[r4],r11               @vector load pu1_src (next row 2, p[1])
    vmlsl.u8    q15,d0,d24                  @acc -= p[-1] * c0

    vld1.u32    {q7},[r4],r9                @vector load pu1_src (next row 2, p[2])
    vmlal.u8    q15,d4,d26                  @acc += p[1] * c2

    vst1.16     {q11},[r1]!                 @store row 2, first 8 results
    vmlsl.u8    q15,d6,d27                  @acc -= p[2] * c3

    moveq       r10,r5                      @row pair done: reload wd counter (2wd)
    vmull.u8    q14,d3,d25                  @next row 1, high 8



    vmlsl.u8    q14,d1,d24
    vst1.16     {q10},[r1],r6               @store row 2, next 8; then back up to row 1


    addeq       r1,r1,r3,lsl #1             @row pair done: dst += 2*(2*dst_strd - 2wd) bytes
    vmlal.u8    q14,d5,d26

    subs        r14,r14,#32                 @32 outputs retired per iteration
    vmlsl.u8    q14,d7,d27



@    mov            r0, r7
    bgt         inner_loop_16



    add         r14,r14,#64                 @undo the prolog/epilog reservation
    cmp         r14,#32                     @exactly one group left? drain without reloading
    beq         epilog_end
298
@ Pipeline drain, first stage: q15/q14 (row-1 results) and q4..q7
@ (row-2 source) are still in flight from the loop above. Finish one
@ more group while loading the final group's source.
epilog:

    vst1.16     {q15}, [r1]!                @store row 1, first 8 results
    vmull.u8    q11,d10,d25                 @acc  = p[0] * c1   (row 2, low 8)
    vst1.16     {q14}, [r1],r8              @store row 1, next 8; step down to row 2



    vmlsl.u8    q11,d8,d24                  @acc -= p[-1] * c0
    subs        r10,r10,#16                 @decrement the wd loop
    vmlal.u8    q11,d12,d26                 @acc += p[1] * c2
@    addeq      r12,r12,r2,lsl #1
    addeq       r12,r12,r7                  @row pair done: src += 2*src_strd - 2wd
    vmlsl.u8    q11,d14,d27                 @acc -= p[2] * c3
    @ subeq     r12,r12,r5
    moveq       r10,r5                      @2wd
    addeq       r4,r12,r2
    vmull.u8    q10,d11,d25                 @acc  = p[0] * c1   (row 2, high 8)
    vld1.u32    {q0},[r12],r11              @vector load pu1_src (final row 1, p[-1])
    vmlsl.u8    q10,d9,d24                  @acc -= p[-1] * c0
    vld1.u32    {q1},[r12],r11              @vector load pu1_src (final row 1, p[0])
    vmlal.u8    q10,d13,d26                 @acc += p[1] * c2
    vld1.u32    {q2},[r12],r11              @vector load pu1_src (final row 1, p[1])
    vmlsl.u8    q10,d15,d27                 @acc -= p[2] * c3
    vld1.u32    {q3},[r12],r9               @vector load pu1_src (final row 1, p[2])
    vmull.u8    q15,d2,d25                  @acc  = p[0] * c1   (final row 1, low 8)


    vld1.u32    {q4},[r4],r11               @vector load pu1_src (final row 2, p[-1])
    vmlsl.u8    q15,d0,d24                  @acc -= p[-1] * c0
    vld1.u32    {q5},[r4],r11               @vector load pu1_src (final row 2, p[0])
    vmlal.u8    q15,d4,d26                  @acc += p[1] * c2

    vmlsl.u8    q15,d6,d27                  @acc -= p[2] * c3

    vld1.u32    {q6},[r4],r11               @vector load pu1_src (final row 2, p[1])
    vmull.u8    q14,d3,d25                  @final row 1, high 8
    vld1.u32    {q7},[r4],r9                @vector load pu1_src (final row 2, p[2])
    vmlsl.u8    q14,d1,d24
    vst1.16     {q11},[r1]!                 @store row 2, first 8 results
    vmlal.u8    q14,d5,d26
    vst1.16     {q10},[r1],r6               @store row 2, next 8; back up to row 1
    vmlsl.u8    q14,d7,d27
    addeq       r1,r1,r3,lsl #1             @row pair done: advance dst to the next row pair
344
@ Pipeline drain, final stage: compute and store the last group from the
@ registers already loaded, then decide whether an odd final row remains.
epilog_end:

    vmull.u8    q11,d10,d25                 @acc  = p[0] * c1   (last row 2, low 8)
    vmlsl.u8    q11,d8,d24                  @acc -= p[-1] * c0
    vmlal.u8    q11,d12,d26                 @acc += p[1] * c2
    vmlsl.u8    q11,d14,d27                 @acc -= p[2] * c3


    vmull.u8    q10,d11,d25                 @acc  = p[0] * c1   (last row 2, high 8)
    vmlsl.u8    q10,d9,d24                  @acc -= p[-1] * c0
    vmlal.u8    q10,d13,d26                 @acc += p[1] * c2
    vmlsl.u8    q10,d15,d27                 @acc -= p[2] * c3


    vst1.16     {q15}, [r1]!                @store last row 1, first 8
    vst1.16     {q14}, [r1],r8              @store last row 1, next 8; step to row 2
    vst1.16     {q11},[r1]!                 @store last row 2, first 8
    vst1.16     {q10},[r1],r6               @store last row 2, next 8


    ldr         r6,[sp,#44]                 @reload ht (r6 was reused as a store offset)

    and         r7,r6,#1                    @ht_residue = ht & 1

    cmp         r7,#0
    mov         r10,r5
    addne       r12,r12,r2,lsl #1           @odd row left: src to the start of the last row
    subne       r12,r12,r5
    addne       r1,r1,r3,lsl #1             @dst likewise (r3 = 2*dst_strd - 2wd here)


    bgt         loop_residue_4              @handle the single remaining row

    b           end_loops
379
380
381
382
@ 8-outputs-per-iteration path, two rows per outer pass (ht even,
@ ht % 4 != 0). Loads are staggered at -2/0/+2/+4 bytes (p[-1..2]);
@ four steps of r11 = 2 net +8 bytes per group.
outer_loop_8:

    add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd (second-row dst)
    mov         r10,r5                      @2wd
    add         r4,r12,r2                   @pu1_src + src_strd (second-row src)

inner_loop_8:
    @vld1.u32  {d0,d1},[r12],r11               @vector load pu1_src
    vld1.u32    {d0},[r12],r11              @row 1: p[-1]
    vld1.u32    {d1},[r12],r11              @row 1: p[0]
    vld1.u32    {d2},[r12],r11              @row 1: p[1]
    vld1.u32    {d3},[r12],r11              @row 1: p[2]


    @vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
    vmull.u8    q4,d1,d25                   @acc  = p[0]  * c1   (row 1)
    vmlsl.u8    q4,d0,d24                   @acc -= p[-1] * c0
    @vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
    @vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
    vmlal.u8    q4,d2,d26                   @acc += p[1]  * c2
    vmlsl.u8    q4,d3,d27                   @acc -= p[2]  * c3

    @vld1.u32  {d12,d13},[r4],r11              @vector load pu1_src + src_strd
    vld1.u32    {d4},[r4],r11               @row 2: p[-1]
    vld1.u32    {d5},[r4],r11               @row 2: p[0]
    vld1.u32    {d6},[r4],r11               @row 2: p[1]
    vld1.u32    {d7},[r4],r11               @row 2: p[2]
    @vext.u8   d14,d12,d13,#2                  @vector extract of src[0_2]
    vmull.u8    q5,d5,d25                   @acc  = p[0]  * c1   (row 2)
    vmlsl.u8    q5,d4,d24                   @acc -= p[-1] * c0
    @vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
    @vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
    vmlal.u8    q5,d6,d26                   @acc += p[1]  * c2
    vmlsl.u8    q5,d7,d27                   @acc -= p[2]  * c3

    vst1.16     {d8, d9}, [r1]!             @store 8 row-1 results (16-bit each)

    subs        r10,r10,#8                  @decrement the wd loop
    vst1.16     {d10, d11},[r6]!            @store 8 row-2 results
    bgt         inner_loop_8

    sub         r12,r12,r5                  @rewind src by the row width...
    subs        r14,r14,#2                  @decrement the ht loop (2 rows done)
    sub         r1,r1,r5,lsl #1             @...and dst by the row width (halfwords)
    add         r12,r12,r2,lsl #1           @then advance both by two rows
    add         r1,r1,r3,lsl #2
    bgt         outer_loop_8

    cmp         r7,#0                       @odd final row pending? (mov below keeps flags)
    mov         r10,r5
    bgt         loop_residue_4

    b           end_loops
436
437
438
@height if 4 comes
@ ht % 4 == 0 path: four rows are processed per pass, software pipelined
@ (prologue -> core_loop -> epilogue). r9 walks the source column base,
@ r0 = src_strd - 6 is the per-row correction after three +2 steps, so
@ each 4-load group nets one full row stride down.
outer_loop_ht_4:

    mov         r10,r5                      @wd counter = 2wd

prologue_ht_4:
    mov         r8,r3,lsl #1                @r8 = 2*dst_strd in halfwords -> byte step between dst rows

inner_loop_ht_4:

    mov         r12,r9                      @r12 = current source column base
    mov         r4,r1                       @r4 = current dst column base

    sub         r0, r2, #6                  @ not sure if r0 needs to be preserved

    vld1.u32    {d0},[r12],r11              @(1)row 1: p[-1]
    vld1.u32    {d1},[r12],r11              @(1)row 1: p[0]
    vld1.u32    {d2},[r12],r11              @(1)row 1: p[1]
    vld1.u32    {d3},[r12],r0               @(1)row 1: p[2]; step down to row 2

    vld1.u32    {d4},[r12],r11              @(2)row 2: p[-1]
    vld1.u32    {d5},[r12],r11              @(2)row 2: p[0]
    vld1.u32    {d6},[r12],r11              @(2)row 2: p[1]
    vld1.u32    {d7},[r12],r0               @(2)row 2: p[2]; step down to row 3

    vld1.u32    {d14},[r12],r11             @(3)row 3: p[-1]
    vmull.u8    q4,d1,d25                   @(1)acc  = p[0]  * c1

    vld1.u32    {d15},[r12],r11             @(3)row 3: p[0]
    vmlsl.u8    q4,d0,d24                   @(1)acc -= p[-1] * c0

    vld1.u32    {d16},[r12],r11             @(3)row 3: p[1]
    vmlal.u8    q4,d2,d26                   @(1)acc += p[1]  * c2

    vld1.u32    {d17},[r12],r0              @(3)row 3: p[2]; step down to row 4
    vmlsl.u8    q4,d3,d27                   @(1)acc -= p[2]  * c3

    vld1.u32    {d18},[r12],r11             @(4)row 4: p[-1]
    vmull.u8    q5,d5,d25                   @(2)acc  = p[0]  * c1

    vld1.u32    {d19},[r12],r11             @(4)row 4: p[0]
    vmlsl.u8    q5,d4,d24                   @(2)acc -= p[-1] * c0

    vld1.u32    {d20},[r12],r11             @(4)row 4: p[1]
    vmlal.u8    q5,d6,d26                   @(2)acc += p[1]  * c2

    vld1.u32    {d21},[r12],r2              @(4)row 4: p[2]
    vmlsl.u8    q5,d7,d27                   @(2)acc -= p[2]  * c3

    add         r9,r9,#8                    @(core loop) advance the column base by 8 bytes

    subs        r10,r10,#8                  @(prologue)decrement the wd loop
    beq         epilogue                    @width fits in one group: no steady-state loop

core_loop:
    vst1.16     {d8, d9},[r4],r8            @(1)store row-1 results; step to row 2 dst
    mov         r12,r9

    vld1.u32    {d0},[r12],r11              @(1_1)next column, row 1: p[-1]
    vmull.u8    q6,d15,d25                  @(3)acc  = p[0]  * c1

    vld1.u32    {d1},[r12],r11              @(1_1)row 1: p[0]
    vmlsl.u8    q6,d14,d24                  @(3)acc -= p[-1] * c0

    vld1.u32    {d2},[r12],r11              @(1_1)row 1: p[1]
    vmlal.u8    q6,d16,d26                  @(3)acc += p[1]  * c2

    vld1.u32    {d3},[r12],r0               @(1_1)row 1: p[2]; step down a row
    vmlsl.u8    q6,d17,d27                  @(3)acc -= p[2]  * c3

    vst1.16     {d10, d11},[r4],r8          @(2)store row-2 results; step to row 3 dst
    add         r9,r9,#8                    @(core loop)

    vld1.u32    {d4},[r12],r11              @(2_1)row 2: p[-1]
    vmull.u8    q11,d19,d25                 @(4)acc  = p[0]  * c1

    vld1.u32    {d5},[r12],r11              @(2_1)row 2: p[0]
    vmlsl.u8    q11,d18,d24                 @(4)acc -= p[-1] * c0

    vld1.u32    {d6},[r12],r11              @(2_1)row 2: p[1]
    vmlal.u8    q11,d20,d26                 @(4)acc += p[1]  * c2

    vld1.u32    {d7},[r12],r0               @(2_1)row 2: p[2]; step down a row
    vmlsl.u8    q11,d21,d27                 @(4)acc -= p[2]  * c3

    vst1.16     {d12, d13},[r4],r8          @(3)store row-3 results; step to row 4 dst
    add         r1,r1,#16                   @(core loop) advance dst column base (8 halfwords)

    vld1.u32    {d14},[r12],r11             @(3_1)row 3: p[-1]
    vmull.u8    q4,d1,d25                   @(1_1)acc  = p[0]  * c1

    vld1.u32    {d15},[r12],r11             @(3_1)row 3: p[0]
    vmlsl.u8    q4,d0,d24                   @(1_1)acc -= p[-1] * c0

    vld1.u32    {d16},[r12],r11             @(3_1)row 3: p[1]
    vmlal.u8    q4,d2,d26                   @(1_1)acc += p[1]  * c2

    vld1.u32    {d17},[r12],r0              @(3_1)row 3: p[2]; step down a row
    vmlsl.u8    q4,d3,d27                   @(1_1)acc -= p[2]  * c3

    vst1.16     {d22, d23}, [r4], r8        @(4)store row-4 results
    subs        r10,r10,#8                  @(core loop) wd counter

    vmull.u8    q5,d5,d25                   @(2_1)acc  = p[0]  * c1
    vld1.u32    {d18},[r12],r11             @(4_1)row 4: p[-1]

    vld1.u32    {d19},[r12],r11             @(4_1)row 4: p[0]
    vmlsl.u8    q5,d4,d24                   @(2_1)acc -= p[-1] * c0

    vld1.u32    {d20},[r12],r11             @(4_1)row 4: p[1]
    vmlal.u8    q5,d6,d26                   @(2_1)acc += p[1]  * c2

    mov         r4, r1                      @(core loop) reset dst to the new column base

    vld1.u32    {d21},[r12],r0              @(4_1)row 4: p[2]
    vmlsl.u8    q5,d7,d27                   @(2_1)acc -= p[2]  * c3



    bgt         core_loop                   @loopback

@ Drain the last column group: rows 3 and 4 still need computing, rows
@ 1..4 still need storing.
epilogue:
    vmull.u8    q6,d15,d25                  @(3)acc  = p[0]  * c1

    vmlsl.u8    q6,d14,d24                  @(3)acc -= p[-1] * c0

    vmlal.u8    q6,d16,d26                  @(3)acc += p[1]  * c2

    vmlsl.u8    q6,d17,d27                  @(3)acc -= p[2]  * c3

    vst1.16     {d8, d9},[r4], r8           @(1)store row-1 results

    vmull.u8    q11,d19,d25                 @(4)acc  = p[0]  * c1
    vmlsl.u8    q11,d18,d24                 @(4)acc -= p[-1] * c0

    vmlal.u8    q11,d20,d26                 @(4)acc += p[1]  * c2

    vmlsl.u8    q11,d21,d27                 @(4)acc -= p[2]  * c3

    vst1.16     {d10, d11},[r4], r8         @(2)store row-2 results

    vst1.16     {d12, d13},[r4], r8         @(3)store row-3 results

    add         r1,r1,#16                   @(core loop)

    vst1.16     {d22, d23},[r4], r8         @(4)store row-4 results

    sub         r9,r9,r5                    @rewind src column base by the row width...
    subs        r14,r14,#4                  @decrement the ht loop (4 rows done)
    sub         r1,r1,r5,lsl #1             @...and dst likewise
    add         r9,r9,r2,lsl #2             @then advance both by four rows
    add         r1,r1,r3,lsl #3
    bgt         outer_loop_ht_4

    cmp         r7,#0                       @odd final row pending?
    mov         r10,r5
    movgt       r12,r9
    movgt       r4,r1
    bgt         loop_residue_4

    b           end_loops
600
@ 4-outputs-per-iteration path (wd = 2 or 6), two rows per outer pass.
@ The two rows' 4-byte groups are interleaved with vzip.32 so a single
@ q-register multiply computes both rows at once.
outer_loop_4:
    add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd (second-row dst)
    mov         r10,r5                      @wd counter = 2wd
    add         r4,r12,r2                   @pu1_src + src_strd (second-row src)

inner_loop_4:
    @vld1.u32  {d0,d1},[r12]                   @vector load pu1_src
    vld1.u32    {d0},[r12],r11              @row 1: p[-1]
    vld1.u32    {d1},[r12],r11              @row 1: p[0]
    vld1.u32    {d2},[r12],r11              @row 1: p[1]
    vld1.u32    {d3},[r12]                  @row 1: p[2] (no post-increment)

@**** removal
    @add       r12,r12,#4                      @increment the input pointer
@**** removal ends
@**** addn
    sub         r12,r12,#2                  @net advance +4 bytes: loads moved +6, back up 2
@**** addn ends
    vld1.u32    {d4},[r4],r11               @row 2: p[-1]
    vld1.u32    {d5},[r4],r11               @row 2: p[0]
    vld1.u32    {d6},[r4],r11               @row 2: p[1]
    vld1.u32    {d7},[r4]                   @row 2: p[2] (no post-increment)
    @vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
    @vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
    @vld1.u32  {d12,d13},[r4]                  @vector load pu1_src + src_strd
    @vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]

    @add       r4,r4,#4                        @increment the input pointer
    sub         r4,r4,#2                    @same +4 net advance for row 2
    @vext.u8   d14,d12,d13,#2                  @vector extract of src[0_2]
    @vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
    @vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]

@**** removal
    @vzip.32   d0,d12                          @vector zip the i iteration and ii interation in single register
    @vzip.32   d2,d14
    @vzip.32   d4,d16
    @vzip.32   d6,d18
@**** removal ends
@**** addn
    vzip.32     d0,d4                       @zip row 1 and row 2 words into one register per tap
    vzip.32     d1,d5
    vzip.32     d2,d6
    vzip.32     d3,d7
@**** addn ends

    vmull.u8    q4,d1,d25                   @both rows at once: acc = p[0]*c1 ...
    vmlsl.u8    q4,d0,d24                   @... - p[-1]*c0
    vmlal.u8    q4,d2,d26                   @... + p[1]*c2
    vmlsl.u8    q4,d3,d27                   @... - p[2]*c3

    vst1.32     {d8},[r1]!                  @store row-1 results (low half of q4)
    subs        r10,r10,#4                  @decrement the wd by 4

    vst1.32     {d9},[r6]!                  @store row-2 results (high half of q4)

    bgt         inner_loop_4

    sub         r12,r12,r5                  @rewind by the row width...
    subs        r14,r14,#2                  @decrement the ht by 2
    sub         r1,r1,r5,lsl #1
    add         r12,r12,r2,lsl #1           @...then advance src/dst by two rows
    add         r1,r1,r3,lsl #2
    bgt         outer_loop_4

    cmp         r7,#0                       @odd final row pending?
    mov         r10,r5
    beq         end_loops
669
@ Residue path: filter the single remaining row (ht odd), 4 outputs per
@ iteration. r12/r1 have been positioned by whichever main path ran.
loop_residue_4:

    mov         r10,r5                      @2wd

loop_residue:

    @vld1.u32  {d0,d1},[r12]                   @vector load pu1_src
    vld1.u32    {d0},[r12],r11              @p[-1]
    vld1.u32    {d1},[r12],r11              @p[0]
    vld1.u32    {d2},[r12],r11              @p[1]
    vld1.u32    {d3},[r12]                  @p[2] (no post-increment)
    @vext.u8       d2,d0,d1,#2             @vector extract of src[0_2]
    @vmull.u8      q4,d2,d25               @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    @vmlsl.u8      q4,d0,d24               @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    @vext.u8       d4,d0,d1,#4             @vector extract of src[0_4]
    @add           r12,r12,#4              @pu1_src + 4
    sub         r12, r12, #2                @net advance +4 bytes per 4 outputs
    @vext.u8       d6,d0,d1,#6             @vector extract of src[0_6]
    @vmlal.u8      q4,d4,d26               @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    @vmlsl.u8      q4,d6,d27               @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    vmull.u8    q4,d1,d25                   @acc  = p[0]  * c1
    vmlsl.u8    q4,d0,d24                   @acc -= p[-1] * c0
    vmlal.u8    q4,d2,d26                   @acc += p[1]  * c2
    vmlsl.u8    q4,d3,d27                   @acc -= p[2]  * c3

    vst1.64     {d8 },[r1]                  @store 4 results (16-bit each)
    subs        r10,r10,#4                  @decrement the wd loop
    add         r1,r1,#8                    @pi2_dst + 8 bytes

    bgt         loop_residue                @loop again

    @inner loop ends
    @add           r8,r3,lsl #1            @2*dst_strd
    @sub           r8,r8,r5,lsl #1         @2*dst_strd - 2wd
    @sub           r9,r2,r5                @src_strd - 2wd
    @subs          r7,r7,#1                @decrement the ht loop
    @add           r12,r12,r9              @pu1_src + src_strd
    @add           r1,r1,r8                @pu1_dst + 2*dst_strd
    @bgt           outer_loop_residue_4    @loop again
    @b                 end_loops               @jumps to end
710
end_loops:

    @ pop r4-r12 and load the saved lr straight into pc -> return
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
714
715
716
717
718
719
720