@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_chroma_mode_19_to_25.s
@*
@* @brief
@*  contains function definitions for intra prediction chroma angular
@* filtering (modes 19 to 25). functions are coded using neon intrinsics
@* and can be compiled using rvct
@*
@* @author
@*  naveen sr
@*
@* @par list of functions:
@*  - ihevc_intra_pred_chroma_mode_19_to_25_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    chroma intra prediction filter for angular modes 19 to 25
@*
@* @par description:
@*    the main reference is extended using the inverse angle table, after
@*    which each row of the prediction block is computed by two-tap linear
@*    interpolation along the prediction angle
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  intra prediction mode (19 to 25)
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_chroma_mode_19_to_25(uword8* pu1_ref,
@                               word32 src_strd,
@                               uword8* pu1_dst,
@                               word32 dst_strd,
@                               word32 nt,
@                               word32 mode)
@
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #236
@   nt
@   mode

.equ    nt_offset,      236
.equ    mode_offset,    240
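
@ Overview: modes 19 to 25 have a negative prediction angle. The routine
@ first builds an extended main reference array (ref_temp) on the stack and
@ then applies the same two-tap linear filter as modes 27 to 33. A hedged C
@ sketch of that flow (illustrative names, not the library's reference code):
@
@   ang     = gai4_ihevc_ang_table[mode];          /* < 0 for modes 19..25 */
@   inv_ang = gai4_ihevc_inv_ang_table[mode - 12];
@   /* stage 1: copy the top reference into ref_temp, then project left
@      reference samples onto indices -1 .. (nt*ang)>>5 using inv_ang     */
@   /* stage 2: per row, 1/32-pel interpolation between the two reference
@      samples selected by pos = (row + 1) * ang                          */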

.text
.align 4

.globl ihevc_intra_pred_chroma_mode_19_to_25_a9q
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern gau1_ihevc_planar_factor

gai4_ihevc_inv_ang_table_addr:
.long gai4_ihevc_inv_ang_table - ulbl1 - 8

gau1_ihevc_planar_factor_addr:
.long gau1_ihevc_planar_factor - ulbl2 - 8

gai4_ihevc_ang_table_addr_1:
.long gai4_ihevc_ang_table - ulbl3 - 8

gai4_ihevc_ang_table_addr_2:
.long gai4_ihevc_ang_table - ulbl4 - 8

.type ihevc_intra_pred_chroma_mode_19_to_25_a9q, %function

ihevc_intra_pred_chroma_mode_19_to_25_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8 - d15}
    sub         sp, sp, #132                @ref_temp[2 * max_cu_size + 2]

    ldr         r4,[sp,#nt_offset]          @loads nt
    ldr         r7, gai4_ihevc_ang_table_addr_1
ulbl3:
    add         r7,r7,pc

    ldr         r5,[sp,#mode_offset]        @mode (19 to 25)
    ldr         r8, gai4_ihevc_inv_ang_table_addr
ulbl1:
    add         r8,r8,pc

    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
    add         r8, r8, r5, lsl #2          @gai4_ihevc_inv_ang_table
    sub         r8, r8, #48                 @gai4_ihevc_inv_ang_table[mode - 12]

    ldr         r7, [r7]                    @intra_pred_ang

    ldr         r8, [r8]                    @inv_ang
    add         r6, sp, r4 , lsl #1         @ref_temp + 2 * nt

    mul         r9, r4, r7                  @nt*intra_pred_ang

    sub         r6, r6, #2                  @ref_temp + 2*nt - 2

    add         r1, r0, r4, lsl #2          @r1 = &src[4nt]
    vdup.8      d30, r7                     @intra_pred_ang

    mov         r7, r4

    asr         r9, r9, #5

    vld1.32     d0,[r1]!                    @ pu1_ref[two_nt + k]

    vst1.32     d0,[r6]!                    @ref_temp[k + nt - 1] = pu1_ref[two_nt + k]@

    subs        r7, r7, #4
    beq         end_loop_copy
    subs        r7,r7,#4
    beq         loop_copy_8
    subs        r7,r7,#8
    beq         loop_copy_16

loop_copy_32:
    vld1.8      {d0,d1,d2,d3},[r1]!
    vld1.8      {d4,d5,d6},[r1]!

    vst1.8      {d0,d1,d2,d3},[r6]!

    vst1.8      {d4,d5,d6},[r6]!
    b           end_loop_copy

loop_copy_16:
    vld1.8      {d0,d1,d2},[r1]!
    vst1.8      {d0,d1,d2},[r6]!

    b           end_loop_copy

loop_copy_8:
    vld1.8      d0,[r1]!
    vst1.8      d0,[r6]!

end_loop_copy:

    ldrh        r11, [r1]                   @copy the final (cb,cr) reference pair
    strh        r11, [r6]

    cmp         r9, #-1
    bge         linear_filtering

    add         r6, sp, r4 ,lsl #1          @ref_temp + 2 * nt
    sub         r6, r6, #4                  @ref_temp + 2 * nt - 2 - 2

    mov         r12, #0xffffffff

    rsb         r9, r9, r12                 @count to take care of ref_idx

    add         r1, r0, r4, lsl #2          @r1 = &src[2nt]

    mov         r7, #128                    @inv_ang_sum

loop_copy_ref_idx:

    add         r7, r7, r8                  @inv_ang_sum += inv_ang
    mov         r0,r7, lsr #8
    mov         r0,r0, lsl #1               @double the offset (interleaved chroma)
    ldrh        r11, [r1, -r0]
    strh        r11, [r6], #-2

    subs        r9, r9, #1

    bne         loop_copy_ref_idx

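@ The loop above matches this hedged C sketch (illustrative names; ref_main
@ points into ref_temp at the start of the main reference, and indexing is
@ in (cb,cr) halfword pairs because chroma is interleaved):
@
@   ref_idx     = (nt * intra_pred_ang) >> 5;      /* most negative index */
@   inv_ang_sum = 128;
@   for (k = -1; k > ref_idx; k--)
@   {
@       inv_ang_sum += inv_ang;
@       ((uword16 *)ref_main)[k] =
@           ((uword16 *)pu1_ref)[2 * nt - (inv_ang_sum >> 8)];
@   }
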
linear_filtering:
@   after the reference copy, the code below is adapted from the
@   mode 27 to 33 (positive angle) kernel
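@
@ Each row computed below follows the standard two-tap angular filter, as in
@ this hedged C sketch (illustrative names; col steps over the 2*nt
@ interleaved chroma bytes, so reference offsets are doubled):
@
@   pos   = (row + 1) * intra_pred_ang;
@   idx   = pos >> 5;
@   fract = pos & 31;
@   for (col = 0; col < 2 * nt; col++)
@       pu1_dst[row * dst_strd + col] =
@           (ref_main[2 * idx + col] * (32 - fract)
@            + ref_main[2 * idx + col + 2] * fract + 16) >> 5;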

    ldr         r6,gai4_ihevc_ang_table_addr_2 @loads word32 gai4_ihevc_ang_table[35]
ulbl4:
    add         r6,r6,pc

    lsl         r7,r4,#2                    @four_nt

    add         r8,r6,r5,lsl #2             @*gai4_ihevc_ang_table[mode]
    ldr         r9,[r8]                     @intra_pred_ang = gai4_ihevc_ang_table[mode]
    ldr         r1,gau1_ihevc_planar_factor_addr @used for ((row + 1) * intra_pred_ang) row values
ulbl2:
    add         r1,r1,pc
    add         r6,r1,#1

    add         r8, sp, r4, lsl #1          @ref_temp + 2 * nt
    sub         r8,#2                       @ref_temp + 2*nt -2

    mov         lr,#0                       @row
    mov         r12,r4
    lsl         r4,r4,#1                    @r4 = 2*nt (interleaved chroma width)

core_loop_8:
    add         r8,r8,#2                    @ref_temp + 2*nt
    vdup.8      d0,r9                       @intra_pred_ang
    mov         r12,r4,lsr #4               @(2*nt) >> 4 = nt/8

    vmov.i8     d1,#32
    mul         r7,r4,r12

    vmov.i16    q3,#31

    mov         r1,r8

    mov         r5,r4
    mov         r11,#2

prologue:
    vld1.8      {d3},[r6]                   @loads the row value
    vmull.s8    q1,d3,d0                    @pos = ((row + 1) * intra_pred_ang)
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
    vmovn.i16   d4,q2
    vshrn.s16   d5,q1,#5                    @idx = pos >> 5
    vshl.s8     d5,d5,#1                    @idx *= 2 (interleaved chroma, 2 bytes per sample)

    vdup.8      d31,d4[0]
    add         r0,r2,r3

    vmov.u32    lr,d5[0]                    @(i row)extract idx to the r register
@   lsl         lr,lr,#1

    vdup.8      d29,d4[1]                   @(ii)
    sbfx        r9,lr,#0,#8

    add         r10,r8,r9                   @(i row)*pu1_ref[ref_main_idx]

    vld1.8      {d8},[r10],r11              @(i row)ref_main_idx
    sbfx        r9,lr,#8,#8

    vld1.8      {d9},[r10]                  @(i row)ref_main_idx_1
    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]

    sbfx        r9,lr,#16,#8
    vsub.u8     d30,d1,d31                  @32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
    vmull.u8    q5,d8,d30                   @(i row)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(i row)vmull_u8(ref_main_idx_1, dup_const_fract)

    vdup.8      d27,d4[2]                   @(iii)
    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
    sbfx        r9,lr,#24,#8

    vdup.8      d25,d4[3]                   @(iv)
    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]

    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
    vrshrn.i16  d10,q5,#5                   @(i row)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)

    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1

    vdup.8      d31,d4[4]                   @(v)
    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vmov.u32    lr,d5[1]                    @extract idx to the r register
    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
@   lsl         lr,lr,#1

    vst1.8      {d10},[r2]!                 @(i row)
    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)

    sbfx        r9,lr,#0,#8
    vdup.8      d29,d4[5]                   @(vi)
    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]

    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)

    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
    sbfx        r9,lr,#8,#8

    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d14},[r0],r3               @(ii)
    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)

    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]
    vdup.8      d27,d4[6]                   @(vii)

    sbfx        r9,lr,#16,#8
    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d18},[r0],r3               @(iii)
    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)

    vdup.8      d25,d4[7]                   @(viii)
    sbfx        r9,lr,#24,#8

    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)

    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)

    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]
    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
    subs        r7,r7,#8

    vst1.8      {d22},[r0],r3               @(iv)
    cmp         r4,#8                       @ go to end if 4x4
    beq         end_loops

    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)

    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)

    addgt       r8,r8,#8
    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
    subgt       r4,r4,#8

    vst1.8      {d10},[r0],r3               @(v)
    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)

    beq         epilogue

    vld1.8      {d5},[r6]                   @loads the row value
    vmull.s8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
    vmovn.i16   d4,q2
    vshrn.s16   d3,q1,#5                    @idx = pos >> 5
    vshl.s8     d3,d3,#1                    @idx *= 2 (interleaved chroma, 2 bytes per sample)
    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
@   lsl         lr,lr,#1
    sbfx        r9,lr,#0,#8
    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]

kernel_8_rows:
    vdup.8      d31,d4[0]
    subs        r4,r4,#8
    sbfx        r9,lr,#8,#8

    vld1.8      {d8},[r10],r11              @(i)ref_main_idx
    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)

    addle       r6,r6,#8                    @increment the row value
    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]

    vld1.8      {d9},[r10]                  @(i)ref_main_idx_1
    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d5},[r6]                   @loads the row value
    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vdup.8      d29,d4[1]                   @(ii)
    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)

    sbfx        r9,lr,#16,#8

    vst1.8      {d14},[r0],r3               @(vi)
    vsub.u8     d30,d1,d31                  @(i)32-fract(dup_const_32_fract)

    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
    vmull.u8    q5,d8,d30                   @(i)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(i)vmull_u8(ref_main_idx_1, dup_const_fract)

    sbfx        r9,lr,#24,#8
    movle       r4,r5                       @reload the column count (2*nt)

    vmov.u32    lr,d3[1]                    @extract idx to the r register
    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)

    vdup.8      d27,d4[2]                   @(iii)
    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]

    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vst1.8      {d18},[r0],r3               @(vii)
    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
    vrshrn.i16  d10,q5,#5                   @(i)shift_res = vrshrn_n_u16(add_res, 5)

    vdup.8      d25,d4[3]                   @(iv)
    vmull.s8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)

    vst1.8      {d22},[r0]                  @(viii)
    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)

    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
@   lsl         lr,lr,#1

    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1
    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)

    sbfx        r9,lr,#0,#8
    add         r0,r2,r3

    vdup.8      d31,d4[4]                   @(v)
    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)

    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]
    sbfx        r9,lr,#8,#8

    vst1.8      {d10},[r2]!                 @(i)
    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)

    vdup.8      d29,d4[5]                   @(vi)
    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)

    vdup.8      d27,d4[6]                   @(vii)
    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)

    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]
    sbfx        r9,lr,#16,#8

    vdup.8      d25,d4[7]                   @(viii)
    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))

    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
    vshrn.s16   d3,q1,#5                    @idx = pos >> 5

    vst1.8      {d14},[r0],r3               @(ii)
    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)

    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]
    sbfx        r9,lr,#24,#8

    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)

    vshl.s8     d3,d3,#1                    @idx *= 2 (interleaved chroma, 2 bytes per sample)

    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)

    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)

    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]
    movle       r8,r1                       @reload the source to pu1_src+2nt

    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)

    vst1.8      {d18},[r0],r3               @(iii)
    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)

    addgt       r8,r8,#8                    @increment the source to the next set of 8 columns in the same row
    lslle       r12,r3,#3
    suble       r12,r12,r5

    vst1.8      {d22},[r0],r3               @(iv)
    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vst1.8      {d10},[r0],r3               @(v)
    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)

    addle       r2,r2,r12                   @advance the dst pointer by 8*dst_strd - 2*nt
    sbfx        r9,lr,#0,#8

    vmovn.i16   d4,q2
    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)
@   lsl         lr,lr,#1

    subs        r7,r7,#8
    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]

    bne         kernel_8_rows

epilogue:
    vst1.8      {d14},[r0],r3               @(vi)
    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)

    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)
    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d18},[r0],r3               @(vii)
    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)

    vst1.8      {d22},[r0],r3               @(viii)
    b           end_loops

core_loop_4:                                @unused here: nt = 4 gives 2*nt = 8 and is handled by core_loop_8

end_loops:
    add         sp, sp, #132
    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp