@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_luma_mode_19_to_25.s
@*
@* @brief
@*  contains function definitions for luma intra prediction, angular modes
@* 19 to 25. functions are coded in neon assembly and can be compiled
@* using rvct
@*
@* @author
@*  naveen sr
@*
@* @par list of functions:
@*  - ihevc_intra_pred_luma_mode_19_to_25_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    luma intra prediction interpolation filter for angular modes 19 to 25
@*
@* @par description:
@*    projects the left reference samples onto the main (top) reference
@*    array using the inverse angle, then computes each row of the
@*    prediction block by two-tap linear interpolation between adjacent
@*    reference samples
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  intra prediction mode (19 to 25)
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_luma_mode_19_to_25(uword8* pu1_ref,
@                               word32 src_strd,
@                               uword8* pu1_dst,
@                               word32 dst_strd,
@                               word32 nt,
@                               word32 mode)
@
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #40
@   nt
@   mode

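@**************algorithm overview*********************************************
@the angles for modes 19 to 25 are negative, so part of the left reference
@column is first projected onto the main (top) reference array using the
@inverse angle, and each predicted row is then a two-tap interpolation.
@a minimal c sketch of the computation follows; it is illustrative (names
@match the comments below rather than any exact reference implementation):
@
@   ref_main = ref_temp + nt - 1;
@   for(k = 0; k < nt + 1; k++)
@       ref_temp[k + nt - 1] = pu1_ref[two_nt + k];        /* copy top row */
@   ref_idx     = (nt * intra_pred_ang) >> 5;
@   inv_ang_sum = 128;
@   for(k = -1; k > ref_idx; k--)                          /* project left */
@   {
@       inv_ang_sum += inv_ang;
@       ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
@   }
@   for(row = 0; row < nt; row++)                          /* interpolate  */
@   {
@       pos   = (row + 1) * intra_pred_ang;
@       idx   = pos >> 5;
@       fract = pos & 31;
@       for(col = 0; col < nt; col++)
@           pu1_dst[col + row * dst_strd] =
@               ((32 - fract) * ref_main[col + idx + 1]
@                + fract * ref_main[col + idx + 2] + 16) >> 5;
@   }
@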
.text
.align 4

.globl ihevc_intra_pred_luma_mode_19_to_25_a9q
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern gau1_ihevc_planar_factor

gai4_ihevc_inv_ang_table_addr:
.long gai4_ihevc_inv_ang_table - ulbl1 - 8

gau1_ihevc_planar_factor_addr:
.long gau1_ihevc_planar_factor - ulbl2 - 8

gai4_ihevc_ang_table_addr_1:
.long gai4_ihevc_ang_table - ulbl_1 - 8

gai4_ihevc_ang_table_addr_2:
.long gai4_ihevc_ang_table - ulbl_2 - 8

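@ note: the .long entries above hold position-independent offsets. in arm
@ state pc reads as the address of the current instruction plus 8, so
@ "ldr rX, <literal>" followed by "add rX, rX, pc" at the matching ulbl
@ label reconstructs the absolute table address, which is why 8 is
@ subtracted in each expression.
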
.type ihevc_intra_pred_luma_mode_19_to_25_a9q, %function

ihevc_intra_pred_luma_mode_19_to_25_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r4,[sp,#40]                 @loads nt
    ldr         r7, gai4_ihevc_ang_table_addr_1
ulbl_1:
    add         r7,r7,pc

    ldr         r5,[sp,#44]                 @mode (19 to 25)
    ldr         r8, gai4_ihevc_inv_ang_table_addr
ulbl1:
    add         r8,r8,pc

    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
    add         r8, r8, r5, lsl #2          @gai4_ihevc_inv_ang_table
    sub         r8, r8, #48                 @gai4_ihevc_inv_ang_table[mode - 12]

    ldr         r7, [r7]                    @intra_pred_ang
    sub         sp, sp, #132                @ref_temp[2 * max_cu_size + 1]

    ldr         r8, [r8]                    @inv_ang
    add         r6, sp, r4                  @ref_temp + nt

    mul         r9, r4, r7                  @nt*intra_pred_ang

    sub         r6, r6, #1                  @ref_temp + nt - 1

    add         r1, r0, r4, lsl #1          @r1 = &src[2nt]
    vdup.8      d30, r7                     @intra_pred_ang

    mov         r7, r4

    asr         r9, r9, #5

    vld1.32     d0[0],[r1]!                 @ pu1_ref[two_nt + k]

    vst1.32     d0[0],[r6]!                 @ref_temp[k + nt - 1] = pu1_ref[two_nt + k]

    subs        r7, r7, #4
    beq         end_loop_copy               @nt == 4: the 4-byte copy above is enough
    sub         r1,#4                       @otherwise rewind and redo with 8-byte copies
    sub         r6,#4
    subs        r7,r7,#4
    beq         loop_copy_8
    subs        r7,r7,#8
    beq         loop_copy_16

loop_copy_32:
    vld1.8      d0,[r1]!
    vld1.8      d1,[r1]!
    vld1.8      d2,[r1]!
    vld1.8      d3,[r1]!

    vst1.8      d0,[r6]!
    vst1.8      d1,[r6]!
    vst1.8      d2,[r6]!
    vst1.8      d3,[r6]!
    b           end_loop_copy

loop_copy_16:
    vld1.8      d0,[r1]!
    vld1.8      d1,[r1]!

    vst1.8      d0,[r6]!
    vst1.8      d1,[r6]!
    b           end_loop_copy

loop_copy_8:
    vld1.8      d0,[r1]!
    vst1.8      d0,[r6]!

end_loop_copy:

    ldrb        r11, [r1]                   @copy the last sample: ref_temp[2*nt - 1] = pu1_ref[two_nt + nt]
    strb        r11, [r6]

    cmp         r9, #-1
    bge         linear_filtering            @no projection needed if ref_idx >= -1

    add         r6, sp, r4                  @ref_temp + nt
    sub         r6, r6, #2                  @ref_temp + nt - 2

    mov         r12, #0xffffffff

    rsb         r9, r9, r12                 @count to take care of ref_idx

    add         r1, r0, r4, lsl #1          @r1 = &src[2nt]

    mov         r7, #128                    @inv_ang_sum

loop_copy_ref_idx:

    add         r7, r7, r8                  @inv_ang_sum += inv_ang
    mov         r14,r7,lsr #8
    ldrb        r11, [r1, -r14]
@   ldrb        r11, [r1, -r7, lsr #8]
    strb        r11, [r6], #-1

    subs        r9, r9, #1

    bne         loop_copy_ref_idx

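@ illustrative worked example of the projection above (values assume the
@ standard hevc angle tables): nt = 8, mode = 19 gives intra_pred_ang = -26
@ and inv_ang = 315 (approximately 8192/26). then r9 = (8 * -26) >> 5 = -7,
@ so the loop count is -1 - (-7) = 6. inv_ang_sum steps through 443, 758,
@ 1073, 1388, 1703, 2018, so the bytes at pu1_ref[2*nt - 1], [2*nt - 2],
@ [2*nt - 4], [2*nt - 5], [2*nt - 6], [2*nt - 7] are written to
@ ref_main[-1] down to ref_main[-6].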

linear_filtering:
@   after the copy
@   the code below is taken from modes 27 to 33 and modified

    ldr         r6,gai4_ihevc_ang_table_addr_2 @loads word32 gai4_ihevc_ang_table[35]
ulbl_2:
    add         r6,r6,pc

    add         r8,r6,r5,lsl #2             @*gai4_ihevc_ang_table[mode]
    ldr         r9,[r8]                     @intra_pred_ang = gai4_ihevc_ang_table[mode]
    ldr         r1,gau1_ihevc_planar_factor_addr @used for ((row + 1) * intra_pred_ang) row values
ulbl2:
    add         r1,r1,pc
    add         r6,r1,#1

    add         r8, sp, r4                  @ref_temp + nt
    sub         r8,#1                       @ref_temp + nt - 1

    tst         r4,#7
    mov         lr,#0                       @row
    mov         r12,r4
    bne         core_loop_4

core_loop_8:
    add         r8,r8,#1                    @pu1_ref_main_idx += (two_nt + 1)
    vdup.8      d0,r9                       @intra_pred_ang
    mov         r12,r4,lsr #3               @divide by 8

    vmov.i8     d1,#32
    mul         r7,r4,r12

    vmov.i16    q3,#31
    @lsl            r12,r3,#3

    mov         r1,r8
    @sub            r12,r12,r4
    mov         r5,r4
    mov         r11,#1

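@ the 8x8 path computes eight rows per pass. gau1_ihevc_planar_factor + 1
@ (held in r6) supplies the consecutive (row + 1) values, 1..8 on the first
@ pass, so a single vmull.s8 against the replicated angle yields all eight
@ 16-bit pos values at once; fract = pos & 31 and idx = pos >> 5 then fall
@ out of vand/vshrn. illustrative example for intra_pred_ang = -5 (mode 24),
@ rows 0..7:
@   pos   =  -5 -10 -15 -20 -25 -30 -35 -40
@   idx   =  -1  -1  -1  -1  -1  -1  -2  -2   (arithmetic shift right by 5)
@   fract =  27  22  17  12   7   2  29  24   (bitwise and with 31)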
prologue:
    vld1.8      {d3},[r6]                   @loads the row value
    vmull.s8    q1,d3,d0                    @pos = ((row + 1) * intra_pred_ang)
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
    vmovn.i16   d4,q2
    vshrn.s16   d5,q1,#5                    @idx = pos >> 5

    vdup.8      d31,d4[0]
    add         r0,r2,r3

    vmov.u32    lr,d5[0]                    @(i row)extract idx to the r register

    vdup.8      d29,d4[1]                   @(ii)
    sbfx        r9,lr,#0,#8

    add         r10,r8,r9                   @(i row)*pu1_ref[ref_main_idx]

    vld1.8      {d8},[r10],r11              @(i row)ref_main_idx
    sbfx        r9,lr,#8,#8

    vld1.8      {d9},[r10]                  @(i row)ref_main_idx_1
    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]

    sbfx        r9,lr,#16,#8
    vsub.u8     d30,d1,d31                  @32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
    vmull.u8    q5,d8,d30                   @(i row)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(i row)vmull_u8(ref_main_idx_1, dup_const_fract)

    vdup.8      d27,d4[2]                   @(iii)
    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
    sbfx        r9,lr,#24,#8

    vdup.8      d25,d4[3]                   @(iv)
    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]

    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
    vrshrn.i16  d10,q5,#5                   @(i row)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)

    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1

    vdup.8      d31,d4[4]                   @(v)
    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vmov.u32    lr,d5[1]                    @extract idx to the r register
    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d10},[r2]!                 @(i row)
    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)

    sbfx        r9,lr,#0,#8
    vdup.8      d29,d4[5]                   @(vi)
    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]

    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)

    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
    sbfx        r9,lr,#8,#8

    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d14},[r0],r3               @(ii)
    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)

    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]
    vdup.8      d27,d4[6]                   @(vii)

    sbfx        r9,lr,#16,#8
    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d18},[r0],r3               @(iii)
    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)

    vdup.8      d25,d4[7]                   @(viii)
    sbfx        r9,lr,#24,#8

    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)

    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)

    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]
    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
    subs        r4,r4,#8

    vst1.8      {d22},[r0],r3               @(iv)
    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)

    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)

    addgt       r8,r8,#8
    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
    subgt       r7,r7,#8

    vst1.8      {d10},[r0],r3               @(v)
    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)

    beq         epilogue

    vld1.8      {d5},[r6]                   @loads the row value
    vmull.s8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
    vmovn.i16   d4,q2
    vshrn.s16   d3,q1,#5                    @idx = pos >> 5
    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
    sbfx        r9,lr,#0,#8
    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]

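@ the kernel below is software pipelined: the eight rows tagged (i) to
@ (viii) in the comments are kept in flight, so loads for one row are
@ interleaved with the multiplies of earlier rows and the stores of rows
@ finishing further back, hiding memory and multiplier latency. r4 counts
@ down the columns of the current 8-row strip (movle r4,r5 reloads nt at a
@ strip boundary), and r7 counts down the total work in steps of 8; at a
@ strip boundary r8 is rewound to the reference base and r2 advanced to
@ the next 8 rows of the destination.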
kernel_8_rows:
    vdup.8      d31,d4[0]
    subs        r4,r4,#8
    sbfx        r9,lr,#8,#8

    vld1.8      {d8},[r10],r11              @(i)ref_main_idx
    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)

    addle       r6,r6,#8                    @increment the row value
    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]

    vld1.8      {d9},[r10]                  @(i)ref_main_idx_1
    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d5},[r6]                   @loads the row value
    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vdup.8      d29,d4[1]                   @(ii)
    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)

    sbfx        r9,lr,#16,#8

    vst1.8      {d14},[r0],r3               @(vi)
    vsub.u8     d30,d1,d31                  @(i)32-fract(dup_const_32_fract)

    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
    vmull.u8    q5,d8,d30                   @(i)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(i)vmull_u8(ref_main_idx_1, dup_const_fract)

    sbfx        r9,lr,#24,#8
    movle       r4,r5                       @reload nt

    vmov.u32    lr,d3[1]                    @extract idx to the r register
    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)

    vdup.8      d27,d4[2]                   @(iii)
    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]

    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vst1.8      {d18},[r0],r3               @(vii)
    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
    vrshrn.i16  d10,q5,#5                   @(i)shift_res = vrshrn_n_u16(add_res, 5)

    vdup.8      d25,d4[3]                   @(iv)
    vmull.s8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)

    vst1.8      {d22},[r0]                  @(viii)
    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)

    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1
    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)

    sbfx        r9,lr,#0,#8
    add         r0,r2,r3

    vdup.8      d31,d4[4]                   @(v)
    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)

    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]
    sbfx        r9,lr,#8,#8

    vst1.8      {d10},[r2]!                 @(i)
    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)

    vdup.8      d29,d4[5]                   @(vi)
    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)

    vdup.8      d27,d4[6]                   @(vii)
    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)

    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]
    sbfx        r9,lr,#16,#8

    vdup.8      d25,d4[7]                   @(viii)
    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))

    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
    vshrn.s16   d3,q1,#5                    @idx = pos >> 5

    vst1.8      {d14},[r0],r3               @(ii)
    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)

    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]
    sbfx        r9,lr,#24,#8

    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)

    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)

    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]
    movle       r8,r1                       @reload the source to pu1_src+2nt

    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)

    vst1.8      {d18},[r0],r3               @(iii)
    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)

    addgt       r8,r8,#8                    @advance the source to the next set of 8 columns in the same rows
    lslle       r12,r3,#3
    suble       r12,r12,r5

    vst1.8      {d22},[r0],r3               @(iv)
    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vst1.8      {d10},[r0],r3               @(v)
    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)

    addle       r2,r2,r12                   @advance the dst pointer by 8*dst_strd - nt
    sbfx        r9,lr,#0,#8

    vmovn.i16   d4,q2
    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)

    subs        r7,r7,#8
    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]

    bne         kernel_8_rows

epilogue:
    vst1.8      {d14},[r0],r3               @(vi)
    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)

    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)
    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d18},[r0],r3               @(vii)
    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)

    vst1.8      {d22},[r0],r3               @(viii)
    b           end_loops

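@ 4x4 path: the block is too narrow for the 8-column pipeline above, so
@ pos, idx and fract are computed per row in core registers (r5, lr) and
@ only the two-tap interpolation itself runs on neon, four pixels at a
@ time. the body below is the four-row loop fully unrolled.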
core_loop_4:
    add         r6,r8,#1                    @pu1_ref_main_idx += 1
    mov         r8,#0

    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    mov         lr,r5,asr #5                @idx = pos >> 5
    and         r5,r5,#31                   @fract = pos & (31)
    add         r10,r6,lr                   @ref_main + idx
    add         r11,r10,#1                  @ref_main + idx + 1
    vdup.8      d0,r5                       @dup_const_fract
    rsb         r4,r5,#32
    vdup.8      d1,r4                       @dup_const_32_fract

@inner_loop_4
    vld1.32     {d2[0]},[r10]               @ref_main_idx
    add         r8,r8,#1
@   mov         lr,r5                           @fract_prev = fract

    vld1.32     {d3[0]},[r11]               @ref_main_idx_1
    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    mov         lr,r5,asr #5                @idx = pos >> 5
    and         r5,r5,#31                   @fract = pos & (31)
    add         r10,r6,lr                   @ref_main + idx
    add         r11,r10,#1                  @ref_main + idx + 1

    vdup.8      d6,r5                       @dup_const_fract
    vmull.u8    q2,d2,d1                    @vmull_u8(ref_main_idx, dup_const_32_fract)

    rsb         r4,r5,#32
    vdup.8      d7,r4                       @dup_const_32_fract
    vmlal.u8    q2,d3,d0                    @vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.32     {d8[0]},[r10]               @ref_main_idx
    add         r8,r8,#1

    vld1.32     {d9[0]},[r11]               @ref_main_idx_1
    vrshrn.i16  d4,q2,#5                    @shift_res = vrshrn_n_u16(add_res, 5)

@   mov         lr,r5                           @fract_prev = fract
    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    mov         lr,r5,asr #5                @idx = pos >> 5
    and         r5,r5,#31                   @fract = pos & (31)
    add         r10,r6,lr                   @ref_main + idx
    add         r11,r10,#1                  @ref_main + idx + 1

    vdup.8      d12,r5                      @dup_const_fract
    vmull.u8    q5,d8,d7                    @vmull_u8(ref_main_idx, dup_const_32_fract)

    rsb         r4,r5,#32
    vdup.8      d13,r4                      @dup_const_32_fract
    vmlal.u8    q5,d9,d6                    @vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.32     {d14[0]},[r10]              @ref_main_idx
    add         r8,r8,#1

    vst1.32     {d4[0]},[r2],r3
    vrshrn.i16  d10,q5,#5                   @shift_res = vrshrn_n_u16(add_res, 5)

    vld1.32     {d15[0]},[r11]              @ref_main_idx_1
@   mov         lr,r5                           @fract_prev = fract
    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    mov         lr,r5,asr #5                @idx = pos >> 5
    and         r5,r5,#31                   @fract = pos & (31)
    add         r10,r6,lr                   @ref_main + idx
    add         r11,r10,#1                  @ref_main + idx + 1

    vdup.8      d18,r5                      @dup_const_fract
    vmull.u8    q8,d14,d13                  @vmull_u8(ref_main_idx, dup_const_32_fract)

    rsb         r4,r5,#32
    vdup.8      d19,r4                      @dup_const_32_fract
    vmlal.u8    q8,d15,d12                  @vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.32     {d20[0]},[r10]              @ref_main_idx

    vst1.32     {d10[0]},[r2],r3
    vrshrn.i16  d16,q8,#5                   @shift_res = vrshrn_n_u16(add_res, 5)
    vld1.32     {d21[0]},[r11]              @ref_main_idx_1

    vmull.u8    q11,d20,d19                 @vmull_u8(ref_main_idx, dup_const_32_fract)
    vmlal.u8    q11,d21,d18                 @vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.32     {d16[0]},[r2],r3
    vrshrn.i16  d22,q11,#5                  @shift_res = vrshrn_n_u16(add_res, 5)

    vst1.32     {d22[0]},[r2],r3

end_loops:
    add         sp, sp, #132
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
