@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_luma_mode_11_to_17.s
@*
@* @brief
@*  contains function definitions for luma intra prediction for modes 11 to 17.
@* functions are coded in neon assembly and can be compiled using
@* rvct
@*
@* @author
@*  akshaya mukund
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    luma intraprediction filter for modes 11 to 17
@*
@* @par description:
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  type of filtering
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_luma_mode_11_to_17(uword8* pu1_ref,
@                               word32 src_strd,
@                               uword8* pu1_dst,
@                               word32 dst_strd,
@                               word32 nt,
@                               word32 mode)
@
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #40
@   nt
@   mode

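@per-sample computation implemented by the neon code below (illustrative
@c-style sketch; variable names are descriptive and not taken from the c
@reference code):
@   pos   = (col + 1) * intra_pred_ang;
@   idx   = pos >> 5;
@   fract = pos & 31;
@   out   = ((32 - fract) * ref_main[idx] + fract * ref_main[idx + 1] + 16) >> 5;
@the rounding add and the final shift are done in one step with vrshrn #5.
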
.text
.align 4




.globl ihevc_intra_pred_luma_mode_11_to_17_a9q
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern col_for_intra_luma
.extern idx_neg_idx_11_17

gai4_ihevc_ang_table_addr:
.long gai4_ihevc_ang_table - ulbl1 - 8

gai4_ihevc_inv_ang_table_addr:
.long gai4_ihevc_inv_ang_table - ulbl2 - 8

idx_neg_idx_11_17_addr_1:
.long idx_neg_idx_11_17 - ulbl3 - 8

idx_neg_idx_11_17_addr_2:
.long idx_neg_idx_11_17 - ulbl4 - 8

col_for_intra_luma_addr_1:
.long col_for_intra_luma - ulbl_1 - 8

col_for_intra_luma_addr_2:
.long col_for_intra_luma - ulbl_2 - 8

col_for_intra_luma_addr_3:
.long col_for_intra_luma - ulbl_3 - 8

col_for_intra_luma_addr_4:
.long col_for_intra_luma - ulbl_4 - 8

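@each .long above stores "symbol - label - 8"; the matching "add rX, rX, pc"
@executed at the corresponding ulbl*/ulbl_* label then reconstructs the
@absolute address of the symbol (pc reads as label + 8 in arm state), keeping
@the table lookups position independent.
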
.type ihevc_intra_pred_luma_mode_11_to_17_a9q, %function

ihevc_intra_pred_luma_mode_11_to_17_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r4,[sp,#40]                 @loads nt
    ldr         r7, gai4_ihevc_ang_table_addr
ulbl1:
    add         r7,r7,pc

    ldr         r5,[sp,#44]                 @mode (11 to 17)
    ldr         r8, gai4_ihevc_inv_ang_table_addr
ulbl2:
    add         r8,r8,pc

    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
    add         r8, r8, r5, lsl #2          @gai4_ihevc_inv_ang_table[mode - 11]
    sub         r8, r8, #44

    ldr         r7, [r7]                    @intra_pred_ang
    sub         sp, sp, #132                @ref_temp[2 * max_cu_size + 1]

    ldr         r8, [r8]                    @inv_ang
    add         r6, sp, r4                  @ref_temp + nt

    mul         r9, r4, r7                  @nt*intra_pred_ang

    sub         r6, r6, #1                  @ref_temp + nt - 1

    add         r1, r0, r4, lsl #1          @r1 = &src[2nt]
    vdup.8      d30, r7                     @intra_pred_ang

    mov         r7, r4

    ldrb        r11, [r1], #-1

    asr         r9, r9, #5

    ldrb        r12, [r1], #-1
    ldrb        r10, [r1], #-1
    ldrb        r14, [r1], #-1

    strb        r11, [r6], #1
    strb        r12, [r6], #1
    strb        r10, [r6], #1
    strb        r14, [r6], #1

    subs        r7, r7, #4
    beq         end_loop_copy

    sub         r6,#4
    sub         r1,#3

    subs        r7,r7,#4
    beq         loop_copy_8
    subs        r7,r7,#8
    beq         loop_copy_16

loop_copy_32:
    vld1.8      d0,[r1]
    sub         r1,#8
    vld1.8      d1,[r1]
    sub         r1,#8
    vld1.8      d2,[r1]
    sub         r1,#8
    vld1.8      d3,[r1]

    vrev64.8    d0,d0
    vrev64.8    d1,d1
    vst1.8      d0,[r6]!
    vrev64.8    d2,d2
    vst1.8      d1,[r6]!
    vrev64.8    d3,d3
    vst1.8      d2,[r6]!
    vst1.8      d3,[r6]!
    sub         r1,#1
    b           end_loop_copy

loop_copy_16:
    vld1.8      d0,[r1]
    sub         r1,#8
    vld1.8      d1,[r1]

    vrev64.8    d0,d0
    vrev64.8    d1,d1

    vst1.8      d0,[r6]!
    vst1.8      d1,[r6]!
    sub         r1,#1
    b           end_loop_copy

loop_copy_8:
    vld1.8      d0,[r1]
    vrev64.8    d0,d0
    vst1.8      d0,[r6]!
    sub         r1,#1
end_loop_copy:

    ldrb        r11, [r1], #-1
    strb        r11, [r6], #1

    cmp         r9, #-1
    bge         prologue_8_16_32

    add         r6, sp, r4                  @ref_temp + nt
    sub         r6, r6, #2                  @ref_temp + nt - 2

    mov         r12, #0xffffffff

    rsb         r9, r9, r12                 @count to take care of ref_idx

    add         r1, r0, r4, lsl #1          @r1 = &src[2nt]

    mov         r7, #128                    @inv_ang_sum

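@when (nt * intra_pred_ang) >> 5 is below -1 the main reference does not cover
@all projected indices, so the loop below extends ref_temp downwards from
@ref_temp[nt - 2], stepping inv_ang_sum by the inverse angle and fetching
@src[2nt + (inv_ang_sum >> 8)] for each additional entry.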
loop_copy_ref_idx:

    add         r7, r7, r8                  @inv_ang_sum += inv_ang

    ldrb        r11, [r1, r7, lsr #8]
    strb        r11, [r6], #-1

    subs        r9, r9, #1

    bne         loop_copy_ref_idx

prologue_8_16_32:
    cmp         r4, #4
    beq         sz_4_proc
    ldr         r14, col_for_intra_luma_addr_1
ulbl_1:
    add         r14,r14,pc

    lsr         r10, r4, #3
    vld1.8      d31, [r14]!
    mul         r10, r4, r10                @block counter (dec by #8)

    mov         r11, r4                     @col counter to be inc/dec by #8
    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)
    mov         r0, #1

    sub         r7, r5, #11
    vdup.8      d2, r0                      @contains #1 for adding to get ref_main_idx + 1
    ldr         r12, idx_neg_idx_11_17_addr_1 @load least idx table
ulbl3:
    add         r12,r12,pc

    mov         r0, #2
    vdup.8      d3, r0

    add         r12, r12, r7, lsl #4
    mov         r8, r12

    mov         r7, #8
    sub         r7, r7, r3, lsl #3          @r7 = 8-8r3

    ldr         r9, [r8]
    add         r1, sp, r4                  @ref_temp + nt

    vmovn.s16   d6, q11
    vdup.8      d26, r9                     @least idx added to final idx values
    sub         r1, r1, #1                  @ref_temp + nt - 1

    add         r6, r1, r9

    vld1.8      {d0,d1}, [r6]               @loads the 16 ref values reqd based on indices (from least idx)
    vshr.s16    q11, q11, #5

    mov         r0, #31
    vdup.8      d29, r0                     @contains #31 for vand operation

    mov         r0, #32
    vdup.8      d28, r0

    vqmovn.s16  d8, q11

    vand        d6, d6, d29                 @fract values in d6/ idx values in d8

    mov         r0, #1
    vdup.8      d27, r0                     @row value inc or reset accordingly

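@for each 8x8 tile the code below gathers ref_main[idx] and ref_main[idx + 1]
@for eight columns at a time with vtbl from the 16 reference bytes loaded at
@the least index, forms (32 - fract) * a + fract * b with vmull/vmlal and
@rounds with vrshrn #5; rows are software pipelined so table lookups,
@multiplies and stores of neighbouring rows overlap.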
    vadd.s8     d8, d8, d27                 @ref_main_idx (add row)
    vsub.s8     d8, d8, d26                 @ref_main_idx (row 0)
    vadd.s8     d9, d8, d2                  @ref_main_idx + 1 (row 0)
    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
    vsub.s8     d7, d28, d6                 @32-fract

    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)
    vadd.s8     d4, d8, d2                  @ref_main_idx (row 1)
    vadd.s8     d5, d9, d2                  @ref_main_idx + 1 (row 1)

    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 1)
    vmull.u8    q12, d12, d7                @mul (row 0)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
    vadd.s8     d8, d8, d3                  @ref_main_idx (row 2)
    vadd.s8     d9, d9, d3                  @ref_main_idx + 1 (row 2)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)

    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 2)
    vmull.u8    q11, d16, d7                @mul (row 1)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
    vadd.s8     d4, d4, d3                  @ref_main_idx (row 3)
    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 3)

    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 3)
    vmull.u8    q10, d14, d7                @mul (row 2)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)
    vadd.s8     d8, d8, d3                  @ref_main_idx (row 4)
    vadd.s8     d9, d9, d3                  @ref_main_idx + 1 (row 4)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 4)
    vmull.u8    q9, d10, d7                 @mul (row 3)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 4)
    vadd.s8     d4, d4, d3                  @ref_main_idx (row 5)
    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 5)

    vst1.8      d20, [r2], r3               @st (row 2)
    vrshrn.i16  d18, q9, #5                 @round shft (row 3)

    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 5)
    vmull.u8    q12, d12, d7                @mul (row 4)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 5)
    vadd.s8     d8, d8, d3                  @ref_main_idx (row 6)
    vadd.s8     d9, d9, d3                  @ref_main_idx + 1 (row 6)

    vst1.8      d18, [r2], r3               @st (row 3)
    vrshrn.i16  d24, q12, #5                @round shft (row 4)

    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 6)
    vmull.u8    q11, d16, d7                @mul (row 5)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 6)
    vadd.s8     d4, d4, d3                  @ref_main_idx (row 7)
    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 7)

    vst1.8      d24, [r2], r3               @st (row 4)
    vrshrn.i16  d22, q11, #5                @round shft (row 5)

    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)
    vmull.u8    q10, d14, d7                @mul (row 6)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 7)
    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d22, [r2], r3               @st (row 5)
    vrshrn.i16  d20, q10, #5                @round shft (row 6)
    vrshrn.i16  d18, q9, #5                 @round shft (row 7)

    vst1.8      d20, [r2], r3               @st (row 6)

    subs        r10, r10, #8                @subtract 8 and go to end if 8x8

    vst1.8      d18, [r2], r3               @st (row 7)

    beq         end_func

    subs        r11, r11, #8
    addgt       r8, r8, #4
    addgt       r2, r2, r7
    movle       r8, r12
    suble       r2, r2, r4
    addle       r2, r2, #8
    movle       r11, r4
    ldrle       r14, col_for_intra_luma_addr_2
ulbl_2:
    addle       r14,r14,pc
    addle       r0, r0, #8

    mov         r5,r2
    vld1.8      d31, [r14]!
    vmull.s8    q6, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    vmovn.s16   d10, q6
    vshr.s16    q6, q6, #5
    vqmovn.s16  d11, q6
    vdup.8      d27, r0                     @row value inc or reset accordingly
    ldr         r9, [r8]
    add         r9, r0, r9
    sub         r9, r9, #1
    vdup.8      d26, r9
    vadd.s8     d8, d27, d11                @ref_main_idx (add row)

    sub         r4,r4,#8

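@main software-pipelined loop: each iteration finishes rows 4 to 7 of the
@previous 8x8 tile (stores through r5) while computing rows 0 to 7 of the
@current tile, and reloads the 16 reference bytes and the per-column idx/fract
@values whenever a new tile is started.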
kernel_8_16_32:

    vsub.s8     d8, d8, d26                 @ref_main_idx
    vmov        d26,d10

    subs        r11, r11, #8
    add         r6, r1, r9
    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)
    vadd.s8     d9, d2, d8                  @ref_main_idx + 1

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    addle       r0, r0, #8
    addgt       r8, r8, #4
    vld1.8      {d0,d1}, [r6]               @loads the 16 ref values reqd based on indices (from least idx)

    vst1.8      d24, [r5], r3               @st (row 4)
    vrshrn.i16  d24, q11, #5                @round shft (row 5)

    ldrle       r14, col_for_intra_luma_addr_3
ulbl_3:
    addle       r14,r14,pc
    movle       r8, r12
    vdup.8      d27, r0                     @row value inc or reset accordingly

    vadd.s8     d4, d2, d8                  @ref_main_idx (row 1)
    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
    vadd.s8     d5, d2, d9                  @ref_main_idx + 1 (row 1)


    vmull.u8    q9, d10, d7                 @mul (row 7)
    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vld1.8      d31, [r14]!
    vand        d6, d29, d26                @fract values in d6

    vst1.8      d24, [r5], r3               @(from previous loop)st (row 5)
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vadd.s8     d8, d3, d8                  @ref_main_idx (row 2)
    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 1)
    vadd.s8     d9, d3, d9                  @ref_main_idx + 1 (row 2)

    addle       r11, r4, #8
    ldr         r9, [r8]
    vsub.s8     d7, d28, d6                 @32-fract

    vmull.u8    q12, d12, d7                @mul (row 0)
    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vadd.s8     d4, d4, d3                  @ref_main_idx (row 3)
    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 2)
    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 3)

    vmull.u8    q11, d16, d7                @mul (row 1)
    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)
    vst1.8      d18, [r5], r3               @(from previous loop)st (row 7)

    vadd.s8     d8, d8, d3                  @ref_main_idx (row 4)
    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 3)
    vadd.s8     d9, d9, d3                  @ref_main_idx + 1 (row 4)

    vmull.u8    q10, d14, d7                @mul (row 2)
    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    vmull.s8    q7, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    add         r5,r2,r3,lsl#2
    add         r9, r0, r9


    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vadd.s8     d4, d4, d3                  @ref_main_idx (row 5)
    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 4)
    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 3)
    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 4)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vmovn.s16   d10, q7
    vshr.s16    q7, q7, #5

    vadd.s8     d8, d8, d3                  @ref_main_idx (row 6)
    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 5)
    vadd.s8     d9, d9, d3                  @ref_main_idx + 1 (row 6)

    vmull.u8    q12, d12, d7                @mul (row 4)
    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 5)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vst1.8      d20, [r2], r3               @st (row 2)
    vrshrn.i16  d18, q9, #5                 @round shft (row 3)

    sub         r9, r9, #1
    vqmovn.s16  d11, q7

    vadd.s8     d4, d4, d3                  @ref_main_idx (row 7)
    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 6)
    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 7)

    vmull.u8    q11, d16, d7                @mul (row 5)
    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 6)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vadd.s8     d8, d27, d11                @ref_main_idx (add row)
    vdup.8      d26, r9

    vst1.8      d18, [r2], r3               @st (row 3)
    vrshrn.i16  d24, q12, #5                @round shft (row 4)


    add         r2,r3, lsl #2
    addgt       r2, r7, r2
    suble       r2, r2, r4

    subs        r10, r10, #8                @subtract 8 and go to end if 8x8

    bne         kernel_8_16_32
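@epilogue: drain the results still in flight when the kernel loop exits,
@i.e. rows 4 to 7 of the final 8x8 tile.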
epil_8_16_32:

    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vst1.8      d24, [r5], r3               @st (row 4)
    vrshrn.i16  d24, q11, #5                @round shft (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d24, [r5], r3               @(from previous loop)st (row 5)
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vst1.8      d18, [r5], r3               @st (row 7)


    b           end_func

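@nt == 4 path: a single 4x4 block is predicted with the same gather and
@two-tap interpolation as above, storing one 32-bit word per row.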
sz_4_proc:
    ldr         r14, col_for_intra_luma_addr_4
ulbl_4:
    add         r14,r14,pc

    vld1.8      d31, [r14]
    mov         r12, #1

    vdup.8      d2, r12                     @contains #1 for adding to get ref_main_idx + 1
    mov         r0, #2

    vdup.8      d3, r0
    ldr         r12, idx_neg_idx_11_17_addr_2 @load least idx table
ulbl4:
    add         r12,r12,pc

    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)
    sub         r7, r5, #11

    add         r12, r12, r7, lsl #4
    mov         r8, r12

    ldr         r9, [r8]

    vdup.8      d26, r9                     @least idx added to final idx values
    add         r6, sp, r4                  @ref_temp + nt

    sub         r6, r6, #1                  @ref_temp + nt - 1
    vmovn.s16   d6, q11
    add         r6, r6, r9

    vld1.8      {d0,d1}, [r6]               @loads the 16 ref values reqd based on indices (from least idx)
    mov         r0, #31

    vdup.8      d29, r0                     @contains #31 for vand operation
    mov         r1, #32

    vdup.8      d28, r1

    vshr.s16    q11, q11, #5
    vqmovn.s16  d8, q11

    vand        d6, d6, d29                 @fract values in d6/ idx values in d8
    vsub.s8     d7, d28, d6                 @32-fract

    vadd.s8     d8, d8, d2                  @ref_main_idx (add 1)
    vsub.s8     d8, d8, d26                 @ref_main_idx
    vadd.s8     d9, d8, d2                  @ref_main_idx + 1

    vadd.s8     d4, d8, d2                  @row 1 ref_main_idx
    vadd.s8     d5, d9, d2

    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)


    vmull.u8    q12, d12, d7                @mul (row 0)
    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 1)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vadd.s8     d8, d8, d3                  @idx (row 2)
    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
    vadd.s8     d9, d9, d3                  @idx+1 (row 2)

    vmull.u8    q11, d16, d7                @mul (row 1)
    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 2)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vrshrn.i16  d24, q12, #5                @round shift (row 0)

    vadd.s8     d4, d4, d3                  @idx (row 3)
    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
    vadd.s8     d5, d5, d3                  @idx+1 (row 3)

    vmull.u8    q10, d12, d7                @mul (row 2)
    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 3)
    vmlal.u8    q10, d13, d6                @mul (row 2)

    vst1.32     d24[0], [r2], r3            @st row 0
    vrshrn.i16  d22, q11, #5                @round shift (row 1)

    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)

    vmull.u8    q9, d16, d7                 @mul (row 3)
    vmlal.u8    q9, d17, d6                 @mul (row 3)

    vst1.32     d22[0], [r2], r3            @st row 1
    vrshrn.i16  d20, q10, #5                @round shift (row 2)

    vst1.32     d20[0], [r2], r3            @st row 2

    vrshrn.i16  d18, q9, #5                 @round shift (row 3)

    vst1.32     d18[0], [r2], r3            @st (row 3)

end_func:
    add         sp, sp, #132
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp