@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_chroma_mode_3_to_9.s
@*
@* @brief
@*  contains function definitions for chroma intra prediction angular
@* filtering (modes 3 to 9). functions are coded in neon assembly and can
@* be compiled using rvct
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    chroma intra prediction interpolation filter for angular modes 3 to 9
@*
@* @par description:
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  intra prediction mode (3 to 9)
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@void ihevc_intra_pred_chroma_mode_3_to_9(uword8 *pu1_ref,
@                                       word32 src_strd,
@                                       uword8 *pu1_dst,
@                                       word32 dst_strd,
@                                       word32 nt,
@                                       word32 mode)
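@
@ a minimal illustrative scalar sketch (not the library's reference c code) of
@ the two-tap 1/32-pel interpolation this routine vectorizes; ref_main_idx,
@ which depends on row, col and the (negative) prediction angle, is left
@ abstract here and the names used are illustrative assumptions only:
@
@   intra_pred_ang = gai4_ihevc_ang_table[mode];
@   pos   = (col + 1) * intra_pred_ang;
@   idx   = pos >> 5;                 /* integer part of the offset      */
@   fract = pos & 31;                 /* 1/32-pel fractional weight      */
@   /* u and v samples are interleaved, hence the 2* index scaling       */
@   pu1_dst[row * dst_strd + 2 * col] =
@       (pu1_ref[2 * ref_main_idx]         * (32 - fract)
@        + pu1_ref[2 * (ref_main_idx + 1)] * fract + 16) >> 5;
@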
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #104
@   nt
@   mode

.equ    nt_offset,          104
.equ    mode_offset,        108
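@   (offset 104 follows from the prologue below: stmfd saves ten core
@   registers = 40 bytes and vpush saves d8-d15 = 64 bytes, so the stacked
@   arguments start 104 bytes above sp inside the function body)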

.text
.align 4





.globl ihevc_intra_pred_chroma_mode_3_to_9_a9q
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern col_for_intra_chroma
.extern idx_neg_idx_chroma_3_9

gai4_ihevc_ang_table_addr:
.long gai4_ihevc_ang_table - ulbl1 - 8

gai4_ihevc_inv_ang_table_addr:
.long gai4_ihevc_inv_ang_table - ulbl2 - 8


idx_neg_idx_chroma_3_9_addr:
.long idx_neg_idx_chroma_3_9 - ulbl3 - 8

col_for_intra_chroma_addr_1:
.long col_for_intra_chroma - ulbl4 - 8

col_for_intra_chroma_addr_2:
.long col_for_intra_chroma - ulbl5 - 8

col_for_intra_chroma_addr_3:
.long col_for_intra_chroma - ulbl6 - 8

.type ihevc_intra_pred_chroma_mode_3_to_9_a9q, %function

ihevc_intra_pred_chroma_mode_3_to_9_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8 - d15}

    ldr         r4,[sp,#nt_offset]          @loads nt
    ldr         r7, gai4_ihevc_ang_table_addr
ulbl1:
    add         r7,r7,pc

    ldr         r5,[sp,#mode_offset]        @mode (3 to 9)
    ldr         r8, gai4_ihevc_inv_ang_table_addr
ulbl2:
    add         r8,r8,pc

    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
    ldr         r7, [r7]                    @intra_pred_ang
    vdup.8      d30, r7                     @intra_pred_ang

    ldr         r14, col_for_intra_chroma_addr_1
ulbl4:
    add         r14,r14,pc

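@ prologue: forms the per-column (col + 1) * intra_pred_angle products in q11
@ from the bytes loaded through col_for_intra_chroma; the low 5 bits give the
@ fractional weights (d6) and the shifted integer part the byte offsets
@ (d8/d9) used as vtbl indices into the 32 reference bytes held in d0-d3,
@ producing eight rows of eight output bytes (four interleaved u/v pairs per
@ row). for nt == 4 only four rows are needed and the function exits early.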
prologue_8_16_32:
    lsr         r10, r4, #3
    vld1.8      d31, [r14]!
    mul         r10, r4, r10                @block counter (dec by #8)

    mov         r11, r4, lsl #1             @col counter to be inc/dec by #8
    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)

    sub         r7, r5, #3
    ldr         r12, idx_neg_idx_chroma_3_9_addr @load most idx table
ulbl3:
    add         r12,r12,pc

    add         r12, r12, r7, lsl #4
    mov         r8, r12

    mov         r7, #8
    sub         r7, r7, r3, lsl #3          @r7 = 8-8r3

    ldr         r9, [r8]
    mov         r9, r9, lsl #1
    add         r1, r0, r4, lsl #2          @pu1_ref + 4*nt

    vmovn.s16   d6, q11
    vdup.8      d26, r9                     @most idx added to final idx values
    sub         r1, r1, #26                 @ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row

    sub         r6, r1, r9

    vld1.8      {d0,d1,d2,d3}, [r6]         @stores the 32 values reqd based on indices values (from most idx)
    vshr.s16    q11, q11, #5

    vmov.i8     d29, #31                    @contains #31 for vand operation

    vmov.i8     d28, #32

    vqmovn.s16  d8, q11
    vshl.s8     d8, d8, #1                  @ 2 * idx

    vand        d6, d6, d29                 @fract values in d1/ idx values in d0
    vmov.i8     d29, #2                     @contains #2 for adding to get ref_main_idx + 1

    movw        r0,#0x302                   @ idx value for v is +1 of u
    vdup.u16    d27,r0
    mov         r0,#0

    vmov.i8     d9, #22                     @row 0 to 7

    vsub.s8     d8, d8, d27                 @ref_main_idx (sub row)
    vsub.s8     d8, d26, d8                 @ref_main_idx (row 0)
    vadd.s8     d8, d8, d9                  @to compensate the pu1_src idx incremented by 8
    vsub.s8     d9, d8, d29                 @ref_main_idx + 1 (row 0)
    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 0)
    vsub.s8     d7, d28, d6                 @32-fract

    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 0)
    vsub.s8     d4, d8, d29                 @ref_main_idx (row 1)
    vsub.s8     d5, d9, d29                 @ref_main_idx + 1 (row 1)

    vmov.i8     d29, #4

    vtbl.8      d16, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 1)
    vmull.u8    q12, d12, d7                @mul (row 0)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 1)
    vsub.s8     d8, d8, d29                 @ref_main_idx (row 2)
    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 2)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)

    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 2)
    vmull.u8    q11, d16, d7                @mul (row 1)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 2)
    vsub.s8     d4, d4, d29                 @ref_main_idx (row 3)
    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 3)

    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 3)
    vmull.u8    q10, d14, d7                @mul (row 2)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 3)
    vsub.s8     d8, d8, d29                 @ref_main_idx (row 4)
    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 4)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 4)
    vmull.u8    q9, d10, d7                 @mul (row 3)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 4)
    vsub.s8     d4, d4, d29                 @ref_main_idx (row 5)
    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 5)

    vst1.8      d20, [r2], r3               @st (row 2)
    vrshrn.i16  d18, q9, #5                 @round shft (row 3)

    vtbl.8      d16, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 5)
    vmull.u8    q12, d12, d7                @mul (row 4)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 5)
    vsub.s8     d8, d8, d29                 @ref_main_idx (row 6)
    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 6)

    vst1.8      d18, [r2], r3               @st (row 3)
    cmp         r4,#4
    beq         end_func
    vrshrn.i16  d24, q12, #5                @round shft (row 4)

    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 6)
    vmull.u8    q11, d16, d7                @mul (row 5)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 6)
    vsub.s8     d4, d4, d29                 @ref_main_idx (row 7)
    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 7)

    vst1.8      d24, [r2], r3               @st (row 4)
    vrshrn.i16  d22, q11, #5                @round shft (row 5)

    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)
    vmull.u8    q10, d14, d7                @mul (row 6)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 7)
    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d22, [r2], r3               @st (row 5)
    vrshrn.i16  d20, q10, #5                @round shft (row 6)
    vrshrn.i16  d18, q9, #5                 @round shft (row 7)

    vst1.8      d20, [r2], r3               @st (row 6)

    subs        r10, r10, #4                @subtract 8 and go to end if 8x8

    vst1.8      d18, [r2], r3               @st (row 7)

    beq         end_func

    subs        r11, r11, #8                @decrement the processed col
    addgt       r8, r8, #4
    addgt       r2, r2, r7
    movle       r8, r12
    suble       r2, r2, r4
    addle       r2, r2, #8
    movle       r11, r4, lsl #1
    ldrle       r14, col_for_intra_chroma_addr_2
ulbl5:
    addle       r14,r14,pc
    addle       r0, r0, #8

    vld1.8      d31, [r14]!
    vmull.s8    q6, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    vmovn.s16   d10, q6
    vshr.s16    q6, q6, #5
    vqmovn.s16  d11, q6
    vshl.s8     d11, d11, #1
    movw        r5, #0x302                  @idx value for v is +1 of u
    vdup.u16    d27, r5                     @row value inc or reset accordingly
    ldr         r9, [r8]                    @loads index value
    mov         r9, r9, lsl #1
    mov         r5, #22
    sub         r5, r5, r0, lsl #1
    vdup.8      d16, r5
    vdup.8      d26, r9

    mov         r5,r2
    vsub.s8     d11, d11, d27               @ref_main_idx (sub row)

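@ kernel_8_16_32 is a software-pipelined loop: each pass finishes the stores
@ of rows 4-7 of the previously computed 8x8-byte block (through r5) while
@ computing the current block, whose rows 0-3 are stored through r2 and whose
@ rows 4-7 are left for the next pass, reloading the reference window into
@ d0-d3 and stepping to the next 8-byte column strip (or to the next block of
@ eight rows once the strip is complete).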
kernel_8_16_32:
    vmov.i8     d29, #2                     @contains #2 for adding to get ref_main_idx + 1
    vsub.s8     d8, d26, d11                @ref_main_idx
    vmov        d26,d10

    subs        r11, r11, #8
    sub         r6, r1, r9
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)
    vadd.s8     d8, d8, d16                 @to compensate the pu1_src idx incremented by 8

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx - 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    addle       r0, r0, #8
    vsub.s8     d9, d8, d29                 @ref_main_idx - 2
    addgt       r8, r8, #4

    vld1.8      {d0,d1,d2,d3}, [r6]         @stores the 32 values reqd based on indices values (from most idx)
    vrshrn.i16  d22, q11, #5                @round shft (row 5)

    ldrle       r14, col_for_intra_chroma_addr_3
ulbl6:
    addle       r14,r14,pc
    vst1.8      d24, [r5], r3               @st (row 4)
    movle       r8, r12

    movw        r9,#0x302
    vdup.16     d27, r9                     @row value inc or reset accordingly
    vsub.s8     d4, d8, d29                 @ref_main_idx (row 1)

    vsub.s8     d5, d9, d29                 @ref_main_idx - 1 (row 1)
    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 0)
    vmov.i8     d29, #31                    @contains #31 for vand operation

    vmull.u8    q9, d10, d7                 @mul (row 7)
    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 0)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vld1.8      d31, [r14]!
    vand        d6, d29, d26                @fract values in d1/ idx values in d0

    movle       r11, r4, lsl #1
    vmov.i8     d29, #4                     @contains #4 for decrementing ref_main_idx across rows
    ldr         r9, [r8]

    vst1.8      d22, [r5], r3               @(from previous loop)st (row 5)
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vsub.s8     d8, d8, d29                 @ref_main_idx (row 2)
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 1)
    vsub.s8     d9, d9, d29                 @ref_main_idx - 1 (row 2)

    mov         r9,r9,lsl #1
    vsub.s8     d7, d28, d6                 @32-fract

    vmull.u8    q12, d12, d7                @mul (row 0)
    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 1)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vsub.s8     d4, d4, d29                 @ref_main_idx (row 3)
    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 2)
    vsub.s8     d5, d5, d29                 @ref_main_idx - 1 (row 3)

    vmull.u8    q11, d10, d7                @mul (row 1)
    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 2)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)
    vst1.8      d18, [r5], r3               @(from previous loop)st (row 7)

    vsub.s8     d8, d8, d29                 @ref_main_idx (row 4)
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 3)
    vsub.s8     d9, d9, d29                 @ref_main_idx - 1 (row 4)

    vmull.u8    q10, d14, d7                @mul (row 2)
    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 3)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    add         r5,r2,r3,lsl#2
    vmull.s8    q7, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    add         r9, r9, r0, lsl #1

    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vsub.s8     d4, d4, d29                 @ref_main_idx (row 5)
    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 4)
    vsub.s8     d5, d5, d29                 @ref_main_idx - 1 (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 3)
    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 4)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vmovn.s16   d10, q7
    vshr.s16    q7, q7, #5

    vsub.s8     d8, d8, d29                 @ref_main_idx (row 6)
    vtbl.8      d21, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 5)
    vsub.s8     d9, d9, d29                 @ref_main_idx - 1 (row 6)

    vmull.u8    q12, d12, d7                @mul (row 4)
    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 5)
    vqmovn.s16  d11, q7

    vst1.8      d20, [r2], r3               @st (row 2)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vrshrn.i16  d18, q9, #5                 @round shft (row 3)
    vdup.8      d26, r9

    vsub.s8     d4, d4, d29                 @ref_main_idx (row 7)
    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 6)
    vsub.s8     d5, d5, d29                 @ref_main_idx - 1 (row 7)

    mov         r6, #22                     @to compensate the 2*row value
    vshl.u8     d11,#1
    sub         r6, r6, r0, lsl #1

    vmull.u8    q11, d21, d7                @mul (row 5)
    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 6)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vst1.8      d18, [r2], r3               @st (row 3)
    vrshrn.i16  d24, q12, #5                @round shft (row 4)

    add         r2,r2,r3, lsl #2
    vdup.8      d16, r6
    addgt       r2, r7, r2

    suble       r2, r2, r4
    vsub.s8     d11, d11, d27               @ref_main_idx (add row)
    suble       r2,r2,#8

    subs        r10, r10, #4                @subtract 8 and go to end if 8x8

    bne         kernel_8_16_32

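@ epilogue: drains the software pipeline by storing rows 4-7 of the final
@ 8x8-byte block through r5.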
epil_8_16_32:
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vst1.8      d24, [r5], r3               @st (row 4)
    vrshrn.i16  d24, q11, #5                @round shft (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d24, [r5], r3               @(from previous loop)st (row 5)
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vst1.8      d18, [r5], r3               @st (row 7)

end_func:
    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
