@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_filters_dc.s
@*
@* @brief
@*  contains function definitions for intra prediction dc filtering.
@*  functions are coded using neon intrinsics and can be compiled using rvct
@*
@* @author
@*  akshaya mukund
@*
@* @par list of functions:
@*  - ihevc_intra_pred_luma_dc_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  luma intra prediction filter for dc input
@*
@* @par description:
@*  computes the dc value from the left and top reference samples and fills
@*  the nt x nt block with it; for nt < 32 the first row and first column
@*  are additionally smoothed with the adjacent reference samples
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  type of filtering
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_luma_dc(uword8 *pu1_ref,
@                              word32 src_strd,
@                              uword8 *pu1_dst,
@                              word32 dst_strd,
@                              word32 nt,
@                              word32 mode)
@
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #40
@   nt
@   mode

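@**************reference computation******************************************
@a pseudo-c sketch of what the neon code below computes, kept here purely as
@a comment (indexing assumes the usual two-sided reference array used in this
@codebase: pu1_ref[nt .. 2*nt-1] are the left neighbours, bottom to top, and
@pu1_ref[2*nt+1 .. 3*nt] are the top neighbours, left to right):
@
@   dc_val = nt                                             @rounding term
@   for(i = 0; i < nt; i++)
@       dc_val += pu1_ref[nt + i] + pu1_ref[2*nt + 1 + i]
@   dc_val >>= (log2(nt) + 1)
@
@   if(nt < 32)                                             @boundary smoothing
@   {
@       pu1_dst[0] = (pu1_ref[2*nt - 1] + 2*dc_val + pu1_ref[2*nt + 1] + 2) >> 2
@       for(x = 1; x < nt; x++)                             @first row
@           pu1_dst[x] = (pu1_ref[2*nt + 1 + x] + 3*dc_val + 2) >> 2
@       for(y = 1; y < nt; y++)                             @first column
@           pu1_dst[y * dst_strd] = (pu1_ref[2*nt - 1 - y] + 3*dc_val + 2) >> 2
@   }
@   all remaining output samples are set to dc_val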
.text
.align 4




.globl ihevc_intra_pred_luma_dc_a9q

.type ihevc_intra_pred_luma_dc_a9q, %function

ihevc_intra_pred_luma_dc_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r4,[sp,#40]                 @loads nt

@********** testing
    @mov        r6, #128
    @b      prologue_cpy_32
@********** testing

    mov         r11, #2                     @mov #2 to r11 (to be used to add to 2dc_val & 3dc_val)
    mov         r9, #0
    vmov        d17, r11, r9

    clz         r5, r4                      @clz(nt)

    add         r6, r0, r4                  @&src[nt]
    rsb         r5, r5, #32                 @log2nt + 1
    add         r7, r0, r4, lsl #1          @&src[2nt]

    add         r8, r7, #1                  @&src[2nt+1]
    mvn         r5, r5
    add         r5, r5, #1                  @-(log2nt + 1)
    vdup.32     d8, r5

    ldrb        r14, [r8]                   @src[2nt+1]
    vshl.i64    d8, d8, #32

    sub         r9, r7, #1                  @&src[2nt-1]
    vshr.s64    d8, d8, #32                 @sign extend -(log2nt + 1) to 64 bits

    mov         r7, r8                      @r7 also stores 2nt+1

    ldrb        r12, [r9]                   @src[2nt-1]
    add         r14, r14, r12               @src[2nt+1] + src[2nt-1]
    add         r14, r14, r11               @src[2nt+1] + src[2nt-1] + 2

    cmp         r4, #4
    beq         dc_4

    mov         r10, r4                     @nt

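@sum the nt reference samples at &src[nt] and the nt samples at &src[2nt+1],
@eight bytes per iteration, with nt pre-loaded into the accumulator d6 as the
@rounding term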
add_loop:
    vld1.s8     d0, [r6]!                   @load from src[nt]
    mov         r5, #0                      @
    vld1.s8     d1, [r8]!                   @load from src[2nt+1]

    vpaddl.u8   d2, d0

    vmov        d6, r4, r5                  @store nt to accumulate
    vpaddl.u8   d3, d1

    vld1.s8     d0, [r6]!                   @load from src[nt] (extra load for 8)

    vld1.s8     d1, [r8]!                   @load from src[2nt+1] (extra load for 8)
    vadd.u16    d4, d2, d3


    vpaddl.u16  d5, d4


    vpadal.u32  d6, d5                      @accumulate all inp into d6 (end for nt==8)

    subs        r10, #8
    beq         epil_add_loop

core_loop_add:
    vpaddl.u8   d2, d0
    subs        r10, #8
    vpaddl.u8   d3, d1



    vadd.u16    d4, d2, d3
    vld1.s8     d0, [r6]!                   @load from src[nt] (extra load for 16)

    vpaddl.u16  d5, d4
    vld1.s8     d1, [r8]!                   @load from src[2nt+1] (extra load for 16)

    vpadal.u32  d6, d5                      @accumulate all inp into d6
    bne         core_loop_add

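@derive dc_val = sum >> (log2nt + 1) (the sum already includes the rounding
@term nt), then set up dc_val, 2*dc and 3*dc + 2, and the smoothed corner
@dst[0] = (src[2nt-1] + 2*dc + src[2nt+1] + 2) >> 2; nt == 32 skips the
@smoothing and only fills the block with dc_val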
epil_add_loop:

    vshl.s64    d9, d6, d8                  @(dc_val) shr by log2nt+1
    cmp         r4, #32

    vmov        d28, r14, r5                @src[2nt+1]+2+src[2nt-1] moved to d28
    moveq       r6, #128

    vdup.8      d16, d9[0]                  @dc_val
    vshl.s64    d13, d9, #1                 @2*dc

    beq         prologue_cpy_32

    vadd.i64    d14, d13, d28               @src[2nt+1]+2+src[2nt-1]+2dc_val
    movne       r6, #0                      @nt

    vshr.u16    d15, d14, #2                @final dst[0]'s value in d15[0]
    movne       r10, r4

    vadd.i64    d11, d13, d9                @3*dc
    sub         r12, r3, r3, lsl #3         @-7*strd

    vadd.i64    d11, d11, d17               @3*dc + 2
    add         r12, r12, #8                @offset after one 8x8 block (-7*strd + 8)

    vdup.16     q12, d11[0]                 @3*dc + 2 (moved to all lanes)
    sub         r0, r3, r4                  @strd - nt

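@first 8x8 block of the boundary band: widen eight column and eight row
@reference samples, add 3*dc + 2 and shift right by 2; row 0 takes the
@precomputed dst[0] in its first byte, while the filtered first-column
@values are merged into dc-filled rows one at a time below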
prologue_col:
    @0th column and 0-7 rows done here
    @r8 and r9 (2nt+1+col 2nt-1-row)

    mov         r8, r7                      @&src[2nt+1]

    add         r0, r0, #8                  @strd - nt + 8
    vld1.s8     d0, [r8]!                   @col 1::7 load (prol)
    sub         r9, r9, #7                  @&src[2nt-1-row]

    vld1.s8     d1, [r9]                    @row 7::1 (0 also) load (prol)
    sub         r9, r9, #8

    vmovl.u8    q10, d0

    vld1.s8     d6, [r8]                    @col 8::15 load (prol extra)
    vadd.i16    q10, q10, q12               @col 1::7 add 3dc+2 (prol)

    vmovl.u8    q11, d1
    vqshrun.s16 d2, q10, #2                 @columns shr2 movn (prol)

    vmovl.u8    q13, d6
    vadd.i16    q11, q11, q12               @row 1::7 add 3dc+2 (prol)

    vmov.i64    d19, #0x00000000000000ff    @byte mask row 0 (prol)
    vqshrun.s16 d3, q11, #2                 @rows shr2 movn (prol)

    vbsl        d19, d15, d2                @first row with dst[0]
    vadd.i16    q13, q13, q12               @col 8::15 add 3dc+2 (prol extra)

    vrev64.8    d3, d3

    vst1.8      d19, [r2], r3               @store row 0 (prol)
    vshr.s64    d3, d3, #8                  @row 0 shift (prol) (first value to be ignored)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 1 (prol)

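@per 8-row band: d3 holds the filtered first-column values, reversed so the
@current row's value sits in byte 0; each output row is dc_val with that
@byte selected in via vbsl, after which d3 is shifted down by one byte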
loop_again_col_row:

    vbsl        d20, d3, d16                @row 1  (prol)

    vmov.i64    d21, #0x00000000000000ff    @byte mask row 2 (prol)
    vshr.s64    d3, d3, #8                  @row 1 shift (prol)

    vst1.8      d20, [r2], r3               @store row 1 (prol)
    vqshrun.s16 d4, q13, #2                 @columns shr2 movn (prol extra)


    vbsl        d21, d3, d16                @row 2 (prol)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 3 (prol)
    vshr.s64    d3, d3, #8                  @row 2 shift (prol)

    vst1.8      d21, [r2], r3               @store row 2 (prol)


    vbsl        d20, d3, d16                @row 3  (prol)

    vmov.i64    d21, #0x00000000000000ff    @byte mask row 4 (prol)
    vshr.s64    d3, d3, #8                  @row 3 shift (prol)

    vst1.8      d20, [r2], r3               @store row 3 (prol)


    vbsl        d21, d3, d16                @row 4 (prol)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 5 (prol)
    vshr.s64    d3, d3, #8                  @row 4 shift (prol)

    vst1.8      d21, [r2], r3               @store row 4 (prol)


    vbsl        d20, d3, d16                @row 5 (prol)

    vmov.i64    d21, #0x00000000000000ff    @byte mask row 6 (prol)
    vshr.s64    d3, d3, #8                  @row 5 shift (prol)

    vst1.8      d20, [r2], r3               @store row 5 (prol)

    vld1.s8     d1, [r9]                    @row 8::15 load (prol extra)

    vbsl        d21, d3, d16                @row 6 (prol)

    vmovl.u8    q11, d1

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 7 (prol)
    vshr.s64    d3, d3, #8                  @row 6 shift (prol)

    vst1.8      d21, [r2], r3               @store row 6 (prol)

    vbsl        d20, d3, d16                @row 7 (prol)
    vadd.i16    q11, q11, q12               @row 8::15 add 3dc+2 (prol extra)

    vshr.s64    d3, d3, #8                  @row 7 shift (prol)
    vst1.8      d20, [r2], r12              @store row 7 (prol)

    subs        r10, r10, #8                @counter for cols

    beq         end_func
    blt         copy_16


    vmov.i64    d20, #0x00000000000000ff    @byte mask row 9 (prol)
    vqshrun.s16 d3, q11, #2                 @rows shr2 movn (prol)

    vrev64.8    d3, d3

    vst1.8      d4, [r2], r3                @store 2nd col (for 16x16)

    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r0               @go to next row for 16


    vbsl        d20, d3, d16                @row 9  (prol)
    subs        r10, r10, #8

    vst1.8      d20, [r2], r3               @store row 9 (prol)
    vshr.s64    d3, d3, #8                  @row 9 shift (prol)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 9 (prol)

    b           loop_again_col_row


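@remaining rows of the last 8-wide strip need no smoothing, fill with dc_val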
copy_16:
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2]

    b           end_func

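@nt == 32: no boundary smoothing, simply flood the 32x32 block with dc_val;
@four row pointers (r2, r5, r8, r10) each write one 32-byte row as two
@16-byte stores and then step down four rows (r6 = 4*dst_strd - 16 after
@the first store's post-increment of 16)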
prologue_cpy_32:
    mov         r9, #128
    @sub        r7, r3, #-24
    add         r5, r2, r3
    add         r8, r5, r3
    add         r10, r8, r3
    vdup.8      q10, d16[0]
    lsl         r6, r3, #2                  @4*dst_strd
    add         r6, r6, #0xfffffff0         @r6 = 4*dst_strd - 16

    vst1.8      {d20,d21}, [r2]!
    vst1.8      {d20,d21}, [r5]!
    vst1.8      {d20,d21}, [r8]!
    vst1.8      {d20,d21}, [r10]!

    vst1.8      {d20,d21}, [r2], r6
    vst1.8      {d20,d21}, [r5], r6
    vst1.8      {d20,d21}, [r8], r6
    vst1.8      {d20,d21}, [r10], r6

    sub         r9, r9, #32                 @32x32 prol/epil counter dec

kernel_copy:
    vst1.8      {d20,d21}, [r2]!
    vst1.8      {d20,d21}, [r5]!
    vst1.8      {d20,d21}, [r8]!
    vst1.8      {d20,d21}, [r10]!

    vst1.8      {d20,d21}, [r2], r6
    vst1.8      {d20,d21}, [r5], r6
    vst1.8      {d20,d21}, [r8], r6
    vst1.8      {d20,d21}, [r10], r6

    subs        r9, r9, #32

    vst1.8      {d20,d21}, [r2]!
    vst1.8      {d20,d21}, [r5]!
    vst1.8      {d20,d21}, [r8]!
    vst1.8      {d20,d21}, [r10]!

    vst1.8      {d20,d21}, [r2], r6
    vst1.8      {d20,d21}, [r5], r6
    vst1.8      {d20,d21}, [r8], r6
    vst1.8      {d20,d21}, [r10], r6

    bne         kernel_copy

epilogue_copy:
    vst1.8      {d20,d21}, [r2]!
    vst1.8      {d20,d21}, [r5]!
    vst1.8      {d20,d21}, [r8]!
    vst1.8      {d20,d21}, [r10]!

    vst1.8      {d20,d21}, [r2]
    vst1.8      {d20,d21}, [r5]
    vst1.8      {d20,d21}, [r8]
    vst1.8      {d20,d21}, [r10]

    b           end_func


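@nt == 4: same computation as above, but the four left and four top
@reference samples fit in single loads and rows are written as 32-bit stores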
dc_4:
    vld1.s8     d0, [r6]!                   @load from src[nt]
    vld1.s8     d1, [r8]!                   @load from src[2nt+1]

    vpaddl.u8   d2, d0
    mov         r5, #0                      @
    vmov        d6, r4, r5                  @store nt to accumulate
    vpaddl.u8   d3, d1

    vadd.u16    d4, d2, d3


    vpaddl.u16  d5, d4
    vmov.i64    d30, #0x00000000ffffffff

    vand        d5, d5, d30

    vmov        d28, r14, r5                @src[2nt+1]+2+src[2nt-1] moved to d28
    vadd.i64    d6, d6, d5                  @accumulate all inp into d6 (end for nt==4)

    vshl.s64    d9, d6, d8                  @(dc_val) shr by log2nt+1
    mov         r8, r7                      @&src[2nt+1]

    vshl.s64    d13, d9, #1                 @2*dc
    sub         r9, r9, #3                  @&src[2nt-1-row]

    vdup.8      d16, d9[0]                  @dc_val
    vadd.i64    d14, d13, d28               @src[2nt+1]+2+src[2nt-1]+2dc_val

    vshr.u16    d15, d14, #2                @final dst[0]'s value in d15[0]
    sub         r12, r3, r3, lsl #2         @-3*strd
    vadd.i64    d11, d13, d9                @3*dc

    vadd.i64    d11, d11, d17               @3*dc + 2
    add         r12, r12, #4                @offset after one 4x4 block (-3*strd + 4)

    vdup.16     q12, d11[0]                 @3*dc + 2 (moved to all lanes)
    sub         r0, r3, r4                  @strd - nt


    vld1.s8     d0, [r8]                    @col 1::3 load (prol)
    vld1.s8     d1, [r9]                    @row 3::1 (0 also) load (prol)

    vmovl.u8    q10, d0

    vmovl.u8    q11, d1
    vadd.i16    q10, q10, q12               @col 1::3 add 3dc+2 (prol)

    vadd.i16    q11, q11, q12               @row 1::3 add 3dc+2 (prol)

    vmov.i64    d19, #0x00000000000000ff    @byte mask row 0 (prol)
    vqshrun.s16 d2, q10, #2                 @columns shr2 movn (prol)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 1 (prol)
    vqshrun.s16 d3, q11, #2                 @rows shr2 movn (prol)


    vbsl        d19, d15, d2                @first row with dst[0]

    vrev64.8    d3, d3

    vst1.32     d19[0], [r2], r3            @store row 0 (prol)
    vshr.s64    d3, d3, #40                 @row 0 shift (prol) (first value to be ignored)

    vmov.i64    d21, #0x00000000000000ff    @byte mask row 2 (prol)

    vbsl        d20, d3, d16                @row 1  (prol)
    vshr.s64    d3, d3, #8                  @row 1 shift (prol)

    vst1.32     d20[0], [r2], r3            @store row 1 (prol)

    vbsl        d21, d3, d16                @row 2 (prol)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 3 (prol)

    vshr.s64    d3, d3, #8                  @row 2 shift (prol)
    vst1.32     d21[0], [r2], r3            @store row 2 (prol)

    vbsl        d20, d3, d16                @row 3  (prol)
    vst1.32     d20[0], [r2]                @store row 3 (prol)

epilogue_end:
end_func:
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp