1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@ *******************************************************************************
20@ * @file
21@ *  ihevc_padding_neon.s
22@ *
23@ * @brief
24@ *  contains function definitions padding
25@ *
26@ * @author
27@ *  naveen sr
28@ *
@ * @par list of functions:
@ *  - ihevc_pad_left_luma()
@ *  - ihevc_pad_left_chroma()
@ *  - ihevc_pad_right_luma()
@ *  - ihevc_pad_right_chroma()
32@ *
33@ * @remarks
34@ *  none
35@ *
36@ *******************************************************************************
37@*/
38
39@/**
40@*******************************************************************************
41@*
42@* @brief
43@*   padding (luma block) at the left of a 2d array
44@*
45@* @par description:
46@*   the left column of a 2d array is replicated for pad_size times at the left
47@*
48@*
49@* @param[in] pu1_src
50@*  uword8 pointer to the source
51@*
52@* @param[in] src_strd
53@*  integer source stride
54@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] pad_size
@*  integer padding size of the array
69@*
70@* @returns
71@*
72@* @remarks
73@*  none
74@*
75@*******************************************************************************
76@*/
77@.if pad_left_luma == c
78@void ihevc_pad_left_luma(uword8 *pu1_src,
79@                        word32 src_strd,
80@                        word32 ht,
81@                        word32 pad_size)
82@**************variables vs registers*************************
83@   r0 => *pu1_src
84@   r1 => src_strd
85@   r2 => ht
86@   r3 => pad_size
87
.text
.align 4

@-----------------------------------------------------------------------
@ ihevc_pad_left_luma_a9q
@
@ Replicates the byte found at the start of each row into the 80 bytes
@ that begin at (row start - pad_size).
@
@ In:   r0 = pu1_src   (first byte of the first row)
@       r1 = src_strd  (byte distance between rows)
@       r2 = ht        (row count)
@       r3 = pad_size  (only used to locate the destination start)
@
@ Four rows are handled per iteration and the loop repeats until the
@ row counter reaches exactly zero, so ht is expected to be a positive
@ multiple of 4. Each row always receives 5 x 16 = 80 bytes, whatever
@ value r3 holds.
@
@ Uses: r4-r11 (saved and restored here), q0-q3, flags.
@-----------------------------------------------------------------------
.globl ihevc_pad_left_luma_a9q

.type ihevc_pad_left_luma_a9q, %function

ihevc_pad_left_luma_a9q:

    push        {r4-r11, lr}                @ keep callee-saved registers and return address

.Lpad_left_luma_four_rows:
    @ destination pointers for the four rows of this pass
    sub         r4, r0, r3                  @ row 0: src - pad_size
    add         r5, r4, r1                  @ row 1
    add         r6, r5, r1                  @ row 2
    add         r7, r6, r1                  @ row 3

    @ fetch the first pixel of each row; r0 ends up on the next group of rows
    ldrb        r8, [r0]
    add         r0, r0, r1
    ldrb        r9, [r0]
    add         r0, r0, r1
    ldrb        r10, [r0]
    add         r0, r0, r1
    ldrb        r11, [r0]
    add         r0, r0, r1

    @ broadcast each pixel over a full 16-byte vector
    vdup.u8     q0, r8
    vdup.u8     q1, r9
    vdup.u8     q2, r10
    vdup.u8     q3, r11

    @ row 0: 4 post-incremented stores + 1 plain store = 80 bytes
    .rept       4
    vst1.8      {d0, d1}, [r4]!
    .endr
    vst1.8      {d0, d1}, [r4]

    @ row 1
    .rept       4
    vst1.8      {d2, d3}, [r5]!
    .endr
    vst1.8      {d2, d3}, [r5]

    @ row 2
    .rept       4
    vst1.8      {d4, d5}, [r6]!
    .endr
    vst1.8      {d4, d5}, [r6]

    @ row 3: the last store also writes back, which is harmless because
    @ r7 is recomputed on the next pass and restored on exit
    .rept       5
    vst1.8      {d6, d7}, [r7]!
    .endr

    subs        r2, r2, #4                  @ four rows done
    bne         .Lpad_left_luma_four_rows

    pop         {r4-r11, pc}                @ restore registers and return
157
158
159
160
161
162@/**
163@*******************************************************************************
164@*
165@* @brief
166@*   padding (chroma block) at the left of a 2d array
167@*
168@* @par description:
169@*   the left column of a 2d array is replicated for pad_size times at the left
170@*
171@*
172@* @param[in] pu1_src
173@*  uword8 pointer to the source
174@*
175@* @param[in] src_strd
176@*  integer source stride
177@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] pad_size
@*  integer padding size of the array
192@*
193@* @returns
194@*
195@* @remarks
196@*  none
197@*
198@*******************************************************************************
199@*/
200@.if pad_left_chroma == c
201@void ihevc_pad_left_chroma(uword8 *pu1_src,
202@                            word32 src_strd,
203@                            word32 ht,
204@                            word32 pad_size)
205@{
206@   r0 => *pu1_src
207@   r1 => src_strd
208@   r2 => ht
209@   r3 => pad_size
210
211
212
@-----------------------------------------------------------------------
@ ihevc_pad_left_chroma_a9q
@
@ Replicates the 16-bit value found at the start of each row into the
@ 80 bytes that begin at (row start - pad_size). The 16-bit unit is
@ presumably one interleaved chroma pair - confirm against the caller.
@
@ In:   r0 = pu1_src   (first byte of the first row)
@       r1 = src_strd  (byte distance between rows)
@       r2 = ht        (row count)
@       r3 = pad_size  (only used to locate the destination start)
@
@ Four rows are handled per iteration and the loop repeats until the
@ row counter reaches exactly zero, so ht is expected to be a positive
@ multiple of 4. Each row always receives 5 x 16 = 80 bytes, whatever
@ value r3 holds.
@
@ Uses: r4-r11 (saved and restored here), q0-q3, flags.
@-----------------------------------------------------------------------
.globl ihevc_pad_left_chroma_a9q

.type ihevc_pad_left_chroma_a9q, %function

ihevc_pad_left_chroma_a9q:

    push        {r4-r11, lr}                @ keep callee-saved registers and return address

.Lpad_left_chroma_four_rows:
    @ destination pointers for the four rows of this pass
    sub         r4, r0, r3                  @ row 0: src - pad_size
    add         r5, r4, r1                  @ row 1
    add         r6, r5, r1                  @ row 2
    add         r7, r6, r1                  @ row 3

    @ fetch the first halfword of each row; r0 ends up on the next group of rows
    ldrh        r8, [r0]
    add         r0, r0, r1
    ldrh        r9, [r0]
    add         r0, r0, r1
    ldrh        r10, [r0]
    add         r0, r0, r1
    ldrh        r11, [r0]
    add         r0, r0, r1

    @ broadcast each halfword over the eight 16-bit lanes of a vector
    vdup.u16    q0, r8
    vdup.u16    q1, r9
    vdup.u16    q2, r10
    vdup.u16    q3, r11

    @ row 0: 4 post-incremented stores + 1 plain store = 80 bytes
    .rept       4
    vst1.8      {d0, d1}, [r4]!
    .endr
    vst1.8      {d0, d1}, [r4]

    @ row 1
    .rept       4
    vst1.8      {d2, d3}, [r5]!
    .endr
    vst1.8      {d2, d3}, [r5]

    @ row 2
    .rept       4
    vst1.8      {d4, d5}, [r6]!
    .endr
    vst1.8      {d4, d5}, [r6]

    @ row 3: the last store also writes back, which is harmless because
    @ r7 is recomputed on the next pass and restored on exit
    .rept       5
    vst1.8      {d6, d7}, [r7]!
    .endr

    subs        r2, r2, #4                  @ four rows done
    bne         .Lpad_left_chroma_four_rows

    pop         {r4-r11, pc}                @ restore registers and return
276
277
278
279
280
281@/**
282@*******************************************************************************
283@*
284@* @brief
285@* padding (luma block) at the right of a 2d array
286@*
287@* @par description:
288@* the right column of a 2d array is replicated for pad_size times at the right
289@*
290@*
291@* @param[in] pu1_src
292@*  uword8 pointer to the source
293@*
294@* @param[in] src_strd
295@*  integer source stride
296@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] pad_size
@*  integer padding size of the array
311@*
312@* @returns
313@*
314@* @remarks
315@*  none
316@*
317@*******************************************************************************
318@*/
319@.if pad_right_luma == c
320@void ihevc_pad_right_luma(uword8 *pu1_src,
321@                        word32 src_strd,
322@                        word32 ht,
323@                        word32 pad_size)
324@{
325@    word32 row@
326@
327@    for(row = 0@ row < ht@ row++)
328@    {
329@        memset(pu1_src, *(pu1_src -1), pad_size)@
330@
331@        pu1_src += src_strd@
332@    }
333@}
334@
335@   r0 => *pu1_src
336@   r1 => src_strd
337@   r2 => ht
338@   r3 => pad_size
339
340
341
@-----------------------------------------------------------------------
@ ihevc_pad_right_luma_a9q
@
@ For each row, replicates the byte located just before pu1_src
@ (offset -1) into the 80 bytes starting at pu1_src.
@
@ In:   r0 = pu1_src   (first byte to be written in the first row)
@       r1 = src_strd  (byte distance between rows)
@       r2 = ht        (row count)
@       r3 = pad_size  (not read by this routine)
@
@ Four rows are handled per iteration and the loop repeats until the
@ row counter reaches exactly zero, so ht is expected to be a positive
@ multiple of 4. Each row always receives 5 x 16 = 80 bytes.
@
@ Uses: r4-r11 (saved and restored here), q0-q3, flags.
@-----------------------------------------------------------------------
.globl ihevc_pad_right_luma_a9q

.type ihevc_pad_right_luma_a9q, %function

ihevc_pad_right_luma_a9q:

    push        {r4-r11, lr}                @ keep callee-saved registers and return address

.Lpad_right_luma_four_rows:
    mov         r4, r0                      @ row 0 destination

    @ fetch the last pixel of each row; r0 ends up on the next group of rows
    ldrb        r8, [r0, #-1]
    add         r0, r0, r1
    ldrb        r9, [r0, #-1]
    add         r0, r0, r1
    ldrb        r10, [r0, #-1]
    add         r0, r0, r1
    ldrb        r11, [r0, #-1]
    add         r0, r0, r1

    @ broadcast each pixel over a full 16-byte vector
    vdup.u8     q0, r8
    vdup.u8     q1, r9
    vdup.u8     q2, r10
    vdup.u8     q3, r11

    @ Each next-row pointer is derived before the current row's stores
    @ advance the pointer it is based on.
    add         r5, r4, r1                  @ row 1 destination
    .rept       4
    vst1.8      {d0, d1}, [r4]!             @ row 0, bytes 0..63
    .endr
    vst1.8      {d0, d1}, [r4]              @ row 0, bytes 64..79

    add         r6, r5, r1                  @ row 2 destination
    .rept       4
    vst1.8      {d2, d3}, [r5]!             @ row 1, bytes 0..63
    .endr
    vst1.8      {d2, d3}, [r5]              @ row 1, bytes 64..79

    add         r7, r6, r1                  @ row 3 destination
    .rept       4
    vst1.8      {d4, d5}, [r6]!             @ row 2, bytes 0..63
    .endr
    vst1.8      {d4, d5}, [r6]              @ row 2, bytes 64..79

    .rept       4
    vst1.8      {d6, d7}, [r7]!             @ row 3, bytes 0..63
    .endr
    vst1.8      {d6, d7}, [r7]              @ row 3, bytes 64..79

    subs        r2, r2, #4                  @ four rows done
    bne         .Lpad_right_luma_four_rows

    pop         {r4-r11, pc}                @ restore registers and return
406
407
408
409
410
411@/**
412@*******************************************************************************
413@*
414@* @brief
@* padding (chroma block) at the right of a 2d array
@*
@* @par description:
@* the right column of a 2d array is replicated for pad_size times at the right
@*
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] pad_size
@*  integer padding size of the array
441@*
442@* @returns
443@*
444@* @remarks
445@*  none
446@*
447@*******************************************************************************
448@*/
449@.if pad_right_chroma == c
450@void ihevc_pad_right_chroma(uword8 *pu1_src,
451@                        word32 src_strd,
452@                        word32 ht,
453@                        word32 pad_size)
454@   r0 => *pu1_src
455@   r1 => src_strd
456@   r2 => ht
457@   r3 => pad_size
458
459
460
@-----------------------------------------------------------------------
@ ihevc_pad_right_chroma_a9q
@
@ For each row, replicates the 16-bit value located just before pu1_src
@ (offset -2) into the 80 bytes starting at pu1_src. The 16-bit unit is
@ presumably one interleaved chroma pair - confirm against the caller.
@
@ In:   r0 = pu1_src   (first byte to be written in the first row)
@       r1 = src_strd  (byte distance between rows)
@       r2 = ht        (row count)
@       r3 = pad_size  (not read by this routine)
@
@ Four rows are handled per iteration and the loop repeats until the
@ row counter reaches exactly zero, so ht is expected to be a positive
@ multiple of 4. Each row always receives 5 x 16 = 80 bytes.
@
@ Uses: r4-r11 (saved and restored here), q0-q3, flags.
@-----------------------------------------------------------------------
.globl ihevc_pad_right_chroma_a9q

.type ihevc_pad_right_chroma_a9q, %function

ihevc_pad_right_chroma_a9q:

    push        {r4-r11, lr}                @ keep callee-saved registers and return address

.Lpad_right_chroma_four_rows:
    @ destination pointers for the four rows of this pass
    mov         r4, r0                      @ row 0
    add         r5, r4, r1                  @ row 1
    add         r6, r5, r1                  @ row 2
    add         r7, r6, r1                  @ row 3

    @ fetch the last halfword of each row; r0 ends up on the next group of rows
    ldrh        r8, [r0, #-2]
    add         r0, r0, r1
    ldrh        r9, [r0, #-2]
    add         r0, r0, r1
    ldrh        r10, [r0, #-2]
    add         r0, r0, r1
    ldrh        r11, [r0, #-2]
    add         r0, r0, r1

    @ broadcast each halfword over the eight 16-bit lanes of a vector
    vdup.u16    q0, r8
    vdup.u16    q1, r9
    vdup.u16    q2, r10
    vdup.u16    q3, r11

    @ row 0: 4 post-incremented stores + 1 plain store = 80 bytes
    .rept       4
    vst1.8      {d0, d1}, [r4]!
    .endr
    vst1.8      {d0, d1}, [r4]

    @ row 1
    .rept       4
    vst1.8      {d2, d3}, [r5]!
    .endr
    vst1.8      {d2, d3}, [r5]

    @ row 2
    .rept       4
    vst1.8      {d4, d5}, [r6]!
    .endr
    vst1.8      {d4, d5}, [r6]

    @ row 3
    .rept       4
    vst1.8      {d6, d7}, [r7]!
    .endr
    vst1.8      {d6, d7}, [r7]

    subs        r2, r2, #4                  @ four rows done
    bne         .Lpad_right_chroma_four_rows

    pop         {r4-r11, pc}                @ restore registers and return
524
525
526
527
528
529
530
531
532