@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_chroma_mode2_neon.s
@*
@* @brief
@*  contains the function definition for chroma intra prediction mode 2.
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*  - ihevc_intra_pred_chroma_mode2_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    chroma intra prediction for mode 2
@*
@* @par description:
@*    copies the reference samples in reverse order into the destination,
@*    shifting by one chroma pixel per row (uv interleaved)
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source (reference samples)
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  type of filtering
@*
@* note: pi1_coeff (planar coefficients) is documented in sibling files but
@* is not part of this function's prototype and is not used here
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
76
@void ihevc_intra_pred_chroma_mode2(uword8 *pu1_ref,
@                                   word32 src_strd,
@                                   uword8 *pu1_dst,
@                                   word32 dst_strd,
@                                   word32 nt,
@                                   word32 mode)
@
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #40 (after the 10-register push in the prologue)
@   nt
@   mode

.text
.align 4




.globl ihevc_intra_pred_chroma_mode2_a9q

.type ihevc_intra_pred_chroma_mode2_a9q, %function

@-----------------------------------------------------------------------------
@ ihevc_intra_pred_chroma_mode2_a9q
@
@ Intra prediction mode 2 for interleaved-UV chroma.  The reference array is
@ read backwards in de-interleaved form (vld2.8 splits U into even d-regs,
@ V into odd d-regs), each 8-sample run is byte-reversed (vrev64.8), and the
@ result is written back re-interleaved (vst2.8).  Two load pointers, one
@ UV pair (2 bytes) apart, produce the even and odd destination rows; each
@ load steps back 4 bytes (r8), i.e. two chroma pixels, so consecutive loads
@ from the same pointer are two destination rows apart.
@
@ Register roles on entry (AAPCS):
@   r0 = pu1_ref   (reference samples, UV interleaved)
@   r1 = src_strd  (not used for addressing; reused as a loop counter)
@   r2 = pu1_dst
@   r3 = dst_strd
@   [sp,#40] = nt  (transform size), after the 10-register push below
@-----------------------------------------------------------------------------

ihevc_intra_pred_chroma_mode2_a9q:

    stmfd       sp!, {r4-r12, r14}          @push r4-r12,lr (10 regs = 40 bytes)

    ldr         r4,[sp,#40]                 @r4 = nt
    mov         r8,#-4                      @post-index load step: back 4 bytes
                                            @= 2 UV pairs per load

    cmp         r4,#4                       @nt == 4 takes the small-block path
    beq         mode2_4

    add         r0,r0,r4,lsl #2             @r0 = pu1_ref + 4*nt

    sub         r0,r0,#0x12                 @back 18 bytes -> src[1]
    add         r10,r0,#-2                  @r10 trails r0 by one UV pair and
                                            @feeds the odd destination rows

prologue_cpy_32:

    @ Load eight 8-pixel de-interleaved runs (one per destination row of the
    @ first 8x8 tile): r0 feeds even rows, r10 odd rows.

    vld2.8      {d0,d1},[r0],r8             @row 0: d0 = U, d1 = V

    mov         r11,r4                      @r11 = column counter for the strip
    vrev64.8    d16,d0                      @byte-reverse: mode 2 walks the
    vrev64.8    d17,d1                      @reference backwards

    vld2.8      {d2,d3},[r10],r8            @row 1
    mov         r6, r2                      @r6/r7/r9/r14 = 4 dst row pointers

    vld2.8      {d4,d5},[r0],r8             @row 2
    vld2.8      {d6,d7},[r10],r8            @row 3
    lsr         r1, r4, #3                  @r1 = nt/8 (tiles per strip)

    vld2.8      {d8,d9},[r0],r8             @row 4
    vld2.8      {d10,d11},[r10],r8          @row 5
    vld2.8      {d12,d13},[r0],r8           @row 6
    mul         r1, r4, r1                  @r1 = nt*(nt/8): 8 counts per tile

    vld2.8      {d14,d15},[r10],r8          @row 7
    add         r7,r6,r3                    @r7 = dst + dst_strd

    vrev64.8    d18,d2
    vrev64.8    d19,d3
    lsl         r5, r3, #2                  @r5 = 4*dst_strd: each dst pointer
                                            @stores rows n and n+4

    vrev64.8    d20,d4
    vrev64.8    d21,d5
    add         r9,r7,r3                    @r9 = dst + 2*dst_strd

    vrev64.8    d22,d6
    vrev64.8    d23,d7

    vrev64.8    d24,d8
    vrev64.8    d25,d9

    vrev64.8    d26,d10
    subs        r1,r1,#8                    @account for this 8-row tile

    vrev64.8    d27,d11

    vrev64.8    d28,d12
    vrev64.8    d29,d13

    vrev64.8    d30,d14
    add         r14,r9,r3                   @r14 = dst + 3*dst_strd
    vrev64.8    d31,d15

    beq         epilogue_mode2              @nt == 8: single tile, no kernel

    sub         r12,r4,#8                   @r12 = columns left after this tile

kernel_mode2:

    @ Steady state (reached only when more than one tile exists): store the
    @ tile prepared in q8-q15 while loading and reversing the next one.

    vst2.8      {d16,d17},[r6],r5           @rows 0 and 4
    vst2.8      {d18,d19},[r7],r5           @rows 1 and 5
    subs        r11,r11,#8                  @8 columns of the strip consumed
    vst2.8      {d20,d21},[r9],r5           @rows 2 and 6
    vst2.8      {d22,d23},[r14],r5          @rows 3 and 7
    vst2.8      {d24,d25},[r6],r5
    addgt       r2,r2,#16                   @strip not done: next tile is 8
                                            @chroma pixels (16 bytes) right
    vst2.8      {d26,d27},[r7],r5
    vst2.8      {d28,d29},[r9],r5
    vst2.8      {d30,d31},[r14],r5

    vld2.8      {d0,d1},[r0],r8
    movle       r11,r4                      @strip done: reset column counter

    vld2.8      {d2,d3},[r10],r8
    vld2.8      {d4,d5},[r0],r8
    addle       r2, r2, r3, lsl #2
    vld2.8      {d6,d7},[r10],r8
    vrev64.8    d16,d0

    vld2.8      {d8,d9},[r0],r8
    vld2.8      {d10,d11},[r10],r8
    suble       r2, r6,#16                  @dst -> start of next 8-row strip
    vld2.8      {d12,d13},[r0],r8
    vrev64.8    d17,d1
    vld2.8      {d14,d15},[r10],r8

    subs        r12,r12,#8
    mov         r6, r2                      @rebuild the 4 dst row pointers
    addle       r0, r0, r4,lsl #1           @strip done: move the reference
    add         r7, r6, r3                  @pointer to the next strip

    vrev64.8    d18,d2
    suble       r0, r0, #16
    vrev64.8    d19,d3

    vrev64.8    d20,d4
    movle       r12,r4                      @reset per-strip load counter
    vrev64.8    d21,d5

    vrev64.8    d22,d6
    add         r9, r7, r3
    vrev64.8    d23,d7

    vrev64.8    d24,d8
    add         r10,r0,#-2                  @re-derive the odd-row pointer
    vrev64.8    d25,d9

    vrev64.8    d26,d10
    subs        r1, r1, #8                  @one more tile accounted for
    vrev64.8    d27,d11

    vrev64.8    d28,d12
    vrev64.8    d29,d13

    vrev64.8    d30,d14
    add         r14, r9, r3
    vrev64.8    d31,d15

    bne         kernel_mode2                @until all nt*(nt/8) counts drain

epilogue_mode2:

    @ Store the final tile held in q8-q15.

    vst2.8      {d16,d17},[r6],r5
    vst2.8      {d18,d19},[r7],r5
    vst2.8      {d20,d21},[r9],r5
    vst2.8      {d22,d23},[r14],r5
    vst2.8      {d24,d25},[r6],r5
    vst2.8      {d26,d27},[r7],r5
    vst2.8      {d28,d29},[r9],r5
    vst2.8      {d30,d31},[r14],r5

    b           end_func

mode2_4:

    @ nt == 4: four 4-pixel rows.  Each vld2.8 de-interleaves 8 UV pairs;
    @ vshl.i64 #32 moves the 4 wanted bytes to the top half so vrev64.8
    @ leaves them reversed in the low half, and vzip.8 re-interleaves U/V
    @ into one 8-byte (4-pixel) output row.

    lsl         r12,r4,#1
    add         r0,r0,r12                   @r0 = pu1_ref + 2*nt
    sub         r0,r0,#2                    @back one UV pair

    vld2.8      {d12,d13},[r0],r8           @rows 0/2 source (r8 = -4)
    vshl.i64    d0,d12,#32
    add         r10,r0,#2                   @odd-row pointer, one pair behind
    vshl.i64    d1,d13,#32

    vrev64.8    d0,d0
    vld2.8      {d14,d15},[r10],r8          @row 1 source
    vshl.i64    d2,d14,#32

    vrev64.8    d1,d1
    vshl.i64    d3,d15,#32
    vzip.8      d0,d1                       @re-interleave U/V
    vst1.8      {d0},[r2],r3                @row 0

    vrev64.8    d2,d2
    vld2.8      {d16,d17},[r0],r8           @row 2 source
    vshl.i64    d4,d16,#32
    vrev64.8    d3,d3
    vshl.i64    d5,d17,#32
    vzip.8      d2,d3
    vrev64.8    d4,d4
    vrev64.8    d5,d5
    vst1.8      {d2},[r2],r3                @row 1


    vld2.8      {d18,d19},[r10],r8          @row 3 source
    vshl.i64    d6,d18,#32

    vzip.8      d4,d5
    vshl.i64    d7,d19,#32
    vrev64.8    d6,d6
    vst1.8      {d4},[r2],r3                @row 2

    vrev64.8    d7,d7
    vzip.8      d6,d7
    vst1.8      {d6},[r2],r3                @row 3

end_func:
    ldmfd       sp!,{r4-r12,r15}            @restore r4-r12 and return
                                            @(pops saved lr into pc)
294
295
296
297
298
299
300