1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
20//* @file
21//*  ihevc_intra_pred_luma_mode2_neon.s
22//*
//* @brief
//*  contains the function definition for chroma intra prediction mode 2.
//*  the function is hand-written aarch64 neon assembly in gnu assembler
//*  syntax
//*
28//*
29//* @author
30//*  yogeswaran rs
31//*
32//* @par list of functions:
33//*
34//*
35//* @remarks
36//*  none
37//*
38//*******************************************************************************
39//*/
40///**
41//*******************************************************************************
42//*
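//* @brief
//*    chroma intra prediction for mode 2
//*
//* @par description:
//*    copies byte pairs from the reference array into the destination block
//*    in reversed order; each successive destination row starts two bytes
//*    (one byte pair) earlier in the reference array than the row above it
//*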
48//* @param[in] pu1_ref
49//*  uword8 pointer to the source
50//*
51//* @param[out] pu1_dst
52//*  uword8 pointer to the destination
53//*
54//* @param[in] src_strd
55//*  integer source stride
56//*
57//* @param[in] dst_strd
58//*  integer destination stride
59//*
60//* @param[in] pi1_coeff
61//*  word8 pointer to the planar coefficients
62//*
63//* @param[in] nt
64//*  size of tranform block
65//*
66//* @param[in] mode
67//*  type of filtering
68//*
69//* @returns
70//*
71//* @remarks
72//*  none
73//*
74//*******************************************************************************
75//*/
76
//void ihevc_intra_pred_chroma_mode2_av8(uword8 *pu1_ref,
//                                       word32 src_strd,
//                                       uword8 *pu1_dst,
//                                       word32 dst_strd,
//                                       word32 nt,
//                                       word32 mode)
//
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd  (not read; x1 is reused as a scratch counter)
//x2 => *pu1_dst
//x3 => dst_strd
//x4 => nt
//x5 => mode      (not read; x5 is reused as a scratch register)
//
//all six arguments arrive in registers; the function body reads no
//argument from the stack
94
.text
.align 4
.include "ihevc_neon_macros.s"

.globl ihevc_intra_pred_chroma_mode2_av8
.type ihevc_intra_pred_chroma_mode2_av8, %function

//------------------------------------------------------------------------------
// void ihevc_intra_pred_chroma_mode2_av8(uword8 *pu1_ref, word32 src_strd,
//                                        uword8 *pu1_dst, word32 dst_strd,
//                                        word32 nt,       word32 mode)
//
// Target : AArch64, GNU assembler syntax, AAPCS64 argument registers.
//
// In     : x0 = pu1_ref   reference array of interleaved byte pairs
//          w1 = src_strd  not read (x1 is reused as the tile counter)
//          x2 = pu1_dst   destination block of interleaved byte pairs
//          w3 = dst_strd  destination stride in bytes (sign-extended on entry)
//          w4 = nt        block size in byte pairs    (sign-extended on entry)
//          w5 = mode      not read (x5 is reused as 4 * dst_strd)
// Out    : nothing is returned in a register; the block at pu1_dst is written.
//
// What it does:
//   Row r of the destination is filled from the reference array starting
//   2 * r bytes below the row-0 start address, with the order of the byte
//   pairs reversed (ld2 de-interleaves, rev64 reverses each plane, st2 or
//   zip1 re-interleaves).
//     nt == 4 : four 8-byte rows, handled by the straight-line path mode2_4.
//     else    : the block is processed as tiles of 8 rows x 16 bytes with a
//               software-pipelined loop (prologue / kernel / epilogue).
//               The tile counter only reaches zero when nt is a multiple
//               of 8.
//               NOTE(review): the end-of-row rewind steps pu1_dst back by a
//               single 16-byte tile, which matches nt == 16 (two tiles per
//               row) -- confirm callers never pass nt > 16.
//
// Scratch: x0-x2, x5-x12, x14, x20, v0-v31, condition flags.
//          x19/x20 are saved and restored here (only x20 is used; the pair
//          keeps sp 16-byte aligned).
//          v8-v15 are used as temporaries; push_v_regs / pop_v_regs come from
//          ihevc_neon_macros.s and are presumably what preserves them --
//          TODO confirm against the macro file.
//------------------------------------------------------------------------------
ihevc_intra_pred_chroma_mode2_av8:

    push_v_regs
    stp         x19, x20,[sp,#-16]!

    // dst_strd and nt are 32-bit arguments: AAPCS64 leaves bits 63:32 of
    // their registers unspecified, but both are used below as 64-bit
    // operands (address arithmetic and compares), so widen them first.
    sxtw        x3, w3
    sxtw        x4, w4

    mov         x8,#-4                      // post-index step: every load walks back 4 bytes

    cmp         x4,#4
    beq         mode2_4                     // 4x4 blocks use the straight-line path

    add         x0,x0,x4,lsl #2             // x0 = pu1_ref + 4 * nt

    sub         x0,x0,#0x12                 // x0 = pu1_ref + 4 * nt - 18 : source of rows 0,2,4,6
    add         x10,x0,#-2                  // x10 = x0 - 2               : source of rows 1,3,5,7

//------------------------------------------------------------------------------
// Prologue: load and reverse the first 8-row tile and set up the four
// destination row pointers (x6, x7, x9, x14 = rows 0..3 of the tile; each is
// post-incremented by x5 = 4 * dst_strd to reach rows 4..7).
//------------------------------------------------------------------------------
prologue_cpy_32:

    ld2         {v0.8b, v1.8b},[x0],x8      // row 0: even bytes -> v0, odd bytes -> v1

    mov         x11,x4                      // x11 = pairs left in the current destination row
    rev64       v16.8b,  v0.8b
    rev64       v17.8b,  v1.8b

    ld2         {v2.8b, v3.8b},[x10],x8     // row 1
    mov         x6, x2                      // x6 = row 0 of the tile

    ld2         {v4.8b, v5.8b},[x0],x8      // row 2
    ld2         {v6.8b, v7.8b},[x10],x8     // row 3
    lsr         x1, x4, #3                  // x1 = nt / 8

    ld2         {v8.8b, v9.8b},[x0],x8      // row 4
    ld2         {v10.8b, v11.8b},[x10],x8   // row 5
    ld2         {v12.8b, v13.8b},[x0],x8    // row 6
    mul         x1, x4, x1                  // x1 = nt * (nt / 8) = 8 * number of tiles

    ld2         {v14.8b, v15.8b},[x10],x8   // row 7
    add         x7,x6,x3                    // x7 = row 1 of the tile

    rev64       v18.8b,  v2.8b
    rev64       v19.8b,  v3.8b
    lsl         x5, x3, #2                  // x5 = 4 * dst_strd (row n -> row n + 4)

    rev64       v20.8b,  v4.8b
    rev64       v21.8b,  v5.8b
    add         x9,x7,x3                    // x9 = row 2 of the tile

    rev64       v22.8b,  v6.8b
    rev64       v23.8b,  v7.8b

    rev64       v24.8b,  v8.8b
    rev64       v25.8b,  v9.8b

    rev64       v26.8b,  v10.8b
    subs        x1,x1,#8                    // one tile is now loaded

    rev64       v27.8b,  v11.8b

    rev64       v28.8b,  v12.8b
    rev64       v29.8b,  v13.8b

    rev64       v30.8b,  v14.8b
    add         x14,x9,x3                   // x14 = row 3 of the tile
    rev64       v31.8b,  v15.8b

    beq         epilogue_mode2              // single tile (nt == 8): just store it

    sub         x12,x4,#8                   // x12 = pairs left in the current source row of tiles

//------------------------------------------------------------------------------
// Kernel: store the tile held in v16-v31 while loading and reversing the
// next one. Two flag-producing instructions drive the pointer updates:
//   subs x11 -> destination side (gt: next tile in the row, le: next row of tiles)
//   subs x12 -> source side      (le: re-base x0 for the next row of tiles)
// No flag-setting instruction sits between each subs and its csel users.
//------------------------------------------------------------------------------
kernel_mode2:

    st2         {v16.8b, v17.8b},[x6],x5    // row 0
    st2         {v18.8b, v19.8b},[x7],x5    // row 1
    subs        x11,x11,#8
    st2         {v20.8b, v21.8b},[x9],x5    // row 2
    st2         {v22.8b, v23.8b},[x14],x5   // row 3
    st2         {v24.8b, v25.8b},[x6],x5    // row 4
    add         x20,x2,#16
    csel        x2, x20, x2,gt              // more tiles in this row: move 16 bytes right
    st2         {v26.8b, v27.8b},[x7],x5    // row 5
    st2         {v28.8b, v29.8b},[x9],x5    // row 6
    st2         {v30.8b, v31.8b},[x14],x5   // row 7

    ld2         {v0.8b, v1.8b},[x0],x8
    csel        x11, x4, x11,le             // row of tiles finished: reload the row counter

    ld2         {v2.8b, v3.8b},[x10],x8
    ld2         {v4.8b, v5.8b},[x0],x8
    ld2         {v6.8b, v7.8b},[x10],x8
    rev64       v16.8b,  v0.8b

    ld2         {v8.8b, v9.8b},[x0],x8
    ld2         {v10.8b, v11.8b},[x10],x8
    sub         x20, x6,#16                 // x6 is already 8 rows down after its two stores
    csel        x2, x20, x2,le              // row of tiles finished: 8 rows down, 16 bytes left
    ld2         {v12.8b, v13.8b},[x0],x8
    rev64       v17.8b,  v1.8b
    ld2         {v14.8b, v15.8b},[x10],x8

    subs        x12,x12,#8
    mov         x6, x2                      // row 0 of the next tile
    add         x20, x0, x4,lsl #1
    csel        x0, x20, x0,le              // source row finished: x0 += 2 * nt ...
    add         x7, x6, x3

    rev64       v18.8b,  v2.8b
    sub         x20, x0, #16
    csel        x0, x20, x0,le              // ... then x0 -= 16
    rev64       v19.8b,  v3.8b

    rev64       v20.8b,  v4.8b
    csel        x12, x4, x12,le             // source row finished: reload its counter
    rev64       v21.8b,  v5.8b

    rev64       v22.8b,  v6.8b
    add         x9, x7, x3
    rev64       v23.8b,  v7.8b

    rev64       v24.8b,  v8.8b
    add         x10,x0,#-2                  // odd-row source pointer always trails x0 by 2
    rev64       v25.8b,  v9.8b

    rev64       v26.8b,  v10.8b
    subs        x1, x1, #8                  // one more tile loaded
    rev64       v27.8b,  v11.8b

    rev64       v28.8b,  v12.8b
    rev64       v29.8b,  v13.8b

    rev64       v30.8b,  v14.8b
    add         x14, x9, x3
    rev64       v31.8b,  v15.8b

    bne         kernel_mode2

//------------------------------------------------------------------------------
// Epilogue: store the last tile that was loaded and reversed.
//------------------------------------------------------------------------------
epilogue_mode2:

    st2         {v16.8b, v17.8b},[x6],x5
    st2         {v18.8b, v19.8b},[x7],x5
    st2         {v20.8b, v21.8b},[x9],x5
    st2         {v22.8b, v23.8b},[x14],x5
    st2         {v24.8b, v25.8b},[x6],x5
    st2         {v26.8b, v27.8b},[x7],x5
    st2         {v28.8b, v29.8b},[x9],x5
    st2         {v30.8b, v31.8b},[x14],x5

    b           end_func

//------------------------------------------------------------------------------
// nt == 4: four rows of 8 bytes each.
// Per row: ld2 splits the bytes into two planes; shl #32 moves the first four
// bytes of a plane to the top of the 64-bit lane so that rev64 leaves them,
// reversed, in the low four bytes; zip1 re-interleaves the low four bytes of
// both planes into the 8-byte row that is stored.
//------------------------------------------------------------------------------
mode2_4:

    lsl         x12,x4,#1
    add         x0,x0,x12
    sub         x0,x0,#2                    // x0 = pu1_ref + 2 * nt - 2

    ld2         {v12.8b, v13.8b},[x0],x8    // row 0 source; x0 steps back 4 bytes
    shl         d0, d12,#32
    add         x10,x0,#2                   // x10 = row 0 source - 2 = row 1 source
    shl         d1, d13,#32

    rev64       v0.8b,  v0.8b
    ld2         {v14.8b, v15.8b},[x10],x8   // row 1 source
    shl         d2, d14,#32

    rev64       v1.8b,  v1.8b
    shl         d3, d15,#32
    zip1        v0.8b, v0.8b, v1.8b
    st1         {v0.8b},[x2],x3             // row 0

    rev64       v2.8b,  v2.8b
    ld2         {v16.8b, v17.8b},[x0],x8    // row 2 source
    shl         d4, d16,#32
    rev64       v3.8b,  v3.8b
    shl         d5, d17,#32
    zip1        v2.8b, v2.8b, v3.8b
    rev64       v4.8b,  v4.8b
    rev64       v5.8b,  v5.8b
    st1         {v2.8b},[x2],x3             // row 1


    ld2         {v18.8b, v19.8b},[x10],x8   // row 3 source
    shl         d6, d18,#32

    zip1        v4.8b, v4.8b, v5.8b
    shl         d7, d19,#32
    rev64       v6.8b,  v6.8b
    st1         {v4.8b},[x2],x3             // row 2

    rev64       v7.8b,  v7.8b
    zip1        v6.8b, v6.8b, v7.8b
    st1         {v6.8b},[x2],x3             // row 3

end_func:
    ldp         x19, x20,[sp],#16
    pop_v_regs
    ret
307
308
309
310
311
312
313