1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
20//* @file
21//*  ihevc_intra_pred_luma_horz_neon.s
22//*
23//* @brief
24//*  contains function definition for intra prediction  interpolation filters
25//*
26//*
27//* @author
28//*  parthiban v
29//*
30//* @par list of functions:
31//*  - ihevc_intra_pred_luma_horz()
32//*
33//* @remarks
34//*  none
35//*
36//*******************************************************************************
37//*/
38//
39///**
40//*******************************************************************************
41//*
42//* @brief
43//*     intra prediction interpolation filter for horizontal luma variable.
44//*
45//* @par description:
//*      horizontal intraprediction(mode 10) with reference samples location
47//*      pointed by 'pu1_ref' to the tu block  location pointed by 'pu1_dst'  refer
48//*      to section 8.4.4.2.6 in the standard (special case)
49//*
//* @param[in] pu1_ref
//*  uword8 pointer to the reference samples
52//*
53//* @param[out] pu1_dst
54//*  uword8 pointer to the destination
55//*
56//* @param[in] src_strd
57//*  integer source stride
58//*
59//* @param[in] dst_strd
60//*  integer destination stride
61//*
62//* @param[in] nt
63//*  integer transform block size
64//*
65//* @param[in] mode
66//*  integer intraprediction mode
67//*
68//* @returns
69//*
70//* @remarks
71//*  none
72//*
73//*******************************************************************************
74//*/
75//void ihevc_intra_pred_luma_horz(uword8 *pu1_ref,
76//                                word32 src_strd,
77//                                uword8 *pu1_dst,
78//                                word32 dst_strd,
79//                                word32 nt,
80//                                word32 mode)
81//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 =>  src_strd
//x2 => *pu1_dst
//x3 =>  dst_strd
//x4 =>  nt
//x5 =>  mode (not read by this function)
86
87.text
88.align 4
89.include "ihevc_neon_macros.s"
90
91
92
93.globl ihevc_intra_pred_luma_horz_av8
94
95.type ihevc_intra_pred_luma_horz_av8, %function
96
ihevc_intra_pred_luma_horz_av8:
    //-------------------------------------------------------------------------
    // void ihevc_intra_pred_luma_horz_av8(uword8 *pu1_ref, word32 src_strd,
    //                                     uword8 *pu1_dst, word32 dst_strd,
    //                                     word32 nt,       word32 mode)
    // In   : x0 = pu1_ref, w1 = src_strd (unused), x2 = pu1_dst,
    //        w3 = dst_strd, w4 = nt, w5 = mode (unused)
    // Row r of the output is pu1_ref[two_nt - 1 - r] replicated across the row.
    // For nt == 4, 8 and 16, row 0 is additionally filtered:
    //   dst[0][col] = sat_u8(pu1_ref[two_nt - 1] +
    //                        ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1))
    // Any other nt takes the 32-wide path: no filtering, 32 columns per row,
    // 16 rows per loop iteration.
    // Clobbers: x2, x3, x4, x6, x9, x12, x14, flags, and a subset of
    //           v0-v7 / v16-v31. No callee-saved register is touched and the
    //           stack is not used.
    //-------------------------------------------------------------------------

    // word32 arguments arrive in w registers; AAPCS64 leaves the upper 32 bits
    // of the x register unspecified, so widen before using them as 64-bit
    // values (address post-increment and size arithmetic).
    sxtw        x3,w3                       //dst_strd as signed 64-bit
    sxtw        x4,w4                       //nt as signed 64-bit

    lsl         x6,x4,#1                    //two_nt

    add         x12,x0,x6                   //&pu1_ref[two_nt]
    cmp         x4,#4                       //if nt == 4
    beq         core_loop_4

    cmp         x4,#8                       //if nt == 8
    beq         core_loop_8

    cmp         x4,#16                      //if nt == 16
    beq         core_loop_16
    sub         x12,x12,#16                 //&pu1_ref[two_nt - 16]
    add         x9,x2,#16                   //second pointer for columns 16-31

core_loop_32:
    // Each iteration emits 16 rows of 32 bytes. v0.b[15] is the sample for
    // the first row of this group, v0.b[0] the sample for the last one.
    ld1         { v0.16b},[x12]             //load 16 left samples

    dup         v2.16b, v0.b[15]            //row 0 value

    dup         v4.16b, v0.b[14]            //row 1 value
    dup         v6.16b, v0.b[13]            //row 2 value
    st1         { v2.16b},[x2],x3           //row 0, columns 0-15
    st1         { v2.16b},[x9],x3           //row 0, columns 16-31

    dup         v1.16b, v0.b[12]
    st1         { v4.16b},[x2],x3
    st1         { v4.16b},[x9],x3

    dup         v2.16b, v0.b[11]
    st1         { v6.16b},[x2],x3
    st1         { v6.16b},[x9],x3

    dup         v4.16b, v0.b[10]
    st1         { v1.16b},[x2],x3
    st1         { v1.16b},[x9],x3

    dup         v6.16b, v0.b[9]
    st1         { v2.16b},[x2],x3
    st1         { v2.16b},[x9],x3

    dup         v1.16b, v0.b[8]
    st1         { v4.16b},[x2],x3
    st1         { v4.16b},[x9],x3

    dup         v2.16b, v0.b[7]
    st1         { v6.16b},[x2],x3
    st1         { v6.16b},[x9],x3

    dup         v4.16b, v0.b[6]
    st1         { v1.16b},[x2],x3
    st1         { v1.16b},[x9],x3

    dup         v6.16b, v0.b[5]
    st1         { v2.16b},[x2],x3
    st1         { v2.16b},[x9],x3

    dup         v1.16b, v0.b[4]
    st1         { v4.16b},[x2],x3
    st1         { v4.16b},[x9],x3

    dup         v2.16b, v0.b[3]
    st1         { v6.16b},[x2],x3
    st1         { v6.16b},[x9],x3

    dup         v4.16b, v0.b[2]
    st1         { v1.16b},[x2],x3
    st1         { v1.16b},[x9],x3

    dup         v6.16b, v0.b[1]
    st1         { v2.16b},[x2],x3
    st1         { v2.16b},[x9],x3
    sub         x12,x12,#16                 //step back to the next 16 left samples

    dup         v1.16b, v0.b[0]
    st1         { v4.16b},[x2],x3
    st1         { v4.16b},[x9],x3

    subs        x4,x4,#16                   //16 rows done; sets flags for bgt below
    st1         { v6.16b},[x2],x3           //stores do not modify the flags
    st1         { v6.16b},[x9],x3

    st1         { v1.16b},[x2],x3
    st1         { v1.16b},[x9],x3
    bgt         core_loop_32

    ret

core_loop_16:
    ldrb        w14,[x12],#1                //w14 = pu1_ref[two_nt]; x12 -> pu1_ref[two_nt + 1]
    ld1         { v30.8b},[x12],#8          //pu1_ref[two_nt + 1 + col], col 0-7
    ld1         { v31.8b},[x12]             //pu1_ref[two_nt + 1 + col], col 8-15
    sub         x12,x12,#25                 //x12 -> pu1_ref[two_nt - 16]

    dup         v28.8b,w14                  //broadcast pu1_ref[two_nt]
    ld1         { v0.16b},[x12]             //16 left samples; b[15] = pu1_ref[two_nt - 1]
    dup         v26.8b, v0.b[15]
    uxtl        v26.8h, v26.8b              //row 0 base value widened to 16 bit

    dup         v2.16b, v0.b[14]
    usubl       v24.8h, v30.8b, v28.8b      //top[col] - pu1_ref[two_nt], col 0-7

    dup         v4.16b, v0.b[13]
    sshr        v24.8h, v24.8h,#1           //arithmetic >> 1

    dup         v6.16b, v0.b[12]
    sqadd       v22.8h,  v26.8h ,  v24.8h

    dup         v1.16b, v0.b[11]
    sqxtun      v22.8b, v22.8h              //saturate to unsigned 8 bit

    st1         {v22.8b},[x2],#8            //row 0, columns 0-7

    dup         v18.16b, v0.b[10]
    usubl       v24.8h, v31.8b, v28.8b      //top[col] - pu1_ref[two_nt], col 8-15

    dup         v19.16b, v0.b[9]
    sshr        v24.8h, v24.8h,#1

    dup         v20.16b, v0.b[8]
    sqadd       v22.8h,  v26.8h ,  v24.8h

    dup         v16.16b, v0.b[7]
    sqxtun      v22.8b, v22.8h

    st1         {v22.8b},[x2],x3            //row 0, columns 8-15; advance one row
    sub         x2,x2,#8                    //back to column 0 of row 1

    st1         { v2.16b},[x2],x3           //rows 1-15: plain replication

    st1         { v4.16b},[x2],x3
    st1         { v6.16b},[x2],x3
    st1         { v1.16b},[x2],x3

    dup         v2.16b, v0.b[6]
    st1         { v18.16b},[x2],x3

    dup         v4.16b, v0.b[5]
    st1         { v19.16b},[x2],x3

    dup         v6.16b, v0.b[4]
    st1         { v20.16b},[x2],x3

    dup         v1.16b, v0.b[3]
    st1         { v16.16b},[x2],x3

    dup         v18.16b, v0.b[2]
    st1         { v2.16b},[x2],x3

    dup         v19.16b, v0.b[1]
    st1         { v4.16b},[x2],x3

    dup         v20.16b, v0.b[0]
    st1         { v6.16b},[x2],x3

    st1         { v1.16b},[x2],x3
    st1         { v18.16b},[x2],x3
    st1         { v19.16b},[x2],x3
    st1         { v20.16b},[x2],x3

    ret


core_loop_8:
    ldrb        w14,[x12]                   //pu1_ref[two_nt]
    add         x12,x12,#1                  //&pu1_ref[two_nt + 1]
    ld1         {v30.8b},[x12]              //pu1_ref[two_nt + 1 + col]

    sub         x12,x12,#9                  //&pu1_ref[two_nt - 8]
    ld1         {v0.8b},[x12]               //8 left samples; b[7] = pu1_ref[two_nt - 1]
    dup         v26.8b, v0.b[7]
    dup         v28.8b,w14                  //broadcast pu1_ref[two_nt]

    dup         v3.8b, v0.b[6]
    uxtl        v26.8h, v26.8b              //row 0 base value widened to 16 bit

    dup         v4.8b, v0.b[5]
    usubl       v24.8h, v30.8b, v28.8b      //top[col] - pu1_ref[two_nt]

    dup         v5.8b, v0.b[4]
    sshr        v24.8h, v24.8h,#1           //arithmetic >> 1

    dup         v6.8b, v0.b[3]
    sqadd       v22.8h,  v26.8h ,  v24.8h

    dup         v7.8b, v0.b[2]
    sqxtun      v22.8b, v22.8h              //saturate to unsigned 8 bit

    st1         {v22.8b},[x2],x3            //row 0 (filtered)
    st1         {v3.8b},[x2],x3             //rows 1-7: plain replication

    dup         v1.8b, v0.b[1]
    st1         {v4.8b},[x2],x3
    st1         {v5.8b},[x2],x3

    dup         v17.8b, v0.b[0]
    st1         {v6.8b},[x2],x3
    st1         {v7.8b},[x2],x3

    st1         {v1.8b},[x2],x3
    st1         {v17.8b},[x2],x3

    ret


core_loop_4:
    ldrb        w14,[x12]                   //pu1_ref[two_nt]
    add         x12,x12,#1                  //&pu1_ref[two_nt + 1]
    ld1         {v30.8b},[x12]              //pu1_ref[two_nt + 1 + col] (8 bytes read, 4 used)

    sub         x12,x12,#5                  //&pu1_ref[two_nt - 4]
    ld1         {v0.8b},[x12]               //b[3] = pu1_ref[two_nt - 1] ... b[0] = pu1_ref[two_nt - 4]
    dup         v28.8b,w14                  //broadcast pu1_ref[two_nt]
    dup         v26.8b, v0.b[3]
    uxtl        v26.8h, v26.8b              //row 0 base value widened to 16 bit

    dup         v3.8b, v0.b[2]
    usubl       v24.8h, v30.8b, v28.8b      //top[col] - pu1_ref[two_nt]

    dup         v4.8b, v0.b[1]
    sshr        v24.8h, v24.8h,#1           //arithmetic >> 1

    dup         v5.8b, v0.b[0]
    sqadd       v22.8h,  v26.8h ,  v24.8h

    sqxtun      v22.8b, v22.8h              //saturate to unsigned 8 bit

    st1         {v22.s}[0],[x2],x3          //row 0 (filtered), 4 bytes
    st1         {v3.s}[0],[x2],x3           //rows 1-3: plain replication
    st1         {v4.s}[0],[x2],x3
    st1         {v5.s}[0],[x2],x3

    ret
end_func:
355
356
357
358