1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
20//* @file
21//*  ihevc_intra_pred_filters_vert.s
22//*
23//* @brief
//*  contains the function definition for luma intra prediction in vertical
//*  mode. the function is coded in aarch64 (armv8) neon assembly using gnu
//*  assembler syntax
28//*
29//* @author
30//*  akshaya mukund
31//*
//* @par list of functions:
//*  - ihevc_intra_pred_luma_ver_av8()
34//*
35//* @remarks
36//*  none
37//*
38//*******************************************************************************
39//*/
40///**
41//*******************************************************************************
42//*
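//* @brief
//*    luma intra prediction filter for vertical mode
//*
//* @par description:
//*    every row of the nt x nt destination block is a copy of the nt
//*    reference samples starting at pu1_ref[2nt+1]. in the nt = 4, 8 and 16
//*    paths the first pixel of each row is replaced by
//*    clip_u8(pu1_ref[2nt+1] + ((pu1_ref[2nt-1-row] - pu1_ref[2nt]) >> 1));
//*    the nt = 32 path is a plain copy
//*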
48//* @param[in] pu1_ref
49//*  uword8 pointer to the source
50//*
51//* @param[out] pu1_dst
52//*  uword8 pointer to the destination
53//*
54//* @param[in] src_strd
55//*  integer source stride
56//*
57//* @param[in] dst_strd
58//*  integer destination stride
59//*
//* @param[in] nt
//*  size of transform block
//*
//* @param[in] mode
//*  type of filtering (not read by this implementation)
65//*
66//* @returns
67//*
68//* @remarks
69//*  none
70//*
71//*******************************************************************************
72//*/
73
//void ihevc_intra_pred_luma_ver_av8(uword8* pu1_ref,
//                                   word32 src_strd,
//                                   uword8* pu1_dst,
//                                   word32 dst_strd,
//                                   word32 nt,
//                                   word32 mode)
80//
81//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd
//x2 => *pu1_dst
//x3 => dst_strd
//x4 => nt
//x5 => mode (all six arguments arrive in registers under aapcs64; nothing
//           is read from the stack)
90
.text
.align 4
.include "ihevc_neon_macros.s"



.globl ihevc_intra_pred_luma_ver_av8

.type ihevc_intra_pred_luma_ver_av8, %function

//------------------------------------------------------------------------------
// void ihevc_intra_pred_luma_ver_av8(uword8 *pu1_ref, word32 src_strd,
//                                    uword8 *pu1_dst, word32 dst_strd,
//                                    word32 nt,       word32 mode)
//
// abi:    aapcs64 (aarch64), gnu assembler syntax
// in:     x0 = pu1_ref                (reference samples, called src[] below)
//         w1 = src_strd               (never read)
//         x2 = pu1_dst
//         w3 = dst_strd               (sign-extended to x3 on entry)
//         w4 = nt                     (sign-extended to x4 on entry)
//         w5 = mode                   (never read, x5 is reused as scratch)
// out:    nt rows written at pu1_dst, dst_strd bytes apart
// clobb:  x2-x6, x8, x10-x12, flags,
//         v0-v4, v6, v7, v16-v26, v28, v30
// stack:  16 bytes (x19/x20 are saved and restored; they are not referenced
//         anywhere else in this function)
//
// dispatch on nt:
//   nt  > 16 : copy_32 - 32 bytes src[2nt+1 ...] are copied to every row
//              (row count is driven by nt, row width is fixed at 32 bytes)
//   nt == 16 : blk_16  - row = src[2nt+1+col], col 0:15, with column 0
//              replaced by the filtered pixel described below
//   nt  < 16 : blk_4_8 - same as blk_16 on 8-byte rows; nt == 4 branches to
//              blk_4 (4 rows, 4 bytes each), every other value writes 8 rows
//              of 8 bytes
//
// filtered column 0 for row r:
//   sat_u8( src[2nt+1] + ((src[2nt-1-r] - src[2nt]) >> 1) )
// the per-row pixels are kept packed in d24/d25 (row r in byte r); byte 0 is
// merged into the row with bsl through a 0x..00ff mask and the register is
// then shifted right by 8 bits to expose the next row.
//------------------------------------------------------------------------------
ihevc_intra_pred_luma_ver_av8:

    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments

    stp         x19, x20,[sp,#-16]!

    // word32 arguments: aapcs64 leaves bits 63:32 of the register unspecified,
    // so widen them before they are used as 64-bit counters / address offsets
    sxtw        x3, w3                      //dst_strd
    sxtw        x4, w4                      //nt

    lsl         x5, x4, #1                  //2nt

    cmp         x4, #16
    beq         blk_16
    blt         blk_4_8

    add         x5, x5, #1                  //2nt+1
    add         x6, x0, x5                  //&src[2nt+1]

copy_32:
    // x2, x5, x8, x10 -> four consecutive dst rows. each row is written as
    // 16 bytes (post-inc 16) + 16 bytes (post-inc x11 = 4*stride - 16), so
    // every pointer ends up 4 rows further down.
    add         x5, x2, x3
    ld1         {v20.8b, v21.8b}, [x6],#16  //16 loads (col 0:15)
    add         x8, x5, x3

    add         x10, x8, x3
    ld1         {v22.8b, v23.8b}, [x6]      //16 loads (col 16:31)
    lsl         x11, x3, #2

    sub         x11, x11, #16
    st1         {v20.8b, v21.8b}, [x2],#16  //rows 0:3, col 0:15
    st1         {v20.8b, v21.8b}, [x5],#16
    st1         {v20.8b, v21.8b}, [x8],#16
    st1         {v20.8b, v21.8b}, [x10],#16

    st1         {v22.8b, v23.8b}, [x2], x11 //rows 0:3, col 16:31
    st1         {v22.8b, v23.8b}, [x5], x11
    st1         {v22.8b, v23.8b}, [x8], x11
    st1         {v22.8b, v23.8b}, [x10], x11

    subs        x4, x4, #8                  //4 rows above + 4 rows after the loop

kernel_copy_32:
    // 8 rows per iteration; st1 does not touch the flags, so the subs in the
    // middle still feeds the bne at the bottom
    st1         {v20.8b, v21.8b}, [x2],#16
    st1         {v20.8b, v21.8b}, [x5],#16
    st1         {v20.8b, v21.8b}, [x8],#16
    st1         {v20.8b, v21.8b}, [x10],#16

    st1         {v22.8b, v23.8b}, [x2], x11
    st1         {v22.8b, v23.8b}, [x5], x11
    st1         {v22.8b, v23.8b}, [x8], x11
    st1         {v22.8b, v23.8b}, [x10], x11

    subs        x4, x4, #8

    st1         {v20.8b, v21.8b}, [x2],#16
    st1         {v20.8b, v21.8b}, [x5],#16
    st1         {v20.8b, v21.8b}, [x8],#16
    st1         {v20.8b, v21.8b}, [x10],#16

    st1         {v22.8b, v23.8b}, [x2], x11
    st1         {v22.8b, v23.8b}, [x5], x11
    st1         {v22.8b, v23.8b}, [x8], x11
    st1         {v22.8b, v23.8b}, [x10], x11

    bne         kernel_copy_32

    st1         {v20.8b, v21.8b}, [x2],#16  //last 4 rows
    st1         {v20.8b, v21.8b}, [x5],#16
    st1         {v20.8b, v21.8b}, [x8],#16
    st1         {v20.8b, v21.8b}, [x10],#16

    st1         {v22.8b, v23.8b}, [x2], x11
    st1         {v22.8b, v23.8b}, [x5], x11
    st1         {v22.8b, v23.8b}, [x8], x11
    st1         {v22.8b, v23.8b}, [x10], x11

    b           end_func

blk_16:
    add         x6, x0, x5                  //&src[2nt]

    ldrb        w11, [x6], #1               //src[2nt], x6 -> &src[2nt+1]
    sxtw        x11,w11                     //value is 0..255 after ldrb, x11 is unchanged

    dup         v22.16b,w11                 //src[2nt]
    ldrb        w12, [x6]                   //src[2nt+1]
    sxtw        x12,w12                     //value is 0..255 after ldrb, x12 is unchanged

    ld1         {v16.8b, v17.8b}, [x6]      //ld for repl to cols src[2nt+1+col(0:15)] (0 ignored for stores)
    sub         x6, x6, #17                 //x6 -> &src[2nt-16] = &src[2nt-1-row(15)]

    dup         v24.16b,w12                 //src[2nt+1] (v24 is rewritten by sqxtun below)
    dup         v30.8h,w12                  //src[2nt+1] in 16-bit lanes
    lsl         x5, x3, #3                  //8*stride

    ld1         {v26.16b}, [x6],#16         //load src[2nt-1-row](rows 0:15), byte i holds row 15-i
    add         x5, x2, x5                  //x5 -> dst row 8

    movi        d18, #0x00000000000000ff    //bsl mask: take byte 0 from the first source
    uhsub       v26.16b,  v26.16b ,  v22.16b //(src[2nt-1-row] - src[2nt])>>1
    //vsubl.u8    q0, d26, d22
    //vsubl.u8    q14, d27, d22

    //vshr.s16    q0, q0, #1
    //vshr.s16    q14, q14, #1

    mov         v19.d[0],v17.d[0]           //cols 8:15 for the rows stored from v18
    //vaddl.s8    q0, d24, d26
    sxtl        v0.8h, v26.8b               //halved differences as signed 16-bit, rows 15:8
    sxtl2       v28.8h, v26.16b             //rows 7:0
    sqadd       v0.8h,  v0.8h ,  v30.8h     //+ src[2nt+1]
    sqadd       v28.8h,  v28.8h ,  v30.8h

    movi        d3, #0x00000000000000ff
    //vaddl.s8    q1, d25, d27

    sqxtun      v24.8b, v28.8h              //saturate to u8: rows 7:0 in the low half
    sqxtun2     v24.16b, v0.8h              //rows 15:8 in the high half
    //vmovn.u16    d25, q0
    //vmovn.u16    d24, q1

    rev64       v24.16b,  v24.16b           //low half: byte j = row j, high half: byte j = row 8+j
    mov         v25.d[0], v24.d[1]          //d25 = rows 8:15

    mov         v4.d[0],v17.d[0]            //cols 8:15 for the rows stored from v3

    bsl         v18.8b,  v24.8b ,  v16.8b   //only select row values from v24(predpixel)
    bsl         v3.8b,  v25.8b ,  v16.8b

    movi        d1, #0x00000000000000ff
    mov         v2.d[0],v17.d[0]

    movi        d6, #0x00000000000000ff
    mov         v7.d[0],v17.d[0]

    st1         {v18.8b, v19.8b}, [x2], x3  //row 0
    sshr        d24, d24,#8                 //next row's pixel moves into byte 0

    st1         {v3.8b, v4.8b}, [x5], x3    //row 8
    sshr        d25, d25,#8


    bsl         v1.8b,  v24.8b ,  v16.8b
    bsl         v6.8b,  v25.8b ,  v16.8b

    st1         {v1.8b, v2.8b}, [x2], x3    //row 1
    sshr        d24, d24,#8

    st1         {v6.8b, v7.8b}, [x5], x3    //row 9
    sshr        d25, d25,#8

    subs        x4, x4,#8                   //4 rows done above + 4 rows after the loop

    movi        d18, #0x00000000000000ff    //bsl overwrote the masks, rebuild them
    //vmov.i64    d19, d17

    movi        d3, #0x00000000000000ff
    //vmov.i64    d11, d17


loop_16:
    // 4 rows per iteration: two through x2 (upper half of the block) and two
    // through x5 (lower half)

    movi        d1, #0x00000000000000ff

    movi        d6, #0x00000000000000ff

    bsl         v18.8b,  v24.8b ,  v16.8b   //only select row values from v24(predpixel)
    bsl         v3.8b,  v25.8b ,  v16.8b

    st1         {v18.8b, v19.8b}, [x2], x3
    sshr        d24, d24,#8

    st1         {v3.8b, v4.8b}, [x5], x3
    sshr        d25, d25,#8

    movi        d18, #0x00000000000000ff

    movi        d3, #0x00000000000000ff

    bsl         v1.8b,  v24.8b ,  v16.8b
    bsl         v6.8b,  v25.8b ,  v16.8b

    st1         {v1.8b, v2.8b}, [x2], x3
    sshr        d24, d24,#8

    st1         {v6.8b, v7.8b}, [x5], x3
    sshr        d25, d25,#8

    subs        x4, x4, #4

    bne         loop_16

    movi        d1, #0x00000000000000ff

    movi        d6, #0x00000000000000ff

    bsl         v18.8b,  v24.8b ,  v16.8b   //only select row values from v24(predpixel)
    bsl         v3.8b,  v25.8b ,  v16.8b

    st1         {v18.8b, v19.8b}, [x2], x3
    sshr        d24, d24,#8

    st1         {v3.8b, v4.8b}, [x5], x3
    sshr        d25, d25,#8

    bsl         v1.8b,  v24.8b ,  v16.8b
    bsl         v6.8b,  v25.8b ,  v16.8b

    st1         {v1.8b, v2.8b}, [x2], x3    //row 7

    st1         {v6.8b, v7.8b}, [x5], x3    //row 15

    b           end_func


blk_4_8:
    movi        d4, #0x00000000000000ff     //bsl mask: take byte 0 from the first source
    add         x6, x0, x5                  //&src[2nt]

    movi        d3, #0x00000000000000ff
    ldrb        w11, [x6], #1               //src[2nt], x6 -> &src[2nt+1]
    sxtw        x11,w11                     //value is 0..255 after ldrb, x11 is unchanged

    dup         v22.8b,w11                  //src[2nt]
    ldrb        w12, [x6]                   //src[2nt+1]
    sxtw        x12,w12                     //value is 0..255 after ldrb, x12 is unchanged

    ld1         {v16.8b},[x6]               //ld for repl to cols src[2nt+1+col(0:3 or 0:7)](0 ignored for st)
    sub         x6, x6, #9                  //x6 -> &src[2nt-8] = &src[2nt-1-row(7)]

    dup         v24.8b,w12                  //src[2nt+1] (v24 is rewritten by sqxtun below)
    dup         v30.8h,w12                  //src[2nt+1] in 16-bit lanes

    ld1         {v26.8b},[x6],#8            //load src[2nt-1-row](rows 0:7), byte i holds row 7-i

    movi        d18, #0x00000000000000ff
    uhsub       v26.8b,  v26.8b ,  v22.8b   //(src[2nt-1-row] - src[2nt])>>1
    //vsubl.u8    q13, d26, d22

    //vshr.s16    q13, q13, #1

    movi        d19, #0x00000000000000ff
    sxtl        v26.8h, v26.8b              //halved differences as signed 16-bit
    //vaddl.s8    q0, d24, d26
    sqadd       v0.8h,  v26.8h ,  v30.8h    //+ src[2nt+1]

    sqxtun      v24.8b, v0.8h               //saturate to u8
    //vmovn.s16    d24, q0

    rev64       v24.8b,  v24.8b             //byte j = row j

    cmp         x4, #4
    beq         blk_4

    // 8x8: rows 0:7, one 8-byte store per row
    bsl         v18.8b,  v24.8b ,  v16.8b   //only select row values from v24(predpixel)

    st1         {v18.8b},[x2], x3
    sshr        d24, d24,#8                 //next row's pixel moves into byte 0

    movi        d18, #0x00000000000000ff    //rebuild the mask for its second use (row 4)

    bsl         v19.8b,  v24.8b ,  v16.8b

    st1         {v19.8b},[x2], x3
    sshr        d24, d24,#8

    movi        d19, #0x00000000000000ff

    bsl         v3.8b,  v24.8b ,  v16.8b

    st1         {v3.8b},[x2], x3
    sshr        d24, d24,#8

    movi        d3, #0x00000000000000ff

    bsl         v4.8b,  v24.8b ,  v16.8b

    st1         {v4.8b},[x2], x3
    sshr        d24, d24,#8

    movi        d4, #0x00000000000000ff

    bsl         v18.8b,  v24.8b ,  v16.8b   //only select row values from v24(predpixel)

    st1         {v18.8b},[x2], x3
    sshr        d24, d24,#8

    bsl         v19.8b,  v24.8b ,  v16.8b

    st1         {v19.8b},[x2], x3
    sshr        d24, d24,#8

    bsl         v3.8b,  v24.8b ,  v16.8b

    st1         {v3.8b},[x2], x3
    sshr        d24, d24,#8

    bsl         v4.8b,  v24.8b ,  v16.8b

    st1         {v4.8b},[x2], x3
    sshr        d24, d24,#8

    b           end_func


blk_4:
    // 4x4: rows 0:3, only the low 4 bytes of each row register are stored
    bsl         v18.8b,  v24.8b ,  v16.8b   //only select row values from v24(predpixel)

    st1         {v18.s}[0],[x2], x3
    sshr        d24, d24,#8

    bsl         v19.8b,  v24.8b ,  v16.8b

    st1         {v19.s}[0],[x2], x3
    sshr        d24, d24,#8

    bsl         v3.8b,  v24.8b ,  v16.8b

    st1         {v3.s}[0],[x2], x3
    sshr        d24, d24,#8

    bsl         v4.8b,  v24.8b ,  v16.8b
    st1         {v4.s}[0],[x2], x3


end_func:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16

    ret
428
429
430
431
432
433