1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
20//* @file
21//*  ihevc_intra_pred_luma_mode_18_34_neon.s
22//*
23//* @brief
24//*  contains function definitions for luma intra prediction modes 18 and 34.
25//* functions are coded in armv8 (aarch64) neon assembly, written in gnu
26//* assembler syntax
27//*
28//*
29//* @author
30//*  yogeswaran rs
31//*
32//* @par list of functions:
33//*
34//*
35//* @remarks
36//*  none
37//*
38//*******************************************************************************
39//*/
40///**
41//*******************************************************************************
42//*
43//* @brief
44//*    luma intra prediction for modes 18 and 34
45//*
46//* @par description:
47//*
48//* @param[in] pu1_ref
49//*  uword8 pointer to the source
50//*
51//* @param[out] pu1_dst
52//*  uword8 pointer to the destination
53//*
54//* @param[in] src_strd
55//*  integer source stride
56//*
57//* @param[in] dst_strd
58//*  integer destination stride
59//*
60//* @param[in] pi1_coeff
61//*  word8 pointer to the planar coefficients (not part of the prototype
61//*  below and not used by this function)
62//*
63//* @param[in] nt
64//*  size of transform block
65//*
66//* @param[in] mode
67//*  intra prediction mode: 0x22 (34) selects the mode 34 path, any other
67//*  value is processed as mode 18
68//*
69//* @returns
70//*
71//* @remarks
72//*  none
73//*
74//*******************************************************************************
75//*/
76
77//void ihevc_intra_pred_luma_mode_18_34(uword8 *pu1_ref,
78//                                      word32 src_strd,
79//                                      uword8 *pu1_dst,
80//                                      word32 dst_strd,
81//                                      word32 nt,
82//                                      word32 mode)
83//
84//**************variables vs registers*****************************************
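85//x0 => *pu1_ref
86//x1 => src_strd (never read; x1 is reused as a loop counter)
87//x2 => *pu1_dst
88//x3 => dst_strd
89//x4 => nt
90//x5 => mode
91//
92//all six arguments arrive in registers; nothing is read from the stack
93//(the prototype has no pi1_coeff argument)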
94
95.text
96.align 4
97.include "ihevc_neon_macros.s"
98
99
100
//------------------------------------------------------------------------------
// ihevc_intra_pred_luma_mode_18_34_av8
//
// Fills an nt x nt destination block by copying runs of reference bytes.
// Every destination row is a contiguous run of pu1_ref; the start of the run
// moves by one byte per row: +1 when mode == 0x22 (34), -1 for any other mode.
// Row 0 starts at pu1_ref + 2*nt + 2 (mode 34) or pu1_ref + 2*nt (otherwise).
//
// in:      x0 = pu1_ref, x2 = pu1_dst, x3 = dst_strd, x4 = nt, x5 = mode
//          x1 (src_strd) is never read; x1 is reused as the tile loop counter.
// scratch: x1, x6, x8, x10, x11, x12, x14, x20, v0-v7, condition flags
//
// nt == 4 takes the mode2_4 path. Every other nt is processed as 8x8 tiles in
// a software-pipelined loop: one tile is held in v0-v7 and stored while the
// next one is being loaded.
// NOTE(review): the tiled path assumes nt is a multiple of 8 - confirm callers.
// NOTE(review): nt, mode and dst_strd are word32 in the C prototype but are
// used here as full x registers without extension - confirm that callers
// always leave bits 63:32 in the expected state.
//------------------------------------------------------------------------------
101.globl ihevc_intra_pred_luma_mode_18_34_av8
102
103.type ihevc_intra_pred_luma_mode_18_34_av8, %function
104
105ihevc_intra_pred_luma_mode_18_34_av8:
106
107    // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments
    // push_v_regs is a macro from ihevc_neon_macros.s (not visible here);
    // presumably it saves the callee-saved neon registers - TODO confirm
108    push_v_regs
    // x20 is used as a scratch register below; x19 is saved/restored but not
    // otherwise referenced in this function
109    stp         x19, x20,[sp,#-16]!
110
111    cmp         x4,#4                       // nt == 4 is handled separately
112    beq         mode2_4
113
114    mov         x11,x4                      // x11 = nt: countdown (step 8); reaching 0 re-seats the source base x0
115    mov         x12,x4                      // x12 = nt: countdown (step 8) of dst columns left in the current band of 8 rows
116    sub         x14,x4,#8                   // x14 = nt - 8
117
118    add         x0,x0,x4,lsl #1             // x0 = pu1_ref + 2*nt
119
120    cmp         x5,#0x22                    // eq <=> mode == 34; these flags feed the csel's below
121    mov         x10,x2                      // x10 = running dst store pointer
122
123    add         x0,x0,#2
124    sub         x20,x0,#2
125    csel        x0, x20, x0,ne              // x0 = pu1_ref + 2*nt + 2 (mode 34) or pu1_ref + 2*nt (otherwise)
126    mov         x20,#1
127    csel        x6, x20, x6,eq
128    mov         x20,#-1
129    csel        x6, x20, x6,ne              // x6 = source step per row: +1 (mode 34) or -1 (otherwise)
130    mov         x8,x0                       // x8 = running source load pointer
131
132prologue_cpy_32:
    // load the first 8x8 tile (8 rows of 8 bytes, source moving by x6 per row)
    // into v0-v7, interleaved with the computation of the loop counter x1
133
134    ld1         {v0.8b},[x8],x6
135    lsr         x1, x4, #3
136    ld1         {v1.8b},[x8],x6
137    mul         x1, x4, x1                  // x1 = nt * (nt >> 3), i.e. 8 per 8x8 tile when nt is a multiple of 8
138    ld1         {v2.8b},[x8],x6
139    ld1         {v3.8b},[x8],x6
140    subs        x1,x1,#8                    // account for the tile being loaded now; eq <=> it is the only tile
141    ld1         {v4.8b},[x8],x6
142    ld1         {v5.8b},[x8],x6
143    ld1         {v6.8b},[x8],x6
144
145    ld1         {v7.8b},[x8],x6
146
147
148    beq         epilogue_mode2              // uses the flags of the subs above (the ld1's in between do not write flags)
149    sub         x11,x11,#8
150
151    cmp         x5,#0x22
152    add         x20,x0,#8
153    csel        x0, x20, x0,ne              // non-34 path: the next tile's source run starts at x0 + 8
154    csel        x8, x0, x8,ne               // non-34 path: restart loads from x0; mode 34 keeps x8, which has already advanced by 8
155    bne         kernel_mode18
156    //add        x8,x0,#8
157
    // mode 34 loop: each pass stores the tile held in v0-v7 and loads the next
    // tile into the same registers (each register is reloaded only after its
    // store has been issued)
158kernel_mode2:
159    st1         {v0.8b},[x10],x3
160    st1         {v1.8b},[x10],x3
161    subs        x12,x12,#8                  // eq <=> the tile being stored is the last one of this band
162    st1         {v2.8b},[x10],x3
163    add         x20,x2,#8
164    csel        x2, x20, x2,ne              // not the last one: next tile starts 8 bytes to the right in dst
165    st1         {v3.8b},[x10],x3
166
167    ld1         {v0.8b},[x8],x6
168    st1         {v4.8b},[x10],x3
169
170    st1         {v5.8b},[x10],x3
171    ld1         {v1.8b},[x8],x6
172    st1         {v6.8b},[x10],x3
173    ld1         {v2.8b},[x8],x6
174    st1         {v7.8b},[x10],x3
175
176    ld1         {v3.8b},[x8],x6
177    sub         x20,x10,x14                 // x10 is now 8 rows below the tile start; x20 = x10 - (nt - 8)
178    csel        x2, x20, x2,eq              // last tile of the band (flags still from subs x12): move to column 0 of the next band
179    ld1         {v4.8b},[x8],x6
180    mov         x10,x2                      // store pointer for the next tile
181    ld1         {v5.8b},[x8],x6
182    csel        x12, x4, x12,eq             // band finished: reload the column countdown
183    ld1         {v6.8b},[x8],x6
184    subs        x11,x11,#8                  // new flags: the three csel's below test x11 == 0
185
186    ld1         {v7.8b},[x8],x6
187
188    add         x20,x0,#8
189    csel        x0, x20, x0,eq              // x11 reached 0: advance the source base by 8 ...
190    csel        x11, x4, x11,eq             // ... reload x11 ...
191    csel        x8, x0, x8,eq               // ... and restart loads from the new base; otherwise x8 just keeps running
192
193    subs        x1, x1, #8                  // one more tile loaded
194
195    bne         kernel_mode2
196
197    b           epilogue_mode2
198
    // loop for every mode other than 34 (source step -1): same store/load
    // pipeline as kernel_mode2, only the source pointer update differs
199kernel_mode18:
200    st1         {v0.8b},[x10],x3
201    st1         {v1.8b},[x10],x3
202    subs        x12,x12,#8                  // eq <=> the tile being stored is the last one of this band
203    st1         {v2.8b},[x10],x3
204    add         x20,x2,#8
205    csel        x2, x20, x2,ne              // not the last one: next tile starts 8 bytes to the right in dst
206    st1         {v3.8b},[x10],x3
207
208    ld1         {v0.8b},[x8],x6
209    st1         {v4.8b},[x10],x3
210
211    st1         {v5.8b},[x10],x3
212    ld1         {v1.8b},[x8],x6
213
214    st1         {v6.8b},[x10],x3
215    ld1         {v2.8b},[x8],x6
216    st1         {v7.8b},[x10],x3
217
218    ld1         {v3.8b},[x8],x6
219    sub         x20,x10,x14                 // x10 is now 8 rows below the tile start; x20 = x10 - (nt - 8)
220    csel        x2, x20, x2,eq              // last tile of the band (flags still from subs x12): move to column 0 of the next band
221    ld1         {v4.8b},[x8],x6
222    mov         x10,x2                      // store pointer for the next tile
223    ld1         {v5.8b},[x8],x6
224    csel        x12, x4, x12,eq             // band finished: reload the column countdown
225    ld1         {v6.8b},[x8],x6
226    subs        x11,x11,#8                  // new flags: the csel's below test x11 == 0
227    ld1         {v7.8b},[x8],x6
228
229    add         x20,x0,#8
230    csel        x0, x20, x0,ne              // x11 != 0: next source base = x0 + 8
231    csel        x11, x4, x11,eq             // x11 reached 0: reload it
232    sub         x20,x8,x14
233    csel        x0, x20, x0,eq              // x11 reached 0: next source base = x8 - (nt - 8)
234    subs        x1, x1, #8                  // one more tile loaded; sets the flags for the bne below
235    mov         x8,x0                       // on this path loads always restart from x0 (mov leaves the flags alone)
236
237    bne         kernel_mode18
238
239
    // store the last tile that was loaded (reached from both kernels, and
    // directly from the prologue when there is only one tile)
240epilogue_mode2:
241
242    st1         {v0.8b},[x10],x3
243    st1         {v1.8b},[x10],x3
244    st1         {v2.8b},[x10],x3
245    st1         {v3.8b},[x10],x3
246    st1         {v4.8b},[x10],x3
247    st1         {v5.8b},[x10],x3
248    st1         {v6.8b},[x10],x3
249    st1         {v7.8b},[x10],x3
250
251    b           end_func
252
    // nt == 4: four rows, each loaded as 8 bytes of which the low 4 are stored
    // NOTE(review): every row reads 8 source bytes although only 4 are used -
    // confirm the reference buffer has room for the extra bytes
253mode2_4:
254
255    add         x0,x0,#10                   // 10 = 2*nt + 2 for nt == 4
256    cmp         x5,#0x22                    // eq <=> mode == 34
257    sub         x20,x0,#2
258    csel        x0, x20, x0,ne              // x0 = pu1_ref + 10 (mode 34) or pu1_ref + 8 (otherwise)
259
260    mov         x20,#1
261    csel        x8, x20, x8,eq
262    mov         x20,#-1
263    csel        x8, x20, x8,ne              // x8 = source step per row: +1 (mode 34) or -1 (otherwise)
264
265    ld1         {v0.8b},[x0],x8
266    st1         {v0.s}[0],[x2],x3           // store the low 4 bytes, advance dst by dst_strd
267
268    ld1         {v0.8b},[x0],x8
269    st1         {v0.s}[0],[x2],x3
270
271    ld1         {v0.8b},[x0],x8
272    st1         {v0.s}[0],[x2],x3
273
274    ld1         {v0.8b},[x0],x8
275    st1         {v0.s}[0],[x2],x3
276
    // common exit: undo the entry sequence in reverse order
277end_func:
278    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
279    ldp         x19, x20,[sp],#16
280    pop_v_regs
281    ret
282
283
284
285
286
287
288
289