1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21//******************************************************************************
22//* @file
23//*  ih264_inter_pred_chroma_av8.s
24//*
25//* @brief
26//*  Contains function definitions for inter prediction  interpolation.
27//*
28//* @author
//*  Ittiam
30//*
31//* @par List of Functions:
32//*
33//*  - ih264_inter_pred_chroma_av8()
34//*
35//* @remarks
36//*  None
37//*
38//*******************************************************************************
39//*/
40
41///* All the functions here are replicated from ih264_inter_pred_filters.c
42//
43
44///**
45///**
46///**
47//
48///**
49//*******************************************************************************
50//*
51//* @brief
52//*    Interprediction chroma filter
53//*
54//* @par Description:
55//*   Applies filtering to chroma samples as mentioned in
56//*    sec 8.4.2.2.2 titled "chroma sample interpolation process"
57//*
58//* @param[in] pu1_src
59//*  UWORD8 pointer to the source containing alternate U and V samples
60//*
61//* @param[out] pu1_dst
62//*  UWORD8 pointer to the destination
63//*
64//* @param[in] src_strd
65//*  integer source stride
66//*
67//* @param[in] dst_strd
68//*  integer destination stride
69//*
//* @param[in] u1_dx
71//*  dx value where the sample is to be produced(refer sec 8.4.2.2.2 )
72//*
//* @param[in] u1_dy
74//*  dy value where the sample is to be produced(refer sec 8.4.2.2.2 )
75//*
76//* @param[in] ht
77//*  integer height of the array
78//*
79//* @param[in] wd
80//*  integer width of the array
81//*
//* @returns
//*  None
84//* @remarks
85//*  None
86//*
87//*******************************************************************************
88//*/
89
90//void ih264_inter_pred_chroma(UWORD8 *pu1_src,
91//                             UWORD8 *pu1_dst,
92//                             WORD32 src_strd,
93//                             WORD32 dst_strd,
94//                             UWORD8 u1_dx,
95//                             UWORD8 u1_dy,
96//                             WORD32 ht,
97//                             WORD32 wd)
98//**************Variables Vs Registers*****************************************
99//    x0 => *pu1_src
100//    x1 => *pu1_dst
101//    x2 =>  src_strd
102//    x3 =>  dst_strd
103//   x4 =>  u1_dx
104//   x5 =>  u1_dy
105//    x6 =>  height
106//    x7 => width
107//
.text
.p2align 2
.include "ih264_neon_macros.s"



    .global ih264_inter_pred_chroma_av8

//------------------------------------------------------------------------------
// void ih264_inter_pred_chroma_av8(UWORD8 *pu1_src, UWORD8 *pu1_dst,
//                                  WORD32 src_strd, WORD32 dst_strd,
//                                  UWORD8 u1_dx,    UWORD8 u1_dy,
//                                  WORD32 ht,       WORD32 wd)
//
// Syntax : GNU as, AArch64 (A64) with Advanced SIMD
// In     : x0 = pu1_src (interleaved U/V bytes), x1 = pu1_dst,
//          w2 = src_strd, w3 = dst_strd, w4 = u1_dx, w5 = u1_dy,
//          w6 = ht, w7 = wd
// Out    : none (filtered samples are written through x1)
// Clobb  : x0-x3, x8-x11, flags, v0-v14, v16-v20, v22, v24, v26-v31
//          (v8-v14 are bracketed by push_v_regs / pop_v_regs)
//
// Each output byte is a 2x2 weighted sum of source bytes:
//   out = ( (8-dx)*(8-dy)*row[n][x]   + dx*(8-dy)*row[n][x+2]
//         + (8-dx)*dy    *row[n+1][x] + dx*dy    *row[n+1][x+2] + 32 ) >> 6
// The "+2" byte offset (ext ..., #2) steps to the next sample of the same
// plane because U and V are interleaved.
//
// Dispatch as implemented here:
//   wd == 2      -> loop_2 : 4 bytes per row, ht == 2 or else 4 rows
//   wd == 4      -> loop_4 : 8 bytes per row, ht == 2, 4 or else 8 rows
//   anything else-> loop_8 : 16 bytes per row, ht == 4 or else 8 rows
// The code is fully unrolled; the loop_* labels are entry points, not loops.
//------------------------------------------------------------------------------
ih264_inter_pred_chroma_av8:

    // v8-v14 are written below; the macro pair is provided by
    // ih264_neon_macros.s (presumably saves/restores d8-d15 - confirm there).
    push_v_regs

    // AAPCS64 leaves the upper 32 bits of a register holding a 32-bit
    // argument unspecified. The strides are used as 64-bit post-index
    // offsets, so they must be sign-extended first.
    sxtw      x2, w2                    // x2 = (int64) src_strd
    sxtw      x3, w3                    // x3 = (int64) dst_strd

    // Bilinear weights. Only the low byte of each product is consumed by
    // dup .8b, so 32-bit arithmetic on volatile registers is sufficient.
    mov       w8, #8
    sub       w8, w8, w4                // w8 = 8 - u1_dx
    mov       w9, #8
    sub       w9, w9, w5                // w9 = 8 - u1_dy
    mul       w10, w8, w9               // (8-dx)*(8-dy)
    mul       w11, w4, w9               // dx*(8-dy)

    dup       v28.8b, w10               // weight: row n,   sample x
    dup       v29.8b, w11               // weight: row n,   sample x+1

    mul       w10, w8, w5               // (8-dx)*dy
    mul       w11, w4, w5               // dx*dy

    dup       v30.8b, w10               // weight: row n+1, sample x
    dup       v31.8b, w11               // weight: row n+1, sample x+1

    cmp       w7, #2                    // wd == 2 ?
    b.eq      loop_2
    cmp       w7, #4                    // wd == 4 ?
    b.eq      loop_4
                                        // otherwise fall through (wd == 8)

//------------------------------------------------------------------------------
// 16 output bytes per row. Each source row is loaded as three 8-byte vectors
// so that the 2-byte-shifted neighbours can be built with ext.
//------------------------------------------------------------------------------
loop_8:
    ld1       {v0.8b, v1.8b, v2.8b}, [x0], x2       // load row 0
    ext       v3.8b, v0.8b, v1.8b, #2
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2       // load row 1
    umull     v20.8h, v0.8b, v28.8b
    ext       v8.8b, v5.8b, v6.8b, #2
    umlal     v20.8h, v3.8b, v29.8b
    ext       v9.8b, v6.8b, v7.8b, #2
    umlal     v20.8h, v5.8b, v30.8b
    ext       v4.8b, v1.8b, v2.8b, #2
    umlal     v20.8h, v8.8b, v31.8b
    sqrshrun  v26.8b, v20.8h, #6                    // (sum + 32) >> 6
    umull     v22.8h, v1.8b, v28.8b
    ld1       {v10.8b, v11.8b, v12.8b}, [x0], x2    // load row 2
    umlal     v22.8h, v4.8b, v29.8b
    ext       v13.8b, v10.8b, v11.8b, #2
    umlal     v22.8h, v6.8b, v30.8b
    ext       v14.8b, v11.8b, v12.8b, #2
    umlal     v22.8h, v9.8b, v31.8b
    sqrshrun  v27.8b, v22.8h, #6
    umull     v24.8h, v5.8b, v28.8b
    st1       {v26.8b, v27.8b}, [x1], x3            // store dst row 0
    umlal     v24.8h, v8.8b, v29.8b
    ld1       {v0.8b, v1.8b, v2.8b}, [x0], x2       // load row 3
    umlal     v24.8h, v10.8b, v30.8b
    ext       v3.8b, v0.8b, v1.8b, #2
    umlal     v24.8h, v13.8b, v31.8b
    ext       v4.8b, v1.8b, v2.8b, #2
    umull     v16.8h, v6.8b, v28.8b
    sqrshrun  v18.8b, v24.8h, #6
    umlal     v16.8h, v9.8b, v29.8b
    umlal     v16.8h, v11.8b, v30.8b
    umlal     v16.8h, v14.8b, v31.8b
    sqrshrun  v19.8b, v16.8h, #6
    st1       {v18.8b, v19.8b}, [x1], x3            // store dst row 1
    umull     v20.8h, v10.8b, v28.8b
    umlal     v20.8h, v13.8b, v29.8b
    umlal     v20.8h, v0.8b, v30.8b
    umlal     v20.8h, v3.8b, v31.8b
    sqrshrun  v26.8b, v20.8h, #6
    umull     v24.8h, v11.8b, v28.8b
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2       // load row 4
    umlal     v24.8h, v14.8b, v29.8b
    ext       v8.8b, v5.8b, v6.8b, #2
    umlal     v24.8h, v1.8b, v30.8b
    ext       v9.8b, v6.8b, v7.8b, #2
    umlal     v24.8h, v4.8b, v31.8b
    umull     v20.8h, v0.8b, v28.8b
    sqrshrun  v27.8b, v24.8h, #6
    umlal     v20.8h, v3.8b, v29.8b
    st1       {v26.8b, v27.8b}, [x1], x3            // store dst row 2
    umlal     v20.8h, v5.8b, v30.8b
    umlal     v20.8h, v8.8b, v31.8b
    umull     v22.8h, v1.8b, v28.8b
    umlal     v22.8h, v4.8b, v29.8b
    umlal     v22.8h, v6.8b, v30.8b
    sqrshrun  v26.8b, v20.8h, #6
    umlal     v22.8h, v9.8b, v31.8b
    sqrshrun  v27.8b, v22.8h, #6
    st1       {v26.8b, v27.8b}, [x1], x3            // store dst row 3

    cmp       w6, #4
    b.eq      end_func                              // done if ht == 4

    ld1       {v10.8b, v11.8b, v12.8b}, [x0], x2    // load row 5
    ext       v13.8b, v10.8b, v11.8b, #2
    umull     v24.8h, v5.8b, v28.8b
    ext       v14.8b, v11.8b, v12.8b, #2
    ld1       {v0.8b, v1.8b, v2.8b}, [x0], x2       // load row 6
    umlal     v24.8h, v8.8b, v29.8b
    umlal     v24.8h, v10.8b, v30.8b
    umlal     v24.8h, v13.8b, v31.8b
    ext       v3.8b, v0.8b, v1.8b, #2
    umull     v16.8h, v6.8b, v28.8b
    sqrshrun  v18.8b, v24.8h, #6
    umlal     v16.8h, v9.8b, v29.8b
    umlal     v16.8h, v11.8b, v30.8b
    umlal     v16.8h, v14.8b, v31.8b
    ext       v4.8b, v1.8b, v2.8b, #2
    sqrshrun  v19.8b, v16.8h, #6
    st1       {v18.8b, v19.8b}, [x1], x3            // store dst row 4
    umull     v20.8h, v10.8b, v28.8b
    umlal     v20.8h, v13.8b, v29.8b
    umlal     v20.8h, v0.8b, v30.8b
    umlal     v20.8h, v3.8b, v31.8b
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2       // load row 7
    sqrshrun  v26.8b, v20.8h, #6
    umull     v24.8h, v11.8b, v28.8b
    umlal     v24.8h, v14.8b, v29.8b
    ext       v8.8b, v5.8b, v6.8b, #2
    umlal     v24.8h, v1.8b, v30.8b
    umlal     v24.8h, v4.8b, v31.8b
    ext       v9.8b, v6.8b, v7.8b, #2
    sqrshrun  v27.8b, v24.8h, #6
    st1       {v26.8b, v27.8b}, [x1], x3            // store dst row 5
    umull     v20.8h, v0.8b, v28.8b
    umlal     v20.8h, v3.8b, v29.8b
    umlal     v20.8h, v5.8b, v30.8b
    umlal     v20.8h, v8.8b, v31.8b
    ld1       {v10.8b, v11.8b, v12.8b}, [x0], x2    // load row 8
    sqrshrun  v26.8b, v20.8h, #6
    umull     v22.8h, v1.8b, v28.8b
    umlal     v22.8h, v4.8b, v29.8b
    umlal     v22.8h, v6.8b, v30.8b
    ext       v13.8b, v10.8b, v11.8b, #2
    umlal     v22.8h, v9.8b, v31.8b
    ext       v14.8b, v11.8b, v12.8b, #2
    sqrshrun  v27.8b, v22.8h, #6
    st1       {v26.8b, v27.8b}, [x1], x3            // store dst row 6
    umull     v24.8h, v5.8b, v28.8b
    umlal     v24.8h, v8.8b, v29.8b
    umlal     v24.8h, v10.8b, v30.8b
    umlal     v24.8h, v13.8b, v31.8b
    umull     v16.8h, v6.8b, v28.8b
    sqrshrun  v18.8b, v24.8h, #6
    umlal     v16.8h, v9.8b, v29.8b
    umlal     v16.8h, v11.8b, v30.8b
    umlal     v16.8h, v14.8b, v31.8b
    sqrshrun  v19.8b, v16.8h, #6
    st1       {v18.8b, v19.8b}, [x1], x3            // store dst row 7
    b         end_func

//------------------------------------------------------------------------------
// 8 output bytes per row. Each source row is loaded as two 8-byte vectors.
//------------------------------------------------------------------------------
loop_4:
    ld1       {v0.8b, v1.8b}, [x0], x2              // load row 0
    ext       v2.8b, v0.8b, v1.8b, #2
    ld1       {v3.8b, v4.8b}, [x0], x2              // load row 1
    ext       v5.8b, v3.8b, v4.8b, #2
    umull     v20.8h, v0.8b, v28.8b
    umlal     v20.8h, v2.8b, v29.8b
    umlal     v20.8h, v3.8b, v30.8b
    umlal     v20.8h, v5.8b, v31.8b
    ld1       {v6.8b, v7.8b}, [x0], x2              // load row 2
    sqrshrun  v26.8b, v20.8h, #6
    ext       v8.8b, v6.8b, v7.8b, #2
    st1       {v26.8b}, [x1], x3                    // store dst row 0
    umull     v22.8h, v3.8b, v28.8b
    umlal     v22.8h, v5.8b, v29.8b
    umlal     v22.8h, v6.8b, v30.8b
    umlal     v22.8h, v8.8b, v31.8b
    sqrshrun  v27.8b, v22.8h, #6
    st1       {v27.8b}, [x1], x3                    // store dst row 1
    cmp       w6, #2
    b.eq      end_func                              // done if ht == 2

    ld1       {v9.8b, v10.8b}, [x0], x2             // load row 3
    ext       v11.8b, v9.8b, v10.8b, #2
    umull     v24.8h, v6.8b, v28.8b
    umlal     v24.8h, v8.8b, v29.8b
    umlal     v24.8h, v9.8b, v30.8b
    umlal     v24.8h, v11.8b, v31.8b
    ld1       {v0.8b, v1.8b}, [x0], x2              // load row 4
    sqrshrun  v16.8b, v24.8h, #6
    ext       v2.8b, v0.8b, v1.8b, #2
    st1       {v16.8b}, [x1], x3                    // store dst row 2
    umull     v18.8h, v9.8b, v28.8b
    umlal     v18.8h, v11.8b, v29.8b
    umlal     v18.8h, v0.8b, v30.8b
    umlal     v18.8h, v2.8b, v31.8b
    sqrshrun  v17.8b, v18.8h, #6
    st1       {v17.8b}, [x1], x3                    // store dst row 3
    cmp       w6, #4
    b.eq      end_func                              // done if ht == 4

    ld1       {v3.8b, v4.8b}, [x0], x2              // load row 5
    ext       v5.8b, v3.8b, v4.8b, #2
    umull     v20.8h, v0.8b, v28.8b
    umlal     v20.8h, v2.8b, v29.8b
    umlal     v20.8h, v3.8b, v30.8b
    umlal     v20.8h, v5.8b, v31.8b
    ld1       {v6.8b, v7.8b}, [x0], x2              // load row 6
    sqrshrun  v26.8b, v20.8h, #6
    ext       v8.8b, v6.8b, v7.8b, #2
    st1       {v26.8b}, [x1], x3                    // store dst row 4
    umull     v22.8h, v3.8b, v28.8b
    umlal     v22.8h, v5.8b, v29.8b
    umlal     v22.8h, v6.8b, v30.8b
    umlal     v22.8h, v8.8b, v31.8b
    ld1       {v9.8b, v10.8b}, [x0], x2             // load row 7
    sqrshrun  v27.8b, v22.8h, #6
    ext       v11.8b, v9.8b, v10.8b, #2
    st1       {v27.8b}, [x1], x3                    // store dst row 5
    umull     v24.8h, v6.8b, v28.8b
    umlal     v24.8h, v8.8b, v29.8b
    umlal     v24.8h, v9.8b, v30.8b
    umlal     v24.8h, v11.8b, v31.8b
    ld1       {v0.8b, v1.8b}, [x0], x2              // load row 8
    sqrshrun  v16.8b, v24.8h, #6
    ext       v2.8b, v0.8b, v1.8b, #2
    st1       {v16.8b}, [x1], x3                    // store dst row 6
    umull     v18.8h, v9.8b, v28.8b
    umlal     v18.8h, v11.8b, v29.8b
    umlal     v18.8h, v0.8b, v30.8b
    umlal     v18.8h, v2.8b, v31.8b
    sqrshrun  v17.8b, v18.8h, #6
    st1       {v17.8b}, [x1], x3                    // store dst row 7
    b         end_func

//------------------------------------------------------------------------------
// 4 output bytes per row. One 8-byte load per source row; ext rotates the
// vector onto itself, and only the low 32-bit lane of each result is stored.
//------------------------------------------------------------------------------
loop_2:
    ld1       {v0.8b}, [x0], x2                     // load row 0
    ext       v2.8b, v0.8b, v0.8b, #2
    ld1       {v3.8b}, [x0], x2                     // load row 1
    ext       v5.8b, v3.8b, v3.8b, #2
    umull     v20.8h, v0.8b, v28.8b
    umlal     v20.8h, v2.8b, v29.8b
    umlal     v20.8h, v3.8b, v30.8b
    umlal     v20.8h, v5.8b, v31.8b
    ld1       {v6.8b}, [x0], x2                     // load row 2
    sqrshrun  v26.8b, v20.8h, #6
    ext       v8.8b, v6.8b, v6.8b, #2
    st1       {v26.s}[0], [x1], x3                  // store dst row 0
    umull     v22.8h, v3.8b, v28.8b
    umlal     v22.8h, v5.8b, v29.8b
    umlal     v22.8h, v6.8b, v30.8b
    umlal     v22.8h, v8.8b, v31.8b
    sqrshrun  v27.8b, v22.8h, #6
    st1       {v27.s}[0], [x1], x3                  // store dst row 1
    cmp       w6, #2
    b.eq      end_func                              // done if ht == 2

    ld1       {v9.8b}, [x0], x2                     // load row 3
    ext       v11.8b, v9.8b, v9.8b, #2
    umull     v24.8h, v6.8b, v28.8b
    umlal     v24.8h, v8.8b, v29.8b
    umlal     v24.8h, v9.8b, v30.8b
    umlal     v24.8h, v11.8b, v31.8b
    ld1       {v0.8b}, [x0], x2                     // load row 4
    sqrshrun  v16.8b, v24.8h, #6
    ext       v2.8b, v0.8b, v0.8b, #2
    st1       {v16.s}[0], [x1], x3                  // store dst row 2
    umull     v18.8h, v9.8b, v28.8b
    umlal     v18.8h, v11.8b, v29.8b
    umlal     v18.8h, v0.8b, v30.8b
    umlal     v18.8h, v2.8b, v31.8b
    sqrshrun  v17.8b, v18.8h, #6
    st1       {v17.s}[0], [x1], x3                  // store dst row 3
                                                    // fall through

end_func:
    pop_v_regs                          // undo push_v_regs from the prologue
    ret
391
392
393