1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20
21///*
22////----------------------------------------------------------------------------
23//// File Name            : impeg2_format_conv.s
24////
//// Description          : This file has the YUV420P to YUV420SP (UV and VU
////                        interleaved) format conversion routines for the
////                        neon platform.
27////
28//// Reference Document   :
29////
30//// Revision History     :
31////      Date            Author                  Detail Description
32////   ------------    ----------------    ----------------------------------
33////   Jul 07, 2008     Naveen Kumar T                Created
34////
35////-------------------------------------------------------------------------
36//*/
37
38///*
39//// ----------------------------------------------------------------------------
40//// Include Files
41//// ----------------------------------------------------------------------------
42//*/
// Assemble-time constants: log2(16) = 4 and log2(2) = 1.
// Neither symbol is referenced by the code visible in this file.
// NOTE(review): presumably kept for the included macros or for legacy
// reasons - confirm before removing.
.set log2_16                    ,      4
.set log2_2                     ,      1

// Everything below is emitted into the code section.
.text
// Shared macro definitions. push_v_regs / pop_v_regs used by the routines in
// this file are not defined here, so they are expected to come from this
// include (its contents are not visible from this file).
.include "impeg2_neon_macros.s"
48///*
49//// ----------------------------------------------------------------------------
50//// Struct/Union Types and Define
51//// ----------------------------------------------------------------------------
52//*/
53
54///*
55//// ----------------------------------------------------------------------------
56//// Static Global Data section variables
57//// ----------------------------------------------------------------------------
58//*/
59////--------------------------- NONE --------------------------------------------
60
61///*
62//// ----------------------------------------------------------------------------
63//// Static Prototype Functions
64//// ----------------------------------------------------------------------------
65//*/
66//// -------------------------- NONE --------------------------------------------
67
68///*
69//// ----------------------------------------------------------------------------
70//// Exported functions
71//// ----------------------------------------------------------------------------
72//*/
73
74
///*****************************************************************************
//*
//*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8()
//*
//*  Description      : Converts an image from YUV420P (three planes) to
//*                     YUV420SP (luma plane + interleaved chroma plane).
//*                     Luma is copied row by row; chroma is written as
//*                     U0 V0 U1 V1 ... (UV interleaved). When convert_uv_only
//*                     is 1 the luma copy is skipped.
//*
//*  Arguments        : x0          pu1_y
//*                     x1          pu1_u
//*                     x2          pu1_v
//*                     x3          pu1_dest_y
//*                     x4          pu1_dest_uv
//*                     w5          u2_height
//*                     w6          u2_width
//*                     w7          u2_stridey
//*                     sp, #80     u2_strideu
//*                     sp, #88     u2_stridev
//*                     sp, #96     u2_dest_stride_y
//*                     sp, #104    u2_dest_stride_uv
//*                     sp, #112    convert_uv_only
//*                     (stack offsets are relative to sp after the prologue
//*                      below; they assume push_v_regs pushes 64 bytes -
//*                      the macro is defined in impeg2_neon_macros.s)
//*
//*  Values Returned  : None
//*
//*  Register Usage   : x8, x9, x10, x14, x16, v0, v1
//*
//*  Stack Usage      : 80 Bytes
//*
//*  Interruptibility : Interruptible
//*
//*  Known Limitations
//*       Assumptions: Image Width:     Must be even and greater than or equal
//*                                      to 16. Widths that are not a multiple
//*                                      of 16 are handled by re-copying an
//*                                      overlapping final vector per row.
//*                     Image Height:    Assumed to be even.
//*
//*  Revision History :
//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
//*         07 06 2010   Varshita        Draft
//*         07 06 2010   Naveen Kr T     Completed
//*
//*****************************************************************************/
.global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8
impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8:

    //// Prologue. x19/x20 are no longer used by this routine; the push is
    //// kept so that the stack-argument offsets stay exactly as documented.
    push_v_regs
    stp             x19, x20, [sp, #-16]!

    //// height/width arrive as 32-bit values (the stack arguments are read
    //// as 32-bit too); the upper halves of x5/x6 are unspecified by the
    //// procedure call standard, so clear them before 64-bit arithmetic.
    uxtw            x5, w5              //// x5 = height
    uxtw            x6, w6              //// x6 = width
    cbz             x5, yuv420sp_uv_exit //// Zero rows: nothing to convert

    ldr             w14, [sp, #112]     //// Load convert_uv_only
    cmp             w14, #1
    beq             yuv420sp_uv_chroma  //// Skip the luma copy

    ///* Luma: preprocessing before the main loops start */
    ldr             w8, [sp, #96]       //// u2_dest_stride_y (ldr w zero-extends)
    uxtw            x7, w7              //// u2_stridey, same treatment as above

    sub             x7, x7, x6          //// Source increment      = stride - width
    sub             x8, x8, x6          //// Destination increment = stride - width

    //// Count luma rows in a scratch register: x5 must still hold the full
    //// height when the chroma stage halves it.
    mov             x9, x5

yuv420sp_uv_row_loop_y:
    mov             x16, x6             //// x16 = bytes left in this row

yuv420sp_uv_col_loop_y:
    prfm            pldl1keep, [x0, #128]
    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16
    sub             x16, x16, #16
    cmp             x16, #15
    bgt             yuv420sp_uv_col_loop_y

    cbz             x16, yuv420sp_uv_row_loop__y
    //// Width is not a multiple of 16: step both pointers back by
    //// (16 - remaining) so one more full 16-byte vector ends exactly at the
    //// end of the row. Ex: for width 162 the loop above handles 160 pixels;
    //// the pointers are moved back to pixel 146 and 16 bytes are copied.
    neg             x16, x16
    add             x16, x16, #16       //// x16 = 16 - remaining
    sub             x0, x0, x16
    sub             x3, x3, x16

    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16

yuv420sp_uv_row_loop__y:
    add             x0, x0, x7          //// Advance to the next source row
    add             x3, x3, x8          //// Advance to the next destination row
    subs            x9, x9, #1
    bgt             yuv420sp_uv_row_loop_y

yuv420sp_uv_chroma:
    ldrsw           x7, [sp, #80]       //// u2_strideu (sign-extended)
    ldrsw           x10, [sp, #88]      //// u2_stridev (sign-extended)
    ldrsw           x8, [sp, #104]      //// u2_dest_stride_uv (sign-extended)

    sub             x7, x7, x6, lsr #1  //// U source increment = strideu - width/2
    sub             x10, x10, x6, lsr #1 //// V source increment = stridev - width/2
    sub             x8, x8, x6          //// Destination increment = stride - width

    lsr             x6, x6, #1          //// Chroma width  = width / 2
    lsr             x5, x5, #1          //// Chroma height = height / 2
    cbz             x5, yuv420sp_uv_exit //// No chroma rows to convert

yuv420sp_uv_row_loop_uv:
    mov             x16, x6             //// x16 = chroma samples left in this row

yuv420sp_uv_col_loop_uv:
    prfm            pldl1keep, [x1, #128]
    prfm            pldl1keep, [x2, #128]

    ld1             {v0.8b}, [x1], #8   //// 8 U samples
    ld1             {v1.8b}, [x2], #8   //// 8 V samples
    st2             {v0.8b, v1.8b}, [x4], #16 //// Interleave: U0 V0 U1 V1 ...

    sub             x16, x16, #8
    cmp             x16, #7
    bgt             yuv420sp_uv_col_loop_uv

    cbz             x16, yuv420sp_uv_row_loop__uv
    //// Chroma width is not a multiple of 8: step the sources back by
    //// (8 - remaining) samples and the interleaved destination back by twice
    //// that, then convert one more overlapping group of 8 U/V pairs.
    neg             x16, x16
    add             x16, x16, #8        //// x16 = 8 - remaining
    sub             x1, x1, x16
    sub             x2, x2, x16
    sub             x4, x4, x16, lsl #1

    ld1             {v0.8b}, [x1], #8
    ld1             {v1.8b}, [x2], #8
    st2             {v0.8b, v1.8b}, [x4], #16

yuv420sp_uv_row_loop__uv:
    add             x1, x1, x7          //// Next U row
    add             x2, x2, x10         //// Next V row
    add             x4, x4, x8          //// Next interleaved destination row
    subs            x5, x5, #1
    bgt             yuv420sp_uv_row_loop_uv

yuv420sp_uv_exit:
    ////POP THE REGISTERS
    ldp             x19, x20, [sp], #16
    pop_v_regs
    ret
238
239
240
241
242
///*****************************************************************************
//*
//*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8()
//*
//*  Description      : Converts an image from YUV420P (three planes) to
//*                     YUV420SP (luma plane + interleaved chroma plane).
//*                     Luma is copied row by row; chroma is written as
//*                     V0 U0 V1 U1 ... (VU interleaved). When convert_uv_only
//*                     is 1 the luma copy is skipped.
//*                     Same structure as
//*                     impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8; the only
//*                     difference is which vector register the U and V
//*                     samples are loaded into before the interleaving store.
//*
//*  Arguments        : x0          pu1_y
//*                     x1          pu1_u
//*                     x2          pu1_v
//*                     x3          pu1_dest_y
//*                     x4          pu1_dest_uv
//*                     w5          u2_height
//*                     w6          u2_width
//*                     w7          u2_stridey
//*                     sp, #80     u2_strideu
//*                     sp, #88     u2_stridev
//*                     sp, #96     u2_dest_stride_y
//*                     sp, #104    u2_dest_stride_uv
//*                     sp, #112    convert_uv_only
//*                     (stack offsets are relative to sp after the prologue
//*                      below; they assume push_v_regs pushes 64 bytes -
//*                      the macro is defined in impeg2_neon_macros.s)
//*
//*  Values Returned  : None
//*
//*  Register Usage   : x8, x9, x10, x14, x16, v0, v1
//*
//*  Stack Usage      : 80 Bytes
//*
//*  Interruptibility : Interruptible
//*
//*  Known Limitations
//*       Assumptions: Image Width:     Must be even and greater than or equal
//*                                      to 16. Widths that are not a multiple
//*                                      of 16 are handled by re-copying an
//*                                      overlapping final vector per row.
//*                     Image Height:    Assumed to be even.
//*
//*  Revision History :
//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
//*         07 06 2010   Varshita        Draft
//*         07 06 2010   Naveen Kr T     Completed
//*
//*****************************************************************************/

.global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8
impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8:

    //// Prologue. x19/x20 are no longer used by this routine; the push is
    //// kept so that the stack-argument offsets stay exactly as documented.
    push_v_regs
    stp             x19, x20, [sp, #-16]!

    //// height/width arrive as 32-bit values (the stack arguments are read
    //// as 32-bit too); the upper halves of x5/x6 are unspecified by the
    //// procedure call standard, so clear them before 64-bit arithmetic.
    uxtw            x5, w5              //// x5 = height
    uxtw            x6, w6              //// x6 = width
    cbz             x5, yuv420sp_vu_exit //// Zero rows: nothing to convert

    ldr             w14, [sp, #112]     //// Load convert_uv_only
    cmp             w14, #1
    beq             yuv420sp_vu_chroma  //// Skip the luma copy

    ///* Luma: preprocessing before the main loops start */
    ldr             w8, [sp, #96]       //// u2_dest_stride_y (ldr w zero-extends)
    uxtw            x7, w7              //// u2_stridey, same treatment as above

    sub             x7, x7, x6          //// Source increment      = stride - width
    sub             x8, x8, x6          //// Destination increment = stride - width

    //// Count luma rows in a scratch register: x5 must still hold the full
    //// height when the chroma stage halves it.
    mov             x9, x5

yuv420sp_vu_row_loop_y:
    mov             x16, x6             //// x16 = bytes left in this row

yuv420sp_vu_col_loop_y:
    prfm            pldl1keep, [x0, #128]
    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16
    sub             x16, x16, #16
    cmp             x16, #15
    bgt             yuv420sp_vu_col_loop_y

    cbz             x16, yuv420sp_vu_row_loop__y
    //// Width is not a multiple of 16: step both pointers back by
    //// (16 - remaining) so one more full 16-byte vector ends exactly at the
    //// end of the row. Ex: for width 162 the loop above handles 160 pixels;
    //// the pointers are moved back to pixel 146 and 16 bytes are copied.
    neg             x16, x16
    add             x16, x16, #16       //// x16 = 16 - remaining
    sub             x0, x0, x16
    sub             x3, x3, x16

    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16

yuv420sp_vu_row_loop__y:
    add             x0, x0, x7          //// Advance to the next source row
    add             x3, x3, x8          //// Advance to the next destination row
    subs            x9, x9, #1
    bgt             yuv420sp_vu_row_loop_y

yuv420sp_vu_chroma:
    ldrsw           x7, [sp, #80]       //// u2_strideu (sign-extended)
    ldrsw           x10, [sp, #88]      //// u2_stridev (sign-extended)
    ldrsw           x8, [sp, #104]      //// u2_dest_stride_uv (sign-extended)

    sub             x7, x7, x6, lsr #1  //// U source increment = strideu - width/2
    sub             x10, x10, x6, lsr #1 //// V source increment = stridev - width/2
    sub             x8, x8, x6          //// Destination increment = stride - width

    lsr             x6, x6, #1          //// Chroma width  = width / 2
    lsr             x5, x5, #1          //// Chroma height = height / 2
    cbz             x5, yuv420sp_vu_exit //// No chroma rows to convert

yuv420sp_vu_row_loop_uv:
    mov             x16, x6             //// x16 = chroma samples left in this row

yuv420sp_vu_col_loop_uv:
    prfm            pldl1keep, [x1, #128]
    prfm            pldl1keep, [x2, #128]

    ld1             {v1.8b}, [x1], #8   //// 8 U samples -> second lane of the pair
    ld1             {v0.8b}, [x2], #8   //// 8 V samples -> first lane of the pair
    st2             {v0.8b, v1.8b}, [x4], #16 //// Interleave: V0 U0 V1 U1 ...

    sub             x16, x16, #8
    cmp             x16, #7
    bgt             yuv420sp_vu_col_loop_uv

    cbz             x16, yuv420sp_vu_row_loop__uv
    //// Chroma width is not a multiple of 8: step the sources back by
    //// (8 - remaining) samples and the interleaved destination back by twice
    //// that, then convert one more overlapping group of 8 V/U pairs.
    neg             x16, x16
    add             x16, x16, #8        //// x16 = 8 - remaining
    sub             x1, x1, x16
    sub             x2, x2, x16
    sub             x4, x4, x16, lsl #1

    ld1             {v1.8b}, [x1], #8
    ld1             {v0.8b}, [x2], #8
    st2             {v0.8b, v1.8b}, [x4], #16

yuv420sp_vu_row_loop__uv:
    add             x1, x1, x7          //// Next U row
    add             x2, x2, x10         //// Next V row
    add             x4, x4, x8          //// Next interleaved destination row
    subs            x5, x5, #1
    bgt             yuv420sp_vu_row_loop_uv

yuv420sp_vu_exit:
    ////POP THE REGISTERS
    ldp             x19, x20, [sp], #16
    pop_v_regs
    ret
409
410