1@/******************************************************************************
2@ *
3@ * Copyright (C) 2015 The Android Open Source Project
4@ *
5@ * Licensed under the Apache License, Version 2.0 (the "License");
6@ * you may not use this file except in compliance with the License.
7@ * You may obtain a copy of the License at:
8@ *
9@ * http://www.apache.org/licenses/LICENSE-2.0
10@ *
11@ * Unless required by applicable law or agreed to in writing, software
12@ * distributed under the License is distributed on an "AS IS" BASIS,
13@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@ * See the License for the specific language governing permissions and
15@ * limitations under the License.
16@ *
17@ *****************************************************************************
18@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19@*/
20
21@/*
22@//----------------------------------------------------------------------------
23@// File Name            : impeg2_format_conv.s
24@//
@// Description          : This file has the YUV420P to YUV420SP format
@//                        conversion implementations on neon platform.
27@//
28@// Reference Document   :
29@//
30@// Revision History     :
31@//      Date            Author                  Detail Description
32@//   ------------    ----------------    ----------------------------------
33@//   Jul 07, 2008     Naveen Kumar T                Created
34@//
35@//-------------------------------------------------------------------------
36@*/
37
38@/*
39@// ----------------------------------------------------------------------------
40@// Include Files
41@// ----------------------------------------------------------------------------
42@*/
@// Emit the code below into the text section, aligned to a 2^2 = 4 byte boundary
.text
.p2align 2
@// Named constants: log2(16) = 4 and log2(2) = 1.
@// NOTE(review): not referenced by the code visible in this part of the file -
@// confirm whether they are still needed before removing.
.equ log2_16 ,  4
.equ log2_2  ,  1
47@/*
48@// ----------------------------------------------------------------------------
49@// Struct/Union Types and Define
50@// ----------------------------------------------------------------------------
51@*/
52
53@/*
54@// ----------------------------------------------------------------------------
55@// Static Global Data section variables
56@// ----------------------------------------------------------------------------
57@*/
58@//--------------------------- NONE --------------------------------------------
59
60@/*
61@// ----------------------------------------------------------------------------
62@// Static Prototype Functions
63@// ----------------------------------------------------------------------------
64@*/
65@// -------------------------- NONE --------------------------------------------
66
67@/*
68@// ----------------------------------------------------------------------------
69@// Exported functions
70@// ----------------------------------------------------------------------------
71@*/
72
73@/*****************************************************************************
74@*                                                                            *
75@*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q()                      *
76@*                                                                            *
@*  Description      : This function converts the image from YUV420P color    *
78@*                     space to 420SP color space(UV interleaved).        *
79@*                                                                            *
80@*  Arguments        : R0           pu1_y                                     *
81@*                     R1           pu1_u                                     *
82@*                     R2           pu1_v                                     *
83@*                     R3           pu1_dest_y                                *
@*                     [R13 #24]    pu1_dest_uv                               *
@*                     [R13 #28]    u2_height                                 *
@*                     [R13 #32]    u2_width                                  *
@*                     [R13 #36]    u2_stridey                                *
@*                     [R13 #40]    u2_strideu                                *
@*                     [R13 #44]    u2_stridev                                *
@*                     [R13 #48]    u2_dest_stride_y                          *
@*                     [R13 #52]    u2_dest_stride_uv                         *
@*                     [R13 #56]    convert_uv_only                           *
@*                     (offsets are relative to SP after the 24 byte push     *
@*                      done at function entry)                               *
93@*                                                                            *
94@*  Values Returned  : None                                                   *
95@*                                                                            *
96@*  Register Usage   : R0 - R8, Q0                                            *
97@*                                                                            *
98@*  Stack Usage      : 24 Bytes                                               *
99@*                                                                            *
100@*  Interruptibility : Interruptible                                          *
101@*                                                                            *
102@*  Known Limitations                                                         *
103@*       Assumptions: Image Width:     Assumed to be multiple of 16 and       *
104@*                     greater than or equal to 16                *
105@*                     Image Height:    Assumed to be even.                   *
106@*                                                                            *
107@*  Revision History :                                                        *
108@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
109@*         07 06 2010   Varshita        Draft                                 *
110@*         07 06 2010   Naveen Kr T     Completed                             *
111@*                                                                            *
112@*****************************************************************************/
113                .global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q
114impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q:
115
116    @// push the registers on the stack
117    stmfd           sp!, {r4-r8, lr}
118
119    ldr             r4, [sp, #56]       @// Load convert_uv_only
120
121    cmp             r4, #1
122    beq             yuv420sp_uv_chroma
123    @/* Do the preprocessing before the main loops start */
124    @// Load the parameters from stack
125    ldr             r4, [sp, #28]       @// Load u2_height from stack
126
127    ldr             r5, [sp, #32]       @// Load u2_width from stack
128
129    ldr             r7, [sp, #36]       @// Load u2_stridey from stack
130
131    ldr             r8, [sp, #48]       @// Load u2_dest_stride_y from stack
132
133    sub             r7, r7, r5          @// Source increment
134
135    sub             r8, r8, r5          @// Destination increment
136
137
138yuv420sp_uv_row_loop_y:
139    mov             r6, r5
140
141yuv420sp_uv_col_loop_y:
142    pld             [r0, #128]
143    vld1.8          {q0}, [r0]!
144    vst1.8          {q0}, [r3]!
145    sub             r6, r6, #16
146    cmp             r6, #15
147    bgt             yuv420sp_uv_col_loop_y
148
149    cmp             r6, #0
150    beq             yuv420sp_uv_row_loop_end_y
151    @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
152    @//Ex if width is 162, above loop will process 160 pixels. And
153    @//Both source and destination will point to 146th pixel and then 16 bytes will be read
154    @// and written using VLD1 and VST1
155    rsb             r6, r6, #16
156    sub             r0, r0, r6
157    sub             r3, r3, r6
158
159    vld1.8          {q0}, [r0]!
160    vst1.8          {q0}, [r3]!
161
162yuv420sp_uv_row_loop_end_y:
163    add             r0, r0, r7
164    add             r3, r3, r8
165    subs            r4, r4, #1
166    bgt             yuv420sp_uv_row_loop_y
167
168yuv420sp_uv_chroma:
169
170    ldr             r3, [sp, #24]       @// Load pu1_dest_uv from stack
171
172    ldr             r4, [sp, #28]       @// Load u2_height from stack
173
174    ldr             r5, [sp, #32]       @// Load u2_width from stack
175
176
177    ldr             r7, [sp, #40]       @// Load u2_strideu from stack
178
179    ldr             r8, [sp, #52]       @// Load u2_dest_stride_uv from stack
180
181    sub             r7, r7, r5, lsr #1  @// Source increment
182
183    sub             r8, r8, r5          @// Destination increment
184
185    mov             r5, r5, lsr #1
186    mov             r4, r4, lsr #1
187    ldr             r3, [sp, #24]       @// Load pu1_dest_uv from stack
188yuv420sp_uv_row_loop_uv:
189    mov             r6, r5
190
191
192yuv420sp_uv_col_loop_uv:
193    pld             [r1, #128]
194    pld             [r2, #128]
195    vld1.8          d0, [r1]!
196    vld1.8          d1, [r2]!
197    vst2.8          {d0, d1}, [r3]!
198    sub             r6, r6, #8
199    cmp             r6, #7
200    bgt             yuv420sp_uv_col_loop_uv
201
202    cmp             r6, #0
203    beq             yuv420sp_uv_row_loop_end_uv
204    @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
205    @//Ex if width is 162, above loop will process 160 pixels. And
206    @//Both source and destination will point to 146th pixel and then 16 bytes will be read
207    @// and written using VLD1 and VST1
208    rsb             r6, r6, #8
209    sub             r1, r1, r6
210    sub             r2, r2, r6
211    sub             r3, r3, r6, lsl #1
212
213    vld1.8          d0, [r1]!
214    vld1.8          d1, [r2]!
215    vst2.8          {d0, d1}, [r3]!
216
217yuv420sp_uv_row_loop_end_uv:
218    add             r1, r1, r7
219    add             r2, r2, r7
220    add             r3, r3, r8
221    subs            r4, r4, #1
222    bgt             yuv420sp_uv_row_loop_uv
223    @//POP THE REGISTERS
224    ldmfd           sp!, {r4-r8, pc}
225
226
227
228
229
230@/*****************************************************************************
231@*                                                                            *
232@*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q()                      *
233@*                                                                            *
@*  Description      : This function converts the image from YUV420P color    *
@*                     space to 420SP color space(VU interleaved).        *
@*             This function is similar to above function         *
@*             impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q with a      *
@*             difference in VLD1.8 for chroma - order of registers   *
@*             is different                                           *
239@*                                                                            *
240@*  Arguments        : R0           pu1_y                                     *
241@*                     R1           pu1_u                                     *
242@*                     R2           pu1_v                                     *
243@*                     R3           pu1_dest_y                                *
@*                     [R13 #24]    pu1_dest_uv                               *
@*                     [R13 #28]    u2_height                                 *
@*                     [R13 #32]    u2_width                                  *
@*                     [R13 #36]    u2_stridey                                *
@*                     [R13 #40]    u2_strideu                                *
@*                     [R13 #44]    u2_stridev                                *
@*                     [R13 #48]    u2_dest_stride_y                          *
@*                     [R13 #52]    u2_dest_stride_uv                         *
@*                     [R13 #56]    convert_uv_only                           *
@*                     (offsets are relative to SP after the 24 byte push     *
@*                      done at function entry)                               *
253@*                                                                            *
254@*  Values Returned  : None                                                   *
255@*                                                                            *
256@*  Register Usage   : R0 - R8, Q0                                            *
257@*                                                                            *
258@*  Stack Usage      : 24 Bytes                                               *
259@*                                                                            *
260@*  Interruptibility : Interruptible                                          *
261@*                                                                            *
262@*  Known Limitations                                                         *
263@*       Assumptions: Image Width:     Assumed to be multiple of 16 and       *
264@*                     greater than or equal to 16                *
265@*                     Image Height:    Assumed to be even.                   *
266@*                                                                            *
267@*  Revision History :                                                        *
268@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
269@*         07 06 2010   Varshita        Draft                                 *
270@*         07 06 2010   Naveen Kr T     Completed                             *
271@*                                                                            *
272@*****************************************************************************/
273
274                .global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q
275impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q:
276
277    @// push the registers on the stack
278    stmfd           sp!, {r4-r8, lr}
279
280    ldr             r4, [sp, #56]       @// Load convert_uv_only
281
282    cmp             r4, #1
283    beq             yuv420sp_vu_chroma
284
285    @/* Do the preprocessing before the main loops start */
286    @// Load the parameters from stack
287    ldr             r4, [sp, #28]       @// Load u2_height from stack
288
289    ldr             r5, [sp, #32]       @// Load u2_width from stack
290
291    ldr             r7, [sp, #36]       @// Load u2_stridey from stack
292
293    ldr             r8, [sp, #48]       @// Load u2_dest_stride_y from stack
294
295    sub             r7, r7, r5          @// Source increment
296
297    sub             r8, r8, r5          @// Destination increment
298
299
300yuv420sp_vu_row_loop_y:
301    mov             r6, r5
302
303yuv420sp_vu_col_loop_y:
304    pld             [r0, #128]
305    vld1.8          {q0}, [r0]!
306    vst1.8          {q0}, [r3]!
307    sub             r6, r6, #16
308    cmp             r6, #15
309    bgt             yuv420sp_vu_col_loop_y
310
311    cmp             r6, #0
312    beq             yuv420sp_vu_row_loop_end_y
313    @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
314    @//Ex if width is 162, above loop will process 160 pixels. And
315    @//Both source and destination will point to 146th pixel and then 16 bytes will be read
316    @// and written using VLD1 and VST1
317    rsb             r6, r6, #16
318    sub             r0, r0, r6
319    sub             r3, r3, r6
320
321    vld1.8          {q0}, [r0]!
322    vst1.8          {q0}, [r3]!
323
324yuv420sp_vu_row_loop_end_y:
325    add             r0, r0, r7
326    add             r3, r3, r8
327    subs            r4, r4, #1
328    bgt             yuv420sp_vu_row_loop_y
329
330yuv420sp_vu_chroma:
331
332    ldr             r3, [sp, #24]       @// Load pu1_dest_uv from stack
333
334    ldr             r4, [sp, #28]       @// Load u2_height from stack
335
336    ldr             r5, [sp, #32]       @// Load u2_width from stack
337
338
339    ldr             r7, [sp, #40]       @// Load u2_strideu from stack
340
341    ldr             r8, [sp, #52]       @// Load u2_dest_stride_uv from stack
342
343    sub             r7, r7, r5, lsr #1  @// Source increment
344
345    sub             r8, r8, r5          @// Destination increment
346
347    mov             r5, r5, lsr #1
348    mov             r4, r4, lsr #1
349    ldr             r3, [sp, #24]       @// Load pu1_dest_uv from stack
350yuv420sp_vu_row_loop_uv:
351    mov             r6, r5
352
353
354yuv420sp_vu_col_loop_uv:
355    pld             [r1, #128]
356    pld             [r2, #128]
357    vld1.8          d1, [r1]!
358    vld1.8          d0, [r2]!
359    vst2.8          {d0, d1}, [r3]!
360    sub             r6, r6, #8
361    cmp             r6, #7
362    bgt             yuv420sp_vu_col_loop_uv
363
364    cmp             r6, #0
365    beq             yuv420sp_vu_row_loop_end_uv
366    @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
367    @//Ex if width is 162, above loop will process 160 pixels. And
368    @//Both source and destination will point to 146th pixel and then 16 bytes will be read
369    @// and written using VLD1 and VST1
370    rsb             r6, r6, #8
371    sub             r1, r1, r6
372    sub             r2, r2, r6
373    sub             r3, r3, r6, lsl #1
374
375    vld1.8          d1, [r1]!
376    vld1.8          d0, [r2]!
377    vst2.8          {d0, d1}, [r3]!
378
379yuv420sp_vu_row_loop_end_uv:
380    add             r1, r1, r7
381    add             r2, r2, r7
382    add             r3, r3, r8
383    subs            r4, r4, #1
384    bgt             yuv420sp_vu_row_loop_uv
385    @//POP THE REGISTERS
386    ldmfd           sp!, {r4-r8, pc}
387
388
389
390
391
392