1@/******************************************************************************
2@ *
3@ * Copyright (C) 2015 The Android Open Source Project
4@ *
5@ * Licensed under the Apache License, Version 2.0 (the "License");
6@ * you may not use this file except in compliance with the License.
7@ * You may obtain a copy of the License at:
8@ *
9@ * http://www.apache.org/licenses/LICENSE-2.0
10@ *
11@ * Unless required by applicable law or agreed to in writing, software
12@ * distributed under the License is distributed on an "AS IS" BASIS,
13@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@ * See the License for the specific language governing permissions and
15@ * limitations under the License.
16@ *
17@ *****************************************************************************
18@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19@*/
20
21.text
22.p2align 2
23
@/*****************************************************************************
@*
@*  Function Name    : ih264e_fmt_conv_420p_to_420sp_a9q
@*
@*  Description      : Converts an image from planar YUV 4:2:0 (separate Y, U
@*                     and V planes) to semi-planar YUV 4:2:0 (a Y plane plus
@*                     one plane of byte-interleaved U,V pairs, U first).
@*                     The luma plane is copied unchanged; that copy is skipped
@*                     when convert_uv_only is 1.
@*
@*  Arguments        : Stack offsets are relative to sp AFTER the 40-byte
@*                     register save performed on entry. Every stack argument
@*                     is read as a 32-bit word.
@*
@*                     r0           pu1_y              source Y plane
@*                     r1           pu1_u              source U plane
@*                     r2           pu1_v              source V plane
@*                     r3           pu1_dest_y         destination Y plane
@*                     [sp, #40]    pu1_dest_uv        destination UV plane
@*                     [sp, #44]    u2_height          luma height in rows
@*                     [sp, #48]    u2_width           luma width in pixels
@*                     [sp, #52]    u2_stridey         source Y stride
@*                     [sp, #56]    u2_strideu         source U stride
@*                     [sp, #60]    u2_stridev         source V stride
@*                     [sp, #64]    u2_dest_stride_y   destination Y stride
@*                     [sp, #68]    u2_dest_stride_uv  destination UV stride
@*                     [sp, #72]    convert_uv_only    1 => skip the luma copy
@*
@*  Values Returned  : None
@*
@*  Register Usage   : r0 - r9 and d0, d1 are modified.
@*                     r4 - r12 and lr are saved on entry and restored on exit.
@*
@*  Stack Usage      : 40 Bytes
@*
@*  Interruptibility : Interruptible
@*
@*  Known Limitations
@*       Assumptions: Luma width must be at least 16 and chroma width
@*                    (u2_width / 2) at least 8: the first vector of every row
@*                    is transferred unconditionally, and the tail handling
@*                    steps the pointers BACKWARDS to realign, which would run
@*                    off the start of the row for narrower images.
@*                    Image height is assumed to be even (u2_height / 2 chroma
@*                    rows are produced) and non-zero (each row loop body runs
@*                    at least once).
@*
@*  Revision History :
@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
@*         07 06 2010   Varshita        Draft
@*         07 06 2010   Naveen Kr T     Completed
@*
@*****************************************************************************/
    .global ih264e_fmt_conv_420p_to_420sp_a9q

ih264e_fmt_conv_420p_to_420sp_a9q:

    @// Save r4-r12 and lr (10 words = 40 bytes). From here on the first
    @// stack argument lives at [sp, #40].
    stmfd         sp!, {r4-r12, lr}

    ldr           r4, [sp, #72]         @// r4 = convert_uv_only

    cmp           r4, #1
    beq           yuv420sp_uv_chroma    @// Only chroma requested: skip the luma copy

    @/* ------------------------------------------------------------------ */
    @/* Luma: plain copy, 16 bytes per iteration                           */
    @/* ------------------------------------------------------------------ */
    ldr           r4, [sp, #44]         @// r4 = u2_height (rows left to copy)
    ldr           r5, [sp, #48]         @// r5 = u2_width
    ldr           r7, [sp, #52]         @// r7 = u2_stridey
    ldr           r8, [sp, #64]         @// r8 = u2_dest_stride_y
    sub           r7, r7, r5            @// r7 = source bytes to skip after each row
    sub           r8, r8, r5            @// r8 = destination bytes to skip after each row

yuv420sp_uv_row_loop_y:
    mov           r6, r5                @// r6 = pixels left in the current row

yuv420sp_uv_col_loop_y:
    pld           [r0, #128]
    vld1.8        {d0, d1}, [r0]!
    vst1.8        {d0, d1}, [r3]!
    sub           r6, r6, #16
    cmp           r6, #15
    bgt           yuv420sp_uv_col_loop_y    @// Repeat while 16 or more pixels remain

    cmp           r6, #0
    beq           yuv420sp_uv_row_loop_end_y    @// Width was a multiple of 16
    @// 1..15 pixels remain. Step both pointers back by (16 - remaining) so
    @// that one more full 16-byte transfer ends exactly at the end of the
    @// row; the overlapping bytes are simply copied a second time.
    @// Ex: for width 162 the loop above copies 160 pixels, the pointers are
    @// moved back by 14 to pixel 146, and pixels 146..161 are copied.
    rsb           r6, r6, #16           @// r6 = 16 - remaining
    sub           r0, r0, r6
    sub           r3, r3, r6

    vld1.8        {d0, d1}, [r0]!
    vst1.8        {d0, d1}, [r3]!

yuv420sp_uv_row_loop_end_y:
    add           r0, r0, r7            @// Advance source to the start of the next row
    add           r3, r3, r8            @// Advance destination to the start of the next row
    subs          r4, r4, #1
    bgt           yuv420sp_uv_row_loop_y

    @/* ------------------------------------------------------------------ */
    @/* Chroma: interleave 8 U bytes with 8 V bytes per iteration          */
    @/* ------------------------------------------------------------------ */
yuv420sp_uv_chroma:

    ldr           r3, [sp, #40]         @// r3 = pu1_dest_uv (r3 is free once luma is done)
    ldr           r4, [sp, #44]         @// r4 = u2_height
    ldr           r5, [sp, #48]         @// r5 = u2_width
    ldr           r7, [sp, #56]         @// r7 = u2_strideu
    ldr           r9, [sp, #60]         @// r9 = u2_stridev
    ldr           r8, [sp, #68]         @// r8 = u2_dest_stride_uv

    sub           r7, r7, r5, lsr #1    @// r7 = U bytes to skip after each row
    sub           r9, r9, r5, lsr #1    @// r9 = V bytes to skip after each row (own stride,
                                        @//      so U and V planes may be padded differently)
    sub           r8, r8, r5            @// r8 = destination bytes to skip after each row;
                                        @//      an interleaved row is u2_width bytes long

    mov           r5, r5, lsr #1        @// r5 = chroma width  = u2_width  / 2
    mov           r4, r4, lsr #1        @// r4 = chroma height = u2_height / 2

yuv420sp_uv_row_loop_uv:
    mov           r6, r5                @// r6 = chroma samples left in the current row

yuv420sp_uv_col_loop_uv:
    pld           [r1, #128]
    pld           [r2, #128]
    vld1.8        d0, [r1]!             @// d0 = 8 U samples
    vld1.8        d1, [r2]!             @// d1 = 8 V samples
    vst2.8        {d0, d1}, [r3]!       @// Store U0 V0 U1 V1 ... U7 V7 (16 bytes)
    sub           r6, r6, #8
    cmp           r6, #7
    bgt           yuv420sp_uv_col_loop_uv   @// Repeat while 8 or more samples remain

    cmp           r6, #0
    beq           yuv420sp_uv_row_loop_end_uv   @// Chroma width was a multiple of 8
    @// 1..7 chroma samples remain. Step the U and V pointers back by
    @// (8 - remaining) and the destination by twice that (two bytes are
    @// written per sample), then redo one full 8-sample transfer that ends
    @// exactly at the end of the row.
    rsb           r6, r6, #8            @// r6 = 8 - remaining
    sub           r1, r1, r6
    sub           r2, r2, r6
    sub           r3, r3, r6, lsl #1

    vld1.8        d0, [r1]!
    vld1.8        d1, [r2]!
    vst2.8        {d0, d1}, [r3]!

yuv420sp_uv_row_loop_end_uv:
    add           r1, r1, r7            @// Next U row
    add           r2, r2, r9            @// Next V row
    add           r3, r3, r8            @// Next interleaved UV row
    subs          r4, r4, #1
    bgt           yuv420sp_uv_row_loop_uv

    @// Restore the saved registers; loading pc returns to the caller
    ldmfd         sp!, {r4-r12, pc}
172
173
174
175
176
@ /**
@ *******************************************************************************
@ *
@ * @brief ih264e_fmt_conv_422i_to_420sp_a9q
@ *     Converts an interleaved 4:2:2 image (2 bytes per pixel) into a luma
@ *     plane plus one plane of interleaved chroma byte pairs.
@ *
@ * @par   Description
@ *  Two source rows are consumed per pass of the outer loop.
@ *  Every 4-byte source group covers two pixels:
@ *    - bytes 1 and 3 are written out as the two luma samples
@ *      (one luma output row per source row);
@ *    - bytes 0 and 2 of the upper and lower source row are combined with a
@ *      rounding average, (a + b + 1) >> 1, and written as one byte pair to
@ *      the chroma output row.
@ *  NOTE(review): this matches a U Y0 V Y1 byte order with U,V-interleaved
@ *  output - confirm against the caller.
@ *
@ * Inputs (stack offsets are relative to sp AFTER the 40-byte register save;
@ *         every stack argument is read as a 32-bit word):
@ *                     r0 - pu1_y            -   luma output plane.
@ *                     r1 - pu1_u            -   interleaved chroma output plane.
@ *                     r2 - pu1_v            -   not read by this routine.
@ *                     r3 - pu2_yuv422i      -   interleaved 4:2:2 source image.
@ *             stack + 40 - u4_width         -   width in pixels.
@ *                     44 - u4_height        -   height in rows.
@ *                     48 - u4_stride_y      -   stride of the luma output.
@ *                     52 - u4_stride_u      -   stride of the chroma output.
@ *                     56 - u4_stride_v      -   not read by this routine.
@ *                     60 - u4_stride_yuv422i-   source stride in pixels
@ *                                               (2 bytes per pixel).
@ *
@ * @returns None
@ *
@ * @remarks
@ *  - u4_width must be at least 16: the first 16-pixel block of every row pair
@ *    is converted unconditionally, and the tail handling moves the pointers
@ *    backwards to realign.
@ *  - The tail handling rewinds the source by 2 * (16 - leftover) bytes, which
@ *    only stays on a 4-byte group boundary when u4_width is even.
@ *  - u4_height / 2 row pairs are processed; the row-pair body runs at least
@ *    once.
@ *  - r10 and d0-d7 are used as scratch. r4-r12 and lr are saved on entry and
@ *    restored on exit.
@ *
@ * Revision History :
@ *         DD MM YYYY   Author(s)              Changes (Describe the changes made)
@ *         07 06 2010   Harinarayanan K K       Adapted to 422p
@ *
@ *******************************************************************************
@ */
    .global ih264e_fmt_conv_422i_to_420sp_a9q
ih264e_fmt_conv_422i_to_420sp_a9q:
    stmdb         sp!, {r4-r12, lr}     @// 10 words saved: stack arguments start at [sp, #40]

    @// Fetch the stack arguments that are actually used
    ldr           r7, [sp, #40]         @// r7  = u4_width
    ldr           r11, [sp, #44]        @// r11 = u4_height
    ldr           r4, [sp, #48]         @// r4  = u4_stride_y
    ldr           r9, [sp, #52]         @// r9  = u4_stride_u
    ldr           r5, [sp, #60]         @// r5  = u4_stride_yuv422i (pixels)

    @// Second-row pointers: each pass handles a pair of rows
    add           r6, r0, r4            @// r6 = luma output, lower row of the pair
    add           r8, r3, r5, lsl #1    @// r8 = source, lower row of the pair (2 bytes per pixel)

    @// Per-pass pointer increments. After a row of u4_width pixels has been
    @// consumed, these take each pointer to the start of the row two rows on.
    rsb           r4, r7, r4, lsl #1    @// r4 = 2 * u4_stride_y - u4_width          (luma, bytes)
    rsb           r5, r7, r5, lsl #1    @// r5 = 2 * u4_stride_yuv422i - u4_width    (pixels)
    mov           r5, r5, lsl #1        @// r5 = the same distance in bytes
    sub           r9, r9, r7            @// r9 = u4_stride_u - u4_width; a chroma output row
                                        @//      holds u4_width bytes
    mov           r11, r11, asr #1      @// r11 = number of row pairs = u4_height / 2

@// Register roles inside the loops
@//   r0  - luma output, upper row        r6  - luma output, lower row
@//   r1  - chroma output
@//   r3  - source, upper row             r8  - source, lower row
@//   r4  - luma output increment         r5  - source increment
@//   r9  - chroma output increment       r7  - u4_width
@//   r11 - row pairs left                r12 - pixels left in the row pair
@//   r10 - scratch (tail rewind distance)
fmt_conv_422i_row_pair_loop:

    mov           r12, r7               @// Start of a row pair: all u4_width pixels pending

fmt_conv_422i_block16:
    @// De-interleave 16 pixels (32 bytes) of each row:
    @//   d0/d4 = byte 0, d1/d5 = byte 1, d2/d6 = byte 2, d3/d7 = byte 3 of every group
    vld4.8        {d0, d1, d2, d3}, [r3]!   @// upper row
    vld4.8        {d4, d5, d6, d7}, [r8]!   @// lower row

    vrhadd.u8     d0, d0, d4            @// rounding average of byte 0 across the two rows
    vrhadd.u8     d2, d2, d6            @// rounding average of byte 2 across the two rows

    vst2.8        {d1, d3}, [r0]!       @// 16 luma samples, upper row
    vst2.8        {d5, d7}, [r6]!       @// 16 luma samples, lower row
    vst2.8        {d0, d2}, [r1]!       @// 8 averaged chroma byte pairs (16 bytes)

    sub           r12, r12, #16
    cmp           r12, #16
    bge           fmt_conv_422i_block16     @// At least one more full block pending

    cmp           r12, #0
    beq           fmt_conv_422i_row_pair_done   @// Nothing left over

    @// Leftover pixels: move every pointer back by (16 - leftover) pixels and
    @// run the block once more so that it ends exactly on the last pixel of
    @// the row pair. The overlapping pixels are recomputed and stored again.
    @// Ex: for width 162, 160 pixels are done above; the pointers go back by
    @// 14 pixels to pixel 146 and pixels 146..161 are converted.
    rsb           r10, r12, #16         @// r10 = 16 - leftover
    mov           r12, #16              @// Exactly one more block, after which r12 reaches 0
    sub           r3, r3, r10, lsl #1   @// source: 2 bytes per pixel
    sub           r8, r8, r10, lsl #1
    sub           r0, r0, r10           @// luma: 1 byte per pixel
    sub           r6, r6, r10
    sub           r1, r1, r10           @// chroma: 2 bytes per 2 pixels
    b             fmt_conv_422i_block16

fmt_conv_422i_row_pair_done:
    @// Step every pointer on to the next pair of rows
    add           r3, r3, r5
    add           r8, r8, r5
    add           r0, r0, r4
    add           r6, r6, r4
    add           r1, r1, r9

    subs          r11, r11, #1
    bgt           fmt_conv_422i_row_pair_loop

    ldmia         sp!, {r4-r12, pc}     @// Restore saved registers and return
345
346
347
348