1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@/*******************************************************************************
20@* @file
21@*  ihevcd_fmt_conv_420sp_to_420p.s
22@*
23@* @brief
24@*  contains function definitions for format conversions
25@*
26@* @author
27@*  ittiam
28@*
@* @par list of functions:
@*  - ihevcd_fmt_conv_420sp_to_420p_a9q()
@*
32@* @remarks
33@*  none
34@*
35@*******************************************************************************/
36
37
38
39
40
41
42
43
44.text
45
46
47
48
49
@/*****************************************************************************
@*                                                                            *
@*  Function Name    : ihevcd_fmt_conv_420sp_to_420p_a9q()                    *
@*                                                                            *
@*  Description      : This function converts the image from YUV420SP color   *
@*                     space (UV interleaved) to YUV420P color space          *
@*                     (separate U and V planes).                             *
@*                                                                            *
@*  Arguments        : R0           pu1_src_y                                 *
@*                     R1           pu1_src_uv                                *
@*                     R2           pu1_dest_y                                *
@*                     R3           pu1_dest_u                                *
@*                     [R13 #40]    pu1_dest_v                                *
@*                     [R13 #44]    u2_width                                  *
@*                     [R13 #48]    u2_height                                 *
@*                     [R13 #52]    u2_stridey                                *
@*                     [R13 #56]    u2_strideuv                               *
@*                     [R13 #60]    u2_dest_stridey                           *
@*                     [R13 #64]    u2_dest_strideuv                          *
@*                     [R13 #68]    is_u_first                                *
@*                     [R13 #72]    disable_luma_copy                         *
@*                                                                            *
@*                     (Stack offsets are relative to R13 after the           *
@*                     40-byte register save done at function entry.)         *
@*                                                                            *
@*  Values Returned  : None                                                   *
@*                                                                            *
@*  Register Usage   : R0 - R14                                               *
@*                                                                            *
@*  Stack Usage      : 40 Bytes                                               *
@*                                                                            *
@*  Interruptibility : Interruptible                                          *
@*                                                                            *
@*  Known Limitations                                                         *
@*       Assumptions: Image Width:     Assumed to be multiple of 2 and        *
@*                     Image Height:    Assumed to be even.                   *
@*                                                                            *
@*  Revision History :                                                        *
@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
@*         16 05 2012   Naveen SR     draft                                   *
@*                                                                            *
@*****************************************************************************/
88
.globl ihevcd_fmt_conv_420sp_to_420p_a9q

.type ihevcd_fmt_conv_420sp_to_420p_a9q, %function

@//----------------------------------------------------------------------------
@// ihevcd_fmt_conv_420sp_to_420p_a9q
@//
@// Copies the Y plane (unless disable_luma_copy is non-zero) and splits the
@// byte-interleaved chroma plane into two separate destination planes.
@//
@// In:    r0 = pu1_src_y, r1 = pu1_src_uv, r2 = pu1_dest_y, r3 = pu1_dest_u
@//        Stack arguments (offsets valid after the 40-byte register save):
@//          [sp,#40] pu1_dest_v         [sp,#44] u2_width
@//          [sp,#48] u2_height          [sp,#52] u2_stridey
@//          [sp,#56] u2_strideuv        [sp,#60] u2_dest_stridey
@//          [sp,#64] u2_dest_strideuv   [sp,#68] is_u_first
@//          [sp,#72] disable_luma_copy
@// Out:   none
@// Clobb: r0-r3, d0, d1, flags (r4-r12 are saved and restored)
@//
@// Behaviour:
@//  - Returns without touching memory when width or height is <= 0.
@//  - Chroma rows processed = height / 2; nothing is copied when that is 0.
@//  - Rows of 16 bytes or more are moved 16 bytes at a time with NEON; a
@//    remainder is handled by stepping back and re-copying the last 16 bytes
@//    of the row.
@//  - Rows narrower than 16 bytes are copied byte by byte, because the NEON
@//    path always transfers 16 bytes and would access memory outside the row.
@//  - Width is expected to be even (a chroma row is width bytes of pairs).
@//----------------------------------------------------------------------------
ihevcd_fmt_conv_420sp_to_420p_a9q:
    STMFD       sp!,{r4-r12, lr}            @// 10 registers = 40 bytes of stack

    LDR         r5,[sp,#60]                 @//Load u2_dest_stridey
    LDR         r7,[sp,#52]                 @//Load u2_stridey
    LDR         r8,[sp,#44]                 @//Load u2_width
    LDR         r9,[sp,#48]                 @//Load u2_height

    @//The copy loops below execute their body before testing the counter,
    @//so a zero or negative dimension has to be rejected up front.
    CMP         r8,#0
    BLE         exit
    CMP         r9,#0
    BLE         exit

    SUB         r10,r7,r8                   @// Src Y increment (stride - width)
    SUB         r11,r5,r8                   @// Dst Y increment (stride - width)

    LDR         r5,[sp,#72]                 @//Load disable_luma_copy flag
    CMP         r5,#0                       @//skip luma if disable_luma_copy is non-zero
    BNE         uv_copy_start

    @/* Copy Y */

    MOV         r4,r9                       @// r4 = luma rows remaining
y_row_loop:
    MOV         r6,r8                       @// r6 = bytes remaining in this row
    CMP         r6,#16
    BLT         y_narrow_row                @// fewer than 16 bytes: use the byte loop

y_col_loop:

    SUB         r6,r6,#16
    vld1.8      {d0,d1},[r0]!
    vst1.8      {d0,d1},[r2]!
    CMP         r6,#16
    BGE         y_col_loop
    CMP         r6,#0
    BEQ         y_col_loop_end
    @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
    @//Ex if width is 162, above loop will process 160 pixels. And
    @//Both source and destination will point to 146th pixel and then 16 bytes will be read
    @// and written using VLD1 and VST1
    RSB         r6,r6,#16                   @// r6 = 16 - remainder = bytes to step back
    SUB         r0,r0,r6
    SUB         r2,r2,r6
    vld1.8      {d0,d1}, [r0]!
    vst1.8      {d0,d1}, [r2]!

y_col_loop_end:
    ADD         r0, r0, r10                 @// advance to the start of the next src row
    ADD         r2, r2, r11                 @// advance to the start of the next dst row
    SUBS        r4, r4, #1
    BGT         y_row_loop


    @/* Copy UV */
uv_copy_start:

    LDR         r5,[sp,#64]                 @//Load u2_dest_strideuv
    LDR         r7,[sp,#56]                 @//Load u2_strideuv

    MOVS        r9,r9,LSR #1                @// height/2
    BEQ         exit                        @// no chroma rows (height == 1)

    SUB         r10,r7,r8                   @// Src UV increment (a UV row is width bytes)
    MOV         r11,r8,LSR #1
    SUB         r11,r5,r11                  @// Dst U and V increment (stride - width/2)

    LDR         r5,[sp,#40]                 @//Load pu1_dest_v

    LDR         r4,[sp,#68]                 @//Load is_u_first_flag
    CMP         r4,#0                       @//Swap U and V dest if is_u_first_flag is zero
    MOVEQ       r4,r5
    MOVEQ       r5,r3
    MOVEQ       r3,r4

    MOV         r4,r9                       @// r4 = chroma rows remaining
uv_row_loop:
    MOV         r6,r8                       @// r6 = interleaved bytes remaining in this row
    CMP         r6,#16
    BLT         uv_narrow_row               @// fewer than 16 bytes: use the byte loop

uv_col_loop:

    SUB         r6,r6,#16

    PLD         [r1,#128]
    vld2.8      {d0,d1},[r1]!               @// de-interleave: d0 = even bytes, d1 = odd bytes
    VST1.8      D0,[r3]!
    VST1.8      D1,[r5]!
    CMP         r6,#16
    BGE         uv_col_loop
    CMP         r6,#0
    BEQ         uv_col_loop_end
    @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
    @//Ex if width is 162, above loop will process 160 bytes. The source is moved
    @//back by 14 bytes and each destination by 7 bytes, and then 16 interleaved
    @//bytes are read using VLD2 and written as 8 + 8 bytes using two VST1
    RSB         r6,r6,#16                   @// r6 = 16 - remainder = src bytes to step back
    SUB         r1,r1,r6
    SUB         r3,r3,r6,LSR #1
    SUB         r5,r5,r6,LSR #1
    vld2.8      {d0,d1}, [r1]!
    VST1.8      D0, [r3]!
    VST1.8      D1, [r5]!
uv_col_loop_end:
    ADD         r1, r1, r10                 @// advance to the start of the next src row
    ADD         r3, r3, r11                 @// advance both destination planes
    ADD         r5, r5, r11
    SUBS        r4, r4, #1
    BGT         uv_row_loop

exit:
    LDMFD       sp!,{r4-r12, pc}

    @/* Out-of-line paths for rows narrower than 16 bytes */

    @//Luma: r6 = width (1..15 here). Copy one byte per iteration.
y_narrow_row:
    LDRB        r12,[r0],#1
    SUBS        r6,r6,#1
    STRB        r12,[r2],#1
    BGT         y_narrow_row
    B           y_col_loop_end

    @//Chroma: copy width/2 byte pairs, first byte of a pair to [r3], second to [r5].
    @//r7 is free here: it was only needed to derive r10.
uv_narrow_row:
    MOVS        r6,r8,LSR #1                @// r6 = number of byte pairs in the row
    BEQ         uv_col_loop_end             @// width == 1: no complete pair to copy
uv_narrow_pair:
    LDRB        r7,[r1],#1
    LDRB        r12,[r1],#1
    SUBS        r6,r6,#1
    STRB        r7,[r3],#1
    STRB        r12,[r5],#1
    BGT         uv_narrow_pair
    B           uv_col_loop_end
198
199
200
201
202
203
204