@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/

@/*
@//----------------------------------------------------------------------------
@// File Name            : impeg2_format_conv.s
@//
@// Description          : This file has the NEON implementations of the
@//                        YUV420P to YUV420SP (UV and VU interleaved)
@//                        format conversion routines.
@//
@// Reference Document   :
@//
@// Revision History     :
@//      Date            Author              Detail Description
@//     ------------    ----------------    ----------------------------------
@//      Jul 07, 2008    Naveen Kumar T      Created
@//
@//-------------------------------------------------------------------------
@*/

@/*
@// ----------------------------------------------------------------------------
@// Include Files
@// ----------------------------------------------------------------------------
@*/
.text
.p2align 2
.equ log2_16    ,   4
.equ log2_2     ,   1
@/*
@// ----------------------------------------------------------------------------
@// Struct/Union Types and Define
@// ----------------------------------------------------------------------------
@*/

@/*
@// ----------------------------------------------------------------------------
@// Static Global Data section variables
@// ----------------------------------------------------------------------------
@*/
@//--------------------------- NONE --------------------------------------------

@/*
@// ----------------------------------------------------------------------------
@// Static Prototype Functions
@// ----------------------------------------------------------------------------
@*/
@// -------------------------- NONE --------------------------------------------

@/*
@// ----------------------------------------------------------------------------
@// Exported functions
@// ----------------------------------------------------------------------------
@*/

@/*****************************************************************************
@*
@*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q()
@*
@*  Description      : This function converts the image from YUV420P color
@*                     space to 420SP color space (UV interleaved).
@*
@*  Arguments        : R0            pu1_y
@*                     R1            pu1_u
@*                     R2            pu1_v
@*                     R3            pu1_dest_y
@*                     [R13 #24]     pu1_dest_uv
@*                     [R13 #28]     u2_height
@*                     [R13 #32]     u2_width
@*                     [R13 #36]     u2_stridey
@*                     [R13 #40]     u2_strideu
@*                     [R13 #44]     u2_stridev
@*                     [R13 #48]     u2_dest_stride_y
@*                     [R13 #52]     u2_dest_stride_uv
@*                     [R13 #56]     convert_uv_only
@*                     (stack offsets are the ones seen after the 24 byte
@*                      register push done at function entry)
@*
@*  Values Returned  : None
@*
@*  Register Usage   : R0 - R8, Q0
@*
@*  Stack Usage      : 24 Bytes
@*
@*  Interruptibility : Interruptible
@*
@*  Known Limitations
@*       Assumptions : Image Width : Assumed to be greater than or equal to
@*                                   16. It need not be a multiple of 16:
@*                                   the last partial vector of a row is
@*                                   handled by re-copying an overlapping
@*                                   full vector. The chroma pass handles
@*                                   width / 2 samples per plane per row.
@*                     Image Height: Assumed to be even. A height that gives
@*                                   zero rows for a plane copies nothing
@*                                   for that plane.
@*
@*  Revision History :
@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
@*         07 06 2010   Varshita        Draft
@*         07 06 2010   Naveen Kr T     Completed
@*
@*****************************************************************************/
    .global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q
impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q:

    @// Push the registers used below: 6 words = 24 bytes, so the first
    @// stacked argument (pu1_dest_uv) is found at [sp, #24] from here on.
    stmfd       sp!, {r4-r8, lr}

    ldr         r4, [sp, #56]               @// r4 = convert_uv_only
    cmp         r4, #1
    beq         yuv420sp_uv_chroma          @// 1 => skip the luma copy

    @/* Luma plane: plain copy, 16 bytes per iteration */
    ldr         r4, [sp, #28]               @// r4 = u2_height (rows left)
    ldr         r5, [sp, #32]               @// r5 = u2_width
    ldr         r7, [sp, #36]               @// r7 = u2_stridey
    ldr         r8, [sp, #48]               @// r8 = u2_dest_stride_y

    cmp         r4, #0                      @// The row loop tests its count
    beq         yuv420sp_uv_chroma          @// at the bottom: skip it for 0 rows

    sub         r7, r7, r5                  @// Source increment (stride - width)
    sub         r8, r8, r5                  @// Destination increment

yuv420sp_uv_row_loop_y:
    mov         r6, r5                      @// r6 = bytes left in this row

yuv420sp_uv_col_loop_y:
    pld         [r0, #128]
    vld1.8      {q0}, [r0]!
    vst1.8      {q0}, [r3]!
    sub         r6, r6, #16
    cmp         r6, #15
    bgt         yuv420sp_uv_col_loop_y      @// Loop while 16 or more bytes remain

    cmp         r6, #0
    beq         yuv420sp_uv_row_loop_end_y
    @// 1..15 bytes remain: step source and destination back by (16 - r6)
    @// bytes so that one more full 16 byte copy ends exactly at the end of
    @// the row. Ex: for a width of 162 the loop above copies 160 pixels,
    @// both pointers are moved back to pixel 146 and pixels 146..161 are
    @// copied (146..159 for the second time, with the same data).
    rsb         r6, r6, #16
    sub         r0, r0, r6
    sub         r3, r3, r6

    vld1.8      {q0}, [r0]!
    vst1.8      {q0}, [r3]!

yuv420sp_uv_row_loop_end_y:
    add         r0, r0, r7                  @// Next source row
    add         r3, r3, r8                  @// Next destination row
    subs        r4, r4, #1
    bgt         yuv420sp_uv_row_loop_y

yuv420sp_uv_chroma:
    @/* Chroma planes: interleave 8 U and 8 V bytes per iteration */
    ldr         r3, [sp, #24]               @// r3 = pu1_dest_uv
    ldr         r4, [sp, #28]               @// r4 = u2_height
    ldr         r5, [sp, #32]               @// r5 = u2_width
    ldr         r7, [sp, #40]               @// r7 = u2_strideu
    ldr         r0, [sp, #44]               @// r0 = u2_stridev (r0 is free: the
                                            @// luma source pointer is not used
                                            @// in the chroma pass)
    ldr         r8, [sp, #52]               @// r8 = u2_dest_stride_uv

    movs        r4, r4, lsr #1              @// r4 = chroma rows = height / 2
    beq         yuv420sp_uv_exit            @// Nothing to do for 0 chroma rows

    sub         r7, r7, r5, lsr #1          @// U source increment
    sub         r0, r0, r5, lsr #1          @// V source increment
    sub         r8, r8, r5                  @// Destination increment (2 bytes
                                            @// are written per chroma sample)
    mov         r5, r5, lsr #1              @// r5 = chroma samples per row

yuv420sp_uv_row_loop_uv:
    mov         r6, r5                      @// r6 = samples left in this row

yuv420sp_uv_col_loop_uv:
    pld         [r1, #128]
    pld         [r2, #128]
    vld1.8      {d0}, [r1]!                 @// d0 = 8 U bytes
    vld1.8      {d1}, [r2]!                 @// d1 = 8 V bytes
    vst2.8      {d0, d1}, [r3]!             @// Store U0 V0 U1 V1 ... (16 bytes)
    sub         r6, r6, #8
    cmp         r6, #7
    bgt         yuv420sp_uv_col_loop_uv     @// Loop while 8 or more samples remain

    cmp         r6, #0
    beq         yuv420sp_uv_row_loop_end_uv
    @// 1..7 samples remain: step both sources back by (8 - r6) bytes and
    @// the interleaved destination back by twice that, so that one more
    @// full 8 + 8 byte iteration ends exactly at the end of the row.
    rsb         r6, r6, #8
    sub         r1, r1, r6
    sub         r2, r2, r6
    sub         r3, r3, r6, lsl #1

    vld1.8      {d0}, [r1]!
    vld1.8      {d1}, [r2]!
    vst2.8      {d0, d1}, [r3]!

yuv420sp_uv_row_loop_end_uv:
    add         r1, r1, r7                  @// Next U row
    add         r2, r2, r0                  @// Next V row (uses u2_stridev)
    add         r3, r3, r8                  @// Next destination row
    subs        r4, r4, #1
    bgt         yuv420sp_uv_row_loop_uv

yuv420sp_uv_exit:
    @//POP THE REGISTERS
    ldmfd       sp!, {r4-r8, pc}





@/*****************************************************************************
@*
@*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q()
@*
@*  Description      : This function converts the image from YUV420P color
@*                     space to 420SP color space (VU interleaved).
@*                     This function is similar to
@*                     impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q() with a
@*                     difference in VLD1.8 for chroma - order of registers
@*                     is different
@*
@*  Arguments        : R0            pu1_y
@*                     R1            pu1_u
@*                     R2            pu1_v
@*                     R3            pu1_dest_y
@*                     [R13 #24]     pu1_dest_uv
@*                     [R13 #28]     u2_height
@*                     [R13 #32]     u2_width
@*                     [R13 #36]     u2_stridey
@*                     [R13 #40]     u2_strideu
@*                     [R13 #44]     u2_stridev
@*                     [R13 #48]     u2_dest_stride_y
@*                     [R13 #52]     u2_dest_stride_uv
@*                     [R13 #56]     convert_uv_only
@*                     (stack offsets are the ones seen after the 24 byte
@*                      register push done at function entry)
@*
@*  Values Returned  : None
@*
@*  Register Usage   : R0 - R8, Q0
@*
@*  Stack Usage      : 24 Bytes
@*
@*  Interruptibility : Interruptible
@*
@*  Known Limitations
@*       Assumptions : Image Width : Assumed to be greater than or equal to
@*                                   16. It need not be a multiple of 16:
@*                                   the last partial vector of a row is
@*                                   handled by re-copying an overlapping
@*                                   full vector.
@*                     Image Height: Assumed to be even.
@*
@*  Revision History :
@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
@*         07 06 2010   Varshita        Draft
@*         07 06 2010   Naveen Kr T     Completed
@*
@*****************************************************************************/

    @// Register roles in the code below:
    @//   luma pass   : r0 = source, r3 = destination, r4 = rows left,
    @//                 r5 = width, r6 = bytes left in the row,
    @//                 r7 / r8 = source / destination row increments
    @//   chroma pass : r1 = U source, r2 = V source, r3 = destination,
    @//                 r4 = chroma rows left, r5 = samples per row,
    @//                 r6 = samples left in the row, r7 = U row increment,
    @//                 r0 = V row increment, r8 = destination row increment
    @// A plane with zero rows is skipped instead of being copied once.
    .global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q
impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q:

    @// Push the registers used below: 6 words = 24 bytes, so the first
    @// stacked argument (pu1_dest_uv) is found at [sp, #24] from here on.
    stmfd       sp!, {r4-r8, lr}

    ldr         r4, [sp, #56]               @// r4 = convert_uv_only
    cmp         r4, #1
    beq         yuv420sp_vu_chroma          @// 1 => skip the luma copy

    @/* Luma plane: plain copy, 16 bytes per iteration */
    ldr         r4, [sp, #28]               @// r4 = u2_height (rows left)
    ldr         r5, [sp, #32]               @// r5 = u2_width
    ldr         r7, [sp, #36]               @// r7 = u2_stridey
    ldr         r8, [sp, #48]               @// r8 = u2_dest_stride_y

    cmp         r4, #0                      @// The row loop tests its count
    beq         yuv420sp_vu_chroma          @// at the bottom: skip it for 0 rows

    sub         r7, r7, r5                  @// Source increment (stride - width)
    sub         r8, r8, r5                  @// Destination increment

yuv420sp_vu_row_loop_y:
    mov         r6, r5                      @// r6 = bytes left in this row

yuv420sp_vu_col_loop_y:
    pld         [r0, #128]
    vld1.8      {q0}, [r0]!
    vst1.8      {q0}, [r3]!
    sub         r6, r6, #16
    cmp         r6, #15
    bgt         yuv420sp_vu_col_loop_y      @// Loop while 16 or more bytes remain

    cmp         r6, #0
    beq         yuv420sp_vu_row_loop_end_y
    @// 1..15 bytes remain: step source and destination back by (16 - r6)
    @// bytes so that one more full 16 byte copy ends exactly at the end of
    @// the row. Ex: for a width of 162 the loop above copies 160 pixels,
    @// both pointers are moved back to pixel 146 and pixels 146..161 are
    @// copied (146..159 for the second time, with the same data).
    rsb         r6, r6, #16
    sub         r0, r0, r6
    sub         r3, r3, r6

    vld1.8      {q0}, [r0]!
    vst1.8      {q0}, [r3]!

yuv420sp_vu_row_loop_end_y:
    add         r0, r0, r7                  @// Next source row
    add         r3, r3, r8                  @// Next destination row
    subs        r4, r4, #1
    bgt         yuv420sp_vu_row_loop_y

yuv420sp_vu_chroma:
    @/* Chroma planes: interleave 8 V and 8 U bytes per iteration */
    ldr         r3, [sp, #24]               @// r3 = pu1_dest_uv
    ldr         r4, [sp, #28]               @// r4 = u2_height
    ldr         r5, [sp, #32]               @// r5 = u2_width
    ldr         r7, [sp, #40]               @// r7 = u2_strideu
    ldr         r0, [sp, #44]               @// r0 = u2_stridev (r0 is free: the
                                            @// luma source pointer is not used
                                            @// in the chroma pass)
    ldr         r8, [sp, #52]               @// r8 = u2_dest_stride_uv

    movs        r4, r4, lsr #1              @// r4 = chroma rows = height / 2
    beq         yuv420sp_vu_exit            @// Nothing to do for 0 chroma rows

    sub         r7, r7, r5, lsr #1          @// U source increment
    sub         r0, r0, r5, lsr #1          @// V source increment
    sub         r8, r8, r5                  @// Destination increment (2 bytes
                                            @// are written per chroma sample)
    mov         r5, r5, lsr #1              @// r5 = chroma samples per row

yuv420sp_vu_row_loop_uv:
    mov         r6, r5                      @// r6 = samples left in this row

yuv420sp_vu_col_loop_uv:
    pld         [r1, #128]
    pld         [r2, #128]
    vld1.8      {d1}, [r1]!                 @// d1 = 8 U bytes
    vld1.8      {d0}, [r2]!                 @// d0 = 8 V bytes
    vst2.8      {d0, d1}, [r3]!             @// Store V0 U0 V1 U1 ... (16 bytes)
    sub         r6, r6, #8
    cmp         r6, #7
    bgt         yuv420sp_vu_col_loop_uv     @// Loop while 8 or more samples remain

    cmp         r6, #0
    beq         yuv420sp_vu_row_loop_end_uv
    @// 1..7 samples remain: step both sources back by (8 - r6) bytes and
    @// the interleaved destination back by twice that, so that one more
    @// full 8 + 8 byte iteration ends exactly at the end of the row.
    rsb         r6, r6, #8
    sub         r1, r1, r6
    sub         r2, r2, r6
    sub         r3, r3, r6, lsl #1

    vld1.8      {d1}, [r1]!
    vld1.8      {d0}, [r2]!
    vst2.8      {d0, d1}, [r3]!

yuv420sp_vu_row_loop_end_uv:
    add         r1, r1, r7                  @// Next U row
    add         r2, r2, r0                  @// Next V row (uses u2_stridev)
    add         r3, r3, r8                  @// Next destination row
    subs        r4, r4, #1
    bgt         yuv420sp_vu_row_loop_uv

yuv420sp_vu_exit:
    @//POP THE REGISTERS
    ldmfd       sp!, {r4-r8, pc}