;//
;//
;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;// This file contains two routines, both guarded by the ARM1136JS variant:
;//   armVCM4P10_InvTransformDequantLumaDC4x4   - in-place transform and
;//                                               dequantisation of a 4x4 block
;//   omxVCM4P10_TransformDequantLumaDCFromPair - entry point that calls
;//                                               armVCM4P10_UnpackBlock4x4 and
;//                                               then the routine above
;//

;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Import/Export symbols required from/to other files
;// (For example tables)

        IMPORT armVCM4P10_UnpackBlock4x4
        IMPORT armVCM4P10_QPDivTable
        IMPORT armVCM4P10_VMatrixQPModTable

        M_VARIANTS ARM1136JS

;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
;//
;// Register usage note:
;// The symbolic names below deliberately alias the same physical registers
;// at different stages of the computation (for example in00, trRow00,
;// rowOp00, trCol00, colOp00 and out00 are all r2).  A name is only valid
;// during its own stage; the instruction order in the function body is what
;// keeps the aliases from clobbering each other.


;// Guarding implementation by the processor name

    IF  ARM1136JS


;//Input Registers
pData               RN  0       ;// Pointer to the 4x4 block (read and written in place)
QP                  RN  1       ;// Index used to look up both QP tables

;//Output Registers


;//Local Scratch Registers

;// Packed Input pixels
in00                RN  2                   ;// Src[0] & Src[1]
in02                RN  3                   ;// Src[2] & Src[3]
in10                RN  4                   ;// Src[4] & Src[5]
in12                RN  5                   ;// Src[6] & Src[7]
in20                RN  6                   ;// Src[8] & Src[9]
in22                RN  7                   ;// Src[10] & Src[11]
in30                RN  8                   ;// Src[12] & Src[13]
in32                RN  9                   ;// Src[14] & Src[15]

;// Transpose for Row operations (Rows to cols)
trRow00             RN  2
trRow10             RN  10
trRow02             RN  3
trRow12             RN  5
trRow20             RN  11
trRow30             RN  12
trRow32             RN  14                  ;// r14 (lr) is used as a data register
trRow22             RN  7

;// Intermediate calculations
rowSum1             RN  4
rowSum2             RN  6
rowDiff1            RN  8
rowDiff2            RN  9


;// Row operated pixels
rowOp00             RN  2
rowOp10             RN  10
rowOp20             RN  11
rowOp30             RN  12
rowOp02             RN  3
rowOp12             RN  5
rowOp22             RN  7
rowOp32             RN  14

;// Transpose for column operations
trCol00             RN  2
trCol02             RN  3
trCol10             RN  4
trCol12             RN  5
trCol20             RN  6
trCol22             RN  7
trCol30             RN  8
trCol32             RN  9

;// Intermediate calculations
colSum1             RN  10
colSum2             RN  11
colDiff1            RN  12
colDiff2            RN  14


;// Column operated pixels
colOp00             RN  2
colOp02             RN  3
colOp10             RN  4
colOp12             RN  5
colOp20             RN  6
colOp22             RN  7
colOp30             RN  8
colOp32             RN  9

;// Temporary scratch variables
pQPDivTable         RN  0                   ;// Reuses r0 once pData has been saved on the stack
pQPModTable         RN  11
Shift               RN  10
Scale               RN  14
Round               RN  0                   ;// Reuses r0 after pQPDivTable has been consumed

temp1               RN  10
temp2               RN  11
temp3               RN  12
temp4               RN  1                   ;// Reuses r1; QP is last read by the Scale table load



;// InvTransformed and Dequantized pixels
out00               RN  2
out02               RN  3
out10               RN  4
out12               RN  5
out20               RN  6
out22               RN  7
out30               RN  8
out32               RN  9


        ;//------------------------------------------------------------------
        ;// armVCM4P10_InvTransformDequantLumaDC4x4
        ;//
        ;// In:   r0 = pData  pointer to 8 words, each holding two packed
        ;//                   16-bit values (16 values, a 4x4 block)
        ;//       r1 = QP     byte index into armVCM4P10_QPDivTable and
        ;//                   armVCM4P10_VMatrixQPModTable
        ;// Out:  the 8 words at pData are overwritten with the result.
        ;//       No return value is set.
        ;//
        ;// Each pass applies an add/subtract-only butterfly to the values
        ;// (a 4x4 Hadamard-style transform), once along each dimension, and
        ;// the result is then scaled as (x * (Scale << Shift) + 2) >> 2.
        ;//
        ;// NOTE(review): r4-r12 and r14 are written below.  The function
        ;// relies on M_START/M_END (armCOMM_s.h) to save and restore the
        ;// callee-saved registers and lr - confirm against the macro.
        ;//------------------------------------------------------------------

        ;// Allocate stack memory required by the function
        ;// (one slot, used to park pData while r0 is reused as scratch)
        M_ALLOC4    pDataOnStack, 4

        ;// Write function header
        M_START armVCM4P10_InvTransformDequantLumaDC4x4,r11

        ;******************************************************************
        ;// The strategy used in implementing the transform is as follows:*
        ;// Load the 4x4 block into 8 registers                           *
        ;// Transpose the 4x4 matrix                                      *
        ;// Perform the row operations (on columns) using SIMD            *
        ;// Transpose the 4x4 result matrix                               *
        ;// Perform the column operations                                 *
        ;// Store the 4x4 block at one go                                 *
        ;******************************************************************

        ;// Load all the 4x4 pixels

        LDMIA   pData,{in00,in02,in10,in12,in20,in22,in30,in32}

        ;//*****************************************************************
        ;//
        ;// Transpose the matrix in order to perform row ops as column ops
        ;// Input:   in[][] = original matrix
        ;// Output:  trRow[][]= transposed matrix
        ;// Step1: Obtain the LL part of the transposed matrix
        ;// Step2: Obtain the HL part
        ;// step3: Obtain the LH part
        ;// Step4: Obtain the HH part
        ;//
        ;// PKHTB d,a,b,ASR #16 yields [top(a) : top(b)]
        ;// PKHBT d,a,b,LSL #16 yields [bottom(b) : bottom(a)]
        ;//
        ;// Ordering matters: trRow00 aliases in00 (r2), so the PKHTB that
        ;// still needs the top half of in00 is issued before the PKHBT that
        ;// overwrites r2.  The same pattern applies to each pair below.
        ;//
        ;//*****************************************************************

        ;// LL 2x2 transposed matrix
        ;//   d0 d1 - -
        ;//   d4 d5 - -
        ;//   -  -  - -
        ;//   -  -  - -

        PKHTB   trRow10,in10,in00,ASR #16               ;// [5  4] = [f5:f1]
        PKHBT   trRow00,in00,in10,LSL #16               ;// [1  0] = [f4:f0]

        ;// HL 2x2 transposed matrix
        ;//    -   -   - -
        ;//    -   -   - -
        ;//    d8  d9  - -
        ;//   d12 d13  - -


        PKHTB   trRow30,in12,in02,ASR #16               ;// [13 12] = [7 3]
        PKHBT   trRow20,in02,in12,LSL #16               ;// [9 8] = [6 2]

        ;// LH 2x2 transposed matrix
        ;//   - - d2 d3
        ;//   - - d6 d7
        ;//   - - -  -
        ;//   - - -  -

        PKHBT   trRow02,in20,in30,LSL #16               ;// [3  2] = [f12:f8]
        PKHTB   trRow12,in30,in20,ASR #16               ;// [7 6] = [f13:f9]




        ;// HH 2x2 transposed matrix
        ;//  - -   -   -
        ;//  - -   -   -
        ;//  - -  d10 d11
        ;//  - -  d14 d15

        PKHTB   trRow32,in32,in22,ASR #16               ;// [15 14] = [15 11]
        PKHBT   trRow22,in22,in32,LSL #16               ;// [11 10] = [14 10]


        ;****************************************
        ;// Row Operations (Performed on columns)
        ;****************************************


        ;// SIMD operations on first two columns(two rows of the original matrix)
        ;// Each SADD16/SSUB16 processes two packed 16-bit lanes at once.

        SADD16      rowSum1,trRow00,trRow10                ;// (c0+c1)
        SADD16      rowSum2,trRow20,trRow30                ;// (c2+c3)
        SSUB16      rowDiff1,trRow00,trRow10               ;// (c0-c1)
        SSUB16      rowDiff2,trRow20,trRow30               ;// (c2-c3)
        SADD16      rowOp00,rowSum1,rowSum2                ;// (c0+c1+c2+c3)
        SSUB16      rowOp10,rowSum1,rowSum2                ;// (c0+c1-c2-c3)
        SSUB16      rowOp20,rowDiff1,rowDiff2              ;// (c0-c1-c2+c3)
        SADD16      rowOp30,rowDiff1,rowDiff2              ;// (c0-c1+c2-c3)


        ;// SIMD operations on next two columns(next two rows of the original matrix)

        SADD16      rowSum1,trRow02,trRow12                ;// (c0+c1)
        SADD16      rowSum2,trRow22,trRow32                ;// (c2+c3)
        SSUB16      rowDiff1,trRow02,trRow12               ;// (c0-c1)
        SSUB16      rowDiff2,trRow22,trRow32               ;// (c2-c3)
        SADD16      rowOp02,rowSum1,rowSum2                ;// (c0+c1+c2+c3)
        SSUB16      rowOp12,rowSum1,rowSum2                ;// (c0+c1-c2-c3)
        SSUB16      rowOp22,rowDiff1,rowDiff2              ;// (c0-c1-c2+c3)
        SADD16      rowOp32,rowDiff1,rowDiff2              ;// (c0-c1+c2-c3)



        ;*****************************************************************
        ;// Transpose the resultant matrix
        ;//  Input:  rowOp[][]
        ;//  Output: trCol[][]
        ;//  Same PKHTB-before-PKHBT pattern as the first transpose.
        ;*****************************************************************

        ;// LL 2x2 transposed matrix
        ;//   d0 d1 - -
        ;//   d4 d5 - -
        ;//   -  -  - -
        ;//   -  -  - -

        PKHTB   trCol10,rowOp10,rowOp00,ASR #16           ;// [5  4] = [f5:f1]
        PKHBT   trCol00,rowOp00,rowOp10,LSL #16           ;// [1  0] = [f4:f0]

        ;// HL 2x2 transposed matrix
        ;//    -   -   - -
        ;//    -   -   - -
        ;//    d8  d9  - -
        ;//   d12 d13  - -


        PKHTB   trCol30,rowOp12,rowOp02,ASR #16          ;// [13 12] = [7 3]
        PKHBT   trCol20,rowOp02,rowOp12,LSL #16          ;// [9 8] = [6 2]

        ;// LH 2x2 transposed matrix
        ;//   - - d2 d3
        ;//   - - d6 d7
        ;//   - - -  -
        ;//   - - -  -

        PKHBT   trCol02,rowOp20,rowOp30,LSL #16          ;// [3  2] = [f12:f8]
        PKHTB   trCol12,rowOp30,rowOp20,ASR #16          ;// [7 6] = [f13:f9]




        ;// HH 2x2 transposed matrix
        ;//  - -   -   -
        ;//  - -   -   -
        ;//  - -  d10 d11
        ;//  - -  d14 d15

        PKHTB   trCol32,rowOp32,rowOp22,ASR #16            ;// [15 14] = [15 11]
        PKHBT   trCol22,rowOp22,rowOp32,LSL #16            ;// [11 10] = [14 10]


        ;*******************************
        ;// Column Operations
        ;*******************************

        ;//--------------------------------------------------------------------------------------
        ;// Store pData(RN0) on stack and restore it only at the final store back
        ;// This frees up a register (RN0) which is used to reduce number of intermediate stalls
        ;//--------------------------------------------------------------------------------------
        M_STR       pData,pDataOnStack


        ;// SIMD operations on first two columns(two rows of the original matrix)

        SADD16      colSum1,trCol00,trCol10                ;// (c0+c1)
        SADD16      colSum2,trCol20,trCol30                ;// (c2+c3)
        SSUB16      colDiff1,trCol00,trCol10               ;// (c0-c1)
        SSUB16      colDiff2,trCol20,trCol30               ;// (c2-c3)
        SADD16      colOp00,colSum1,colSum2                ;// (c0+c1+c2+c3)
        SSUB16      colOp10,colSum1,colSum2                ;// (c0+c1-c2-c3)
        SSUB16      colOp20,colDiff1,colDiff2              ;// (c0-c1-c2+c3)
        SADD16      colOp30,colDiff1,colDiff2              ;// (c0-c1+c2-c3)


        ;// SIMD operations on next two columns(next two rows of the original matrix)
        ;// The table loads are interleaved with the arithmetic.  Their position
        ;// is constrained by register aliasing: pQPModTable is r11 (= colSum2) and
        ;// Shift is r10 (= colSum1), so they are loaded only after the last use of
        ;// colSum1/colSum2; Scale is r14 (= colDiff2), so it is loaded after the
        ;// last use of colDiff2.

        LDR         pQPDivTable, =armVCM4P10_QPDivTable    ;// QP Division look-up-table base pointer
        SADD16      colSum1,trCol02,trCol12                ;// (c0+c1)
        SADD16      colSum2,trCol22,trCol32                ;// (c2+c3)
        SSUB16      colDiff1,trCol02,trCol12               ;// (c0-c1)
        SSUB16      colDiff2,trCol22,trCol32               ;// (c2-c3)
        SADD16      colOp02,colSum1,colSum2                ;// (c0+c1+c2+c3)
        SSUB16      colOp12,colSum1,colSum2                ;// (c0+c1-c2-c3)
        LDR         pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer
        LDRSB       Shift, [pQPDivTable, QP]               ;// Shift = pQPDivTable[QP]
        SSUB16      colOp22,colDiff1,colDiff2              ;// (c0-c1-c2+c3)
        SADD16      colOp32,colDiff1,colDiff2              ;// (c0-c1+c2-c3)


        LDRSB       Scale, [pQPModTable, QP]               ;// Scale = pQPModTable[QP]; last read of QP (r1)

        ;//----------------------------------------------------------------------
        ;//
        ;// <Dequantize> improves on the c-reference code
        ;// Both the cases i.e., Shift>=0 and Shift<0 cases are covered together
        ;// We do not subtract 2 from Shift as in C reference, instead perform a
        ;// Scale << Shift once in the beginning and do a right shift by a
        ;// constant 2 after the Multiplication. The value of Round would be 2
        ;//
        ;// By doing this we avoid the Branches required and also
        ;// reduce the code size substantially
        ;//
        ;// For every packed pair the code computes
        ;//     low  = (B(x) * Scale + Round) >> 2      via SMLABB + ASR #2
        ;//     high = (T(x) * Scale + Round) >> 2      via SMLATB + PKHBT LSL #14
        ;// The high lane is not shifted explicitly: PKHBT with LSL #14 places
        ;// bits [17:2] of the product into the top halfword, which is the same
        ;// as an arithmetic shift right by 2 truncated to 16 bits.
        ;//
        ;// SMLAxB multiplies by the bottom halfword of Scale only.
        ;// NOTE(review): this assumes (Scale << Shift) fits in a signed 16-bit
        ;// value for every valid QP - confirm against the two tables.
        ;//
        ;//----------------------------------------------------------------------

        MOV         Round, #2                               ;// Round = 2 (overwrites pQPDivTable in r0)
        LSL         Scale, Scale, Shift                     ;// Scale = Scale << Shift


        ;// Row 1
        SMLABB  temp1, colOp00, Scale, Round                ;// Temp1 = B(colOp00) * Scale + Round
        SMLABB  temp3, colOp02, Scale, Round                ;// Temp3 = B(colOp02) * Scale + Round
        SMLATB  temp2, colOp00, Scale, Round                ;// Temp2 = T(colOp00) * Scale + Round
        SMLATB  temp4, colOp02, Scale, Round                ;// Temp4 = T(colOp02) * Scale + Round

        ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
        ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
        PKHBT   out00,  temp1, temp2, LSL #14               ;// out00 = | Temp2 >> 2 | Temp1 |
        PKHBT   out02,  temp3, temp4, LSL #14               ;// out02 = | Temp4 >> 2 | Temp3 |


        ;// Row 2
        SMLABB  temp1, colOp10, Scale, Round                ;// Temp1 = B(colOp10) * Scale + Round
        SMLABB  temp3, colOp12, Scale, Round                ;// Temp3 = B(colOp12) * Scale + Round
        SMLATB  temp2, colOp10, Scale, Round                ;// Temp2 = T(colOp10) * Scale + Round
        SMLATB  temp4, colOp12, Scale, Round                ;// Temp4 = T(colOp12) * Scale + Round

        ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
        ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
        PKHBT   out10,  temp1, temp2, LSL #14               ;// out10 = | Temp2 >> 2 | Temp1 |
        PKHBT   out12,  temp3, temp4, LSL #14               ;// out12 = | Temp4 >> 2 | Temp3 |

        ;// Row 3
        SMLABB  temp1, colOp20, Scale, Round                ;// Temp1 = B(colOp20) * Scale + Round
        SMLABB  temp3, colOp22, Scale, Round                ;// Temp3 = B(colOp22) * Scale + Round
        SMLATB  temp2, colOp20, Scale, Round                ;// Temp2 = T(colOp20) * Scale + Round
        SMLATB  temp4, colOp22, Scale, Round                ;// Temp4 = T(colOp22) * Scale + Round

        ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
        ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
        PKHBT   out20,  temp1, temp2, LSL #14               ;// out20 = | Temp2 >> 2 | Temp1 |
        PKHBT   out22,  temp3, temp4, LSL #14               ;// out22 = | Temp4 >> 2 | Temp3 |

        ;// Row 4
        SMLABB  temp1, colOp30, Scale, Round                ;// Temp1 = B(colOp30) * Scale + Round
        SMLABB  temp3, colOp32, Scale, Round                ;// Temp3 = B(colOp32) * Scale + Round
        SMLATB  temp2, colOp30, Scale, Round                ;// Temp2 = T(colOp30) * Scale + Round
        SMLATB  temp4, colOp32, Scale, Round                ;// Temp4 = T(colOp32) * Scale + Round

        M_LDR   pData,pDataOnStack                          ;// Restore pData pointer from stack
                                                            ;// (overwrites Round in r0; Round is no longer needed)
        ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
        ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
        PKHBT   out30,  temp1, temp2, LSL #14               ;// out30 = | Temp2 >> 2 | Temp1 |
        PKHBT   out32,  temp3, temp4, LSL #14               ;// out32 = | Temp4 >> 2 | Temp3 |



        ;***************************
        ;// Store all the 4x4 pixels
        ;***************************

store_coeff                                                 ;// Label is not referenced within this file

        STMIA   pData,{out00,out02,out10,out12,out20,out22,out30,out32}



        ;// Set return value
        ;// (none: r0 still holds pData on exit)


        ;// Write function tail
        M_END

    ENDIF                                                   ;//ARM1136JS


;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4

;// Guarding implementation by the processor name




;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
;//
;// In:   r0 = ppSrc  passed through unchanged to armVCM4P10_UnpackBlock4x4
;//       r1 = pDst   passed through unchanged to armVCM4P10_UnpackBlock4x4,
;//                   then used as the in-place block pointer (pData) for
;//                   armVCM4P10_InvTransformDequantLumaDC4x4
;//       r2 = QP     passed as the QP argument of
;//                   armVCM4P10_InvTransformDequantLumaDC4x4
;// Out:  r0 = OMX_Sts_NoErr (always; no argument checking is done here)
;//
;// NOTE(review): armVCM4P10_UnpackBlock4x4 is defined in another file;
;// presumably it expands the data addressed through ppSrc into the 4x4
;// block at pDst - confirm against its implementation.

;//Input Registers
ppSrc       RN  0
pDst        RN  1
QPR2        RN  2

;//Output Registers
result      RN  0

;//Local Scratch Registers
pDstR4      RN  4               ;// Copy of pDst kept across the first call
pDstR0      RN  0               ;// pDst as first argument of the second call
QPR1        RN  1               ;// QP as second argument of the second call
QPR5        RN  5               ;// Copy of QP kept across the first call

;// Guarding implementation by the processor name

    IF ARM1136JS

        ;// Allocate stack memory required by the function


        ;// Write function header
        M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5

        ;// r1 and r2 are copied to r4/r5 before the call so that they can be
        ;// recovered afterwards; this relies on the callee preserving r4/r5.
        MOV     pDstR4,pDst                         ;// Saving register r1
        MOV     QPR5,QPR2                           ;// Saving register r2
        BL      armVCM4P10_UnpackBlock4x4

        MOV     pDstR0,pDstR4                       ;// Setting up register r0
        MOV     QPR1,QPR5                           ;// Setting up register r1
        BL      armVCM4P10_InvTransformDequantLumaDC4x4


        ;// Set return value
        MOV     result,#OMX_Sts_NoErr

        ;// Write function tail
        M_END


    ENDIF                                           ;//ARM1136JS


    END