;// omxVCM4P10_TransformDequantLumaDCFromPair_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;//
;//
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;// Syntax / target: ARM armasm syntax, ARMv6 SIMD instructions
;// (PKHBT/PKHTB, SADD16/SSUB16, SMLAxy); code is only emitted for the
;// ARM1136JS variant.
;//

;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Import/Export symbols required from/to other files
;// (For example tables)

        IMPORT  armVCM4P10_UnpackBlock4x4
        IMPORT  armVCM4P10_QPDivTable
        IMPORT  armVCM4P10_VMatrixQPModTable

        M_VARIANTS ARM1136JS

;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;//----------------------------------------------------------------------
;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
;//
;// In-place transform and dequantisation of a 4x4 block of 16-bit
;// values (sixteen halfwords = eight words).
;//
;// In:    r0 = pData  pointer to the 4x4 block; read with one LDMIA and
;//                    overwritten with one STMIA at the end
;//        r1 = QP     index into armVCM4P10_QPDivTable and
;//                    armVCM4P10_VMatrixQPModTable (byte tables)
;// Out:   the eight words at pData are replaced by the result
;// Uses:  r0-r12 and r14 as working registers, plus one 4-byte stack
;//        slot (pDataOnStack).
;//        NOTE(review): saving/restoring of r4-r11 and lr is assumed to
;//        be done by M_START ...,r11 / M_END (macros from armCOMM_s.h,
;//        not visible here) - confirm.
;//
;// The instruction order is hand-scheduled and most registers carry
;// several aliases with disjoint lifetimes; do not reorder instructions
;// without re-checking every alias below.
;//----------------------------------------------------------------------

;// Guarding implementation by the processor name

        IF ARM1136JS


;//Input Registers
pData           RN 0
QP              RN 1

;//Output Registers


;//Local Scratch Registers

;// Packed input pixels (two 16-bit values per register, low index in
;// the bottom halfword)
in00            RN 2                    ;// Src[0]  & Src[1]
in02            RN 3                    ;// Src[2]  & Src[3]
in10            RN 4                    ;// Src[4]  & Src[5]
in12            RN 5                    ;// Src[6]  & Src[7]
in20            RN 6                    ;// Src[8]  & Src[9]
in22            RN 7                    ;// Src[10] & Src[11]
in30            RN 8                    ;// Src[12] & Src[13]
in32            RN 9                    ;// Src[14] & Src[15]

;// Transpose for row operations (rows to columns)
trRow00         RN 2
trRow10         RN 10
trRow02         RN 3
trRow12         RN 5
trRow20         RN 11
trRow30         RN 12
trRow32         RN 14
trRow22         RN 7

;// Intermediate calculations
rowSum1         RN 4
rowSum2         RN 6
rowDiff1        RN 8
rowDiff2        RN 9


;// Row operated pixels
rowOp00         RN 2
rowOp10         RN 10
rowOp20         RN 11
rowOp30         RN 12
rowOp02         RN 3
rowOp12         RN 5
rowOp22         RN 7
rowOp32         RN 14

;// Transpose for column operations
trCol00         RN 2
trCol02         RN 3
trCol10         RN 4
trCol12         RN 5
trCol20         RN 6
trCol22         RN 7
trCol30         RN 8
trCol32         RN 9

;// Intermediate calculations
colSum1         RN 10
colSum2         RN 11
colDiff1        RN 12
colDiff2        RN 14


;// Column operated pixels
colOp00         RN 2
colOp02         RN 3
colOp10         RN 4
colOp12         RN 5
colOp20         RN 6
colOp22         RN 7
colOp30         RN 8
colOp32         RN 9

;// Temporary scratch variables
pQPDivTable     RN 0                    ;// reuses r0 once pData is on the stack
pQPModTable     RN 11                   ;// reuses colSum2 after its last read
Shift           RN 10                   ;// reuses colSum1 after its last read
Scale           RN 14                   ;// reuses colDiff2 after its last read
Round           RN 0                    ;// reuses pQPDivTable after Shift is loaded

temp1           RN 10                   ;// reuses Shift after Scale << Shift
temp2           RN 11
temp3           RN 12
temp4           RN 1                    ;// reuses QP after both table loads



;// InvTransformed and dequantized pixels
out00           RN 2
out02           RN 3
out10           RN 4
out12           RN 5
out20           RN 6
out22           RN 7
out30           RN 8
out32           RN 9




        ;// Allocate stack memory required by the function
        M_ALLOC4 pDataOnStack, 4

        ;// Write function header
        M_START armVCM4P10_InvTransformDequantLumaDC4x4,r11

        ;//******************************************************************
        ;// The strategy used in implementing the transform is as follows:
        ;//   Load the 4x4 block into 8 registers
        ;//   Transpose the 4x4 matrix
        ;//   Perform the row operations (on columns) using SIMD
        ;//   Transpose the 4x4 result matrix
        ;//   Perform the column operations
        ;//   Scale and round every element (dequantize)
        ;//   Store the 4x4 block at one go
        ;//******************************************************************

        ;// Load all the 4x4 pixels

        LDMIA   pData,{in00,in02,in10,in12,in20,in22,in30,in32}

        ;//*****************************************************************
        ;//
        ;// Transpose the matrix in order to perform row ops as column ops
        ;// Input:  in[][]    = original matrix
        ;// Output: trRow[][] = transposed matrix
        ;// Step1: Obtain the LL part of the transposed matrix
        ;// Step2: Obtain the HL part
        ;// Step3: Obtain the LH part
        ;// Step4: Obtain the HH part
        ;//
        ;// In every pair below the instruction that overwrites one of its
        ;// own sources is issued second, so both packs see original data.
        ;// Comment notation: [transposed elements] = [source elements],
        ;// written [top halfword  bottom halfword].
        ;//
        ;//*****************************************************************

        ;// LL 2x2 transposed matrix
        ;//   d0  d1  -   -
        ;//   d4  d5  -   -
        ;//   -   -   -   -
        ;//   -   -   -   -

        PKHTB   trRow10,in10,in00,ASR #16       ;// [5 4] = [f5:f1]
        PKHBT   trRow00,in00,in10,LSL #16       ;// [1 0] = [f4:f0]

        ;// HL 2x2 transposed matrix
        ;//   -   -   -   -
        ;//   -   -   -   -
        ;//   d8  d9  -   -
        ;//   d12 d13 -   -

        PKHTB   trRow30,in12,in02,ASR #16       ;// [13 12] = [f7:f3]
        PKHBT   trRow20,in02,in12,LSL #16       ;// [9 8]   = [f6:f2]

        ;// LH 2x2 transposed matrix
        ;//   -   -   d2  d3
        ;//   -   -   d6  d7
        ;//   -   -   -   -
        ;//   -   -   -   -

        PKHBT   trRow02,in20,in30,LSL #16       ;// [3 2] = [f12:f8]
        PKHTB   trRow12,in30,in20,ASR #16       ;// [7 6] = [f13:f9]

        ;// HH 2x2 transposed matrix
        ;//   -   -   -   -
        ;//   -   -   -   -
        ;//   -   -   d10 d11
        ;//   -   -   d14 d15

        PKHTB   trRow32,in32,in22,ASR #16       ;// [15 14] = [f15:f11]
        PKHBT   trRow22,in22,in32,LSL #16       ;// [11 10] = [f14:f10]


        ;//****************************************
        ;// Row Operations (Performed on columns)
        ;//****************************************

        ;// Each SADD16/SSUB16 works on two independent 16-bit lanes, so
        ;// every instruction below processes two rows of the original
        ;// matrix at once. c0..c3 are the four elements of one such row.

        ;// SIMD operations on first two columns (two rows of the original matrix)

        SADD16  rowSum1,trRow00,trRow10         ;// (c0+c1)
        SADD16  rowSum2,trRow20,trRow30         ;// (c2+c3)
        SSUB16  rowDiff1,trRow00,trRow10        ;// (c0-c1)
        SSUB16  rowDiff2,trRow20,trRow30        ;// (c2-c3)
        SADD16  rowOp00,rowSum1,rowSum2         ;// (c0+c1+c2+c3)
        SSUB16  rowOp10,rowSum1,rowSum2         ;// (c0+c1-c2-c3)
        SSUB16  rowOp20,rowDiff1,rowDiff2       ;// (c0-c1-c2+c3)
        SADD16  rowOp30,rowDiff1,rowDiff2       ;// (c0-c1+c2-c3)


        ;// SIMD operations on next two columns (next two rows of the original matrix)

        SADD16  rowSum1,trRow02,trRow12         ;// (c0+c1)
        SADD16  rowSum2,trRow22,trRow32         ;// (c2+c3)
        SSUB16  rowDiff1,trRow02,trRow12        ;// (c0-c1)
        SSUB16  rowDiff2,trRow22,trRow32        ;// (c2-c3)
        SADD16  rowOp02,rowSum1,rowSum2         ;// (c0+c1+c2+c3)
        SSUB16  rowOp12,rowSum1,rowSum2         ;// (c0+c1-c2-c3)
        SSUB16  rowOp22,rowDiff1,rowDiff2       ;// (c0-c1-c2+c3)
        SADD16  rowOp32,rowDiff1,rowDiff2       ;// (c0-c1+c2-c3)



        ;//*****************************************************************
        ;// Transpose the resultant matrix
        ;// Input:  rowOp[][]
        ;// Output: trCol[][]
        ;// Same pack pattern as the first transpose; element numbers in
        ;// the comments index the rowOp matrix (source) and the trCol
        ;// matrix (destination).
        ;//*****************************************************************

        ;// LL 2x2 transposed matrix
        ;//   d0  d1  -   -
        ;//   d4  d5  -   -
        ;//   -   -   -   -
        ;//   -   -   -   -

        PKHTB   trCol10,rowOp10,rowOp00,ASR #16 ;// [5 4] = [f5:f1]
        PKHBT   trCol00,rowOp00,rowOp10,LSL #16 ;// [1 0] = [f4:f0]

        ;// HL 2x2 transposed matrix
        ;//   -   -   -   -
        ;//   -   -   -   -
        ;//   d8  d9  -   -
        ;//   d12 d13 -   -

        PKHTB   trCol30,rowOp12,rowOp02,ASR #16 ;// [13 12] = [f7:f3]
        PKHBT   trCol20,rowOp02,rowOp12,LSL #16 ;// [9 8]   = [f6:f2]

        ;// LH 2x2 transposed matrix
        ;//   -   -   d2  d3
        ;//   -   -   d6  d7
        ;//   -   -   -   -
        ;//   -   -   -   -

        PKHBT   trCol02,rowOp20,rowOp30,LSL #16 ;// [3 2] = [f12:f8]
        PKHTB   trCol12,rowOp30,rowOp20,ASR #16 ;// [7 6] = [f13:f9]

        ;// HH 2x2 transposed matrix
        ;//   -   -   -   -
        ;//   -   -   -   -
        ;//   -   -   d10 d11
        ;//   -   -   d14 d15

        PKHTB   trCol32,rowOp32,rowOp22,ASR #16 ;// [15 14] = [f15:f11]
        PKHBT   trCol22,rowOp22,rowOp32,LSL #16 ;// [11 10] = [f14:f10]


        ;//*******************************
        ;// Column Operations
        ;//*******************************

        ;//--------------------------------------------------------------------------------------
        ;// Store pData (r0) on stack and restore it only at the final store back.
        ;// This frees up a register (r0) which is used to reduce number of intermediate stalls.
        ;//--------------------------------------------------------------------------------------
        M_STR   pData,pDataOnStack


        ;// SIMD operations on first two columns (two rows of the original matrix)

        SADD16  colSum1,trCol00,trCol10         ;// (c0+c1)
        SADD16  colSum2,trCol20,trCol30         ;// (c2+c3)
        SSUB16  colDiff1,trCol00,trCol10        ;// (c0-c1)
        SSUB16  colDiff2,trCol20,trCol30        ;// (c2-c3)
        SADD16  colOp00,colSum1,colSum2         ;// (c0+c1+c2+c3)
        SSUB16  colOp10,colSum1,colSum2         ;// (c0+c1-c2-c3)
        SSUB16  colOp20,colDiff1,colDiff2       ;// (c0-c1-c2+c3)
        SADD16  colOp30,colDiff1,colDiff2       ;// (c0-c1+c2-c3)


        ;// SIMD operations on next two columns (next two rows of the original matrix).
        ;// The table-pointer and table-entry loads are interleaved with the
        ;// arithmetic; each one is placed after the last read of the register
        ;// it overwrites (r0: pData already on stack; r11: colSum2; r10: colSum1).

        LDR     pQPDivTable, =armVCM4P10_QPDivTable         ;// QP Division look-up-table base pointer
        SADD16  colSum1,trCol02,trCol12         ;// (c0+c1)
        SADD16  colSum2,trCol22,trCol32         ;// (c2+c3)
        SSUB16  colDiff1,trCol02,trCol12        ;// (c0-c1)
        SSUB16  colDiff2,trCol22,trCol32        ;// (c2-c3)
        SADD16  colOp02,colSum1,colSum2         ;// (c0+c1+c2+c3)
        SSUB16  colOp12,colSum1,colSum2         ;// (c0+c1-c2-c3)
        LDR     pQPModTable, =armVCM4P10_VMatrixQPModTable  ;// QP Modulo look-up-table base pointer
        LDRSB   Shift, [pQPDivTable, QP]        ;// Shift = pQPDivTable[QP] (signed byte)
        SSUB16  colOp22,colDiff1,colDiff2       ;// (c0-c1-c2+c3)
        SADD16  colOp32,colDiff1,colDiff2       ;// (c0-c1+c2-c3)


        ;// r14 (colDiff2) was last read just above, so it can now hold Scale
        LDRSB   Scale, [pQPModTable, QP]        ;// Scale = pQPModTable[QP] (signed byte)

        ;//----------------------------------------------------------------------
        ;//
        ;// <Dequantize> improves on the C reference code.
        ;// Both cases, i.e. Shift>=0 and Shift<0, are covered together.
        ;// We do not subtract 2 from Shift as in the C reference; instead we
        ;// perform Scale << Shift once in the beginning and do a right shift
        ;// by a constant 2 after the multiplication. The value of Round is 2.
        ;//
        ;// By doing this we avoid the branches required and also
        ;// reduce the code size substantially.
        ;//
        ;// Per element: out = (coef * (Scale << Shift) + 2) >> 2
        ;//
        ;// NOTE(review): SMLAxB reads only the bottom halfword of Scale as a
        ;// signed 16-bit value, so Scale << Shift is assumed to fit in 16
        ;// bits for every valid QP - depends on table contents defined
        ;// elsewhere; confirm.
        ;//
        ;//----------------------------------------------------------------------

        MOV     Round, #2                       ;// Round = 2
        LSL     Scale, Scale, Shift             ;// Scale = Scale << Shift


        ;// Every row below follows the same pattern:
        ;//   temp1/temp3 hold the bottom-halfword products; they are shifted
        ;//   right by 2 explicitly and land in the bottom half of the result.
        ;//   temp2/temp4 hold the top-halfword products; PKHBT ... LSL #14
        ;//   places bits [17:2] of them in the top half of the result, which
        ;//   is the low 16 bits of (temp >> 2), so no separate ASR is needed.

        ;// Row 1
        SMLABB  temp1, colOp00, Scale, Round    ;// temp1 = B(colOp00) * Scale + Round
        SMLABB  temp3, colOp02, Scale, Round    ;// temp3 = B(colOp02) * Scale + Round
        SMLATB  temp2, colOp00, Scale, Round    ;// temp2 = T(colOp00) * Scale + Round
        SMLATB  temp4, colOp02, Scale, Round    ;// temp4 = T(colOp02) * Scale + Round

        ASR     temp1, temp1, #2                ;// temp1 = temp1 >> 2
        ASR     temp3, temp3, #2                ;// temp3 = temp3 >> 2
        PKHBT   out00, temp1, temp2, LSL #14    ;// out00 = | temp2 >> 2 | temp1 |
        PKHBT   out02, temp3, temp4, LSL #14    ;// out02 = | temp4 >> 2 | temp3 |


        ;// Row 2
        SMLABB  temp1, colOp10, Scale, Round    ;// temp1 = B(colOp10) * Scale + Round
        SMLABB  temp3, colOp12, Scale, Round    ;// temp3 = B(colOp12) * Scale + Round
        SMLATB  temp2, colOp10, Scale, Round    ;// temp2 = T(colOp10) * Scale + Round
        SMLATB  temp4, colOp12, Scale, Round    ;// temp4 = T(colOp12) * Scale + Round

        ASR     temp1, temp1, #2                ;// temp1 = temp1 >> 2
        ASR     temp3, temp3, #2                ;// temp3 = temp3 >> 2
        PKHBT   out10, temp1, temp2, LSL #14    ;// out10 = | temp2 >> 2 | temp1 |
        PKHBT   out12, temp3, temp4, LSL #14    ;// out12 = | temp4 >> 2 | temp3 |

        ;// Row 3
        SMLABB  temp1, colOp20, Scale, Round    ;// temp1 = B(colOp20) * Scale + Round
        SMLABB  temp3, colOp22, Scale, Round    ;// temp3 = B(colOp22) * Scale + Round
        SMLATB  temp2, colOp20, Scale, Round    ;// temp2 = T(colOp20) * Scale + Round
        SMLATB  temp4, colOp22, Scale, Round    ;// temp4 = T(colOp22) * Scale + Round

        ASR     temp1, temp1, #2                ;// temp1 = temp1 >> 2
        ASR     temp3, temp3, #2                ;// temp3 = temp3 >> 2
        PKHBT   out20, temp1, temp2, LSL #14    ;// out20 = | temp2 >> 2 | temp1 |
        PKHBT   out22, temp3, temp4, LSL #14    ;// out22 = | temp4 >> 2 | temp3 |

        ;// Row 4
        SMLABB  temp1, colOp30, Scale, Round    ;// temp1 = B(colOp30) * Scale + Round
        SMLABB  temp3, colOp32, Scale, Round    ;// temp3 = B(colOp32) * Scale + Round
        SMLATB  temp2, colOp30, Scale, Round    ;// temp2 = T(colOp30) * Scale + Round
        SMLATB  temp4, colOp32, Scale, Round    ;// temp4 = T(colOp32) * Scale + Round

        ;// Round (r0) has been read for the last time above, so r0 can
        ;// take the destination pointer back.
        M_LDR   pData,pDataOnStack              ;// Restore pData pointer from stack
        ASR     temp1, temp1, #2                ;// temp1 = temp1 >> 2
        ASR     temp3, temp3, #2                ;// temp3 = temp3 >> 2
        PKHBT   out30, temp1, temp2, LSL #14    ;// out30 = | temp2 >> 2 | temp1 |
        PKHBT   out32, temp3, temp4, LSL #14    ;// out32 = | temp4 >> 2 | temp3 |



        ;//***************************
        ;// Store all the 4x4 pixels
        ;//***************************

store_coeff

        STMIA   pData,{out00,out02,out10,out12,out20,out22,out30,out32}



        ;// Set return value (none: this helper returns its result in memory)


        ;// Write function tail
        M_END

        ENDIF                                   ;//ARM1136JS



;//----------------------------------------------------------------------
;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
;//
;// Calls armVCM4P10_UnpackBlock4x4 with the caller's r0/r1 unchanged,
;// then runs armVCM4P10_InvTransformDequantLumaDC4x4 in place on the
;// block at pDst with the given QP.
;//
;// In:    r0 = ppSrc  passed through untouched to armVCM4P10_UnpackBlock4x4
;//        r1 = pDst   destination 4x4 block of 16-bit values
;//        r2 = QP     quantisation parameter (table index in the helper)
;// Out:   r0 = OMX_Sts_NoErr (always)
;// Uses:  r4 and r5 keep pDst and QP alive across the first call.
;//        NOTE(review): armVCM4P10_UnpackBlock4x4 is defined elsewhere;
;//        presumably it expands the packed source stream into the block
;//        at pDst and advances *ppSrc - confirm against its source.
;//        NOTE(review): saving of r4-r5 and lr is assumed to be done by
;//        M_START ...,r5 / M_END - confirm in armCOMM_s.h.
;//----------------------------------------------------------------------

;//Input Registers
ppSrc           RN 0
pDst            RN 1
QPR2            RN 2

;//Output Registers
result          RN 0

;//Local Scratch Registers
pDstR4          RN 4
pDstR0          RN 0
QPR1            RN 1
QPR5            RN 5

;// Guarding implementation by the processor name

        IF ARM1136JS

        ;// Allocate stack memory required by the function (none)


        ;// Write function header
        M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5

        MOV     pDstR4,pDst                     ;// Keep pDst (r1) across the call
        MOV     QPR5,QPR2                       ;// Keep QP (r2) across the call
        BL      armVCM4P10_UnpackBlock4x4

        MOV     pDstR0,pDstR4                   ;// r0 = block pointer for the helper
        MOV     QPR1,QPR5                       ;// r1 = QP for the helper
        BL      armVCM4P10_InvTransformDequantLumaDC4x4


        ;// Set return value
        MOV     result,#OMX_Sts_NoErr

        ;// Write function tail
        M_END


        ENDIF                                   ;//ARM1136JS


        END