;//
;//
;// File Name:  armVCM4P10_TransformResidual4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Description:
;// Transform Residual 4x4 Coefficients
;//
;//


;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS

;// Import symbols required from other files
;// (For example tables)


;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;// Guarding implementation by the processor name

    IF  ARM1136JS

;//--------------------------------------------------------------------
;// armVCM4P10_TransformResidual4x4
;//
;// In:   pDst (r0)  destination; eight 32-bit words are written (STMIA)
;//       pSrc (r1)  source;      eight 32-bit words are read    (LDMIA)
;//
;// Every word is treated as two signed 16-bit lanes, written below as
;// [hi : lo].  With the block stored row-major, word 2*r + c/2 holds
;// the elements (r,c) in lo and (r,c+1) in hi.
;//
;// Registers written by the body: r1-r12 and r14.  r0 is only read.
;// NOTE(review): r14 is used as scratch, so M_START/M_END (armCOMM_s.h)
;// must save and restore lr in addition to r4-r11 - confirm in macro.
;//
;// One 1-D pass on (d0,d1,d2,d3), applied first to rows, then columns:
;//       e0 = d0 + d2            f0 = e0 + e3
;//       e1 = d0 - d2            f1 = e1 + e2
;//       e2 = (d1 >> 1) - d3     f2 = e1 - e2
;//       e3 = d1 + (d3 >> 1)     f3 = e0 - e3
;// Every result is finally rounded as (x + 32) >> 6.
;//--------------------------------------------------------------------

;// Arguments
pDst            RN  0
pSrc            RN  1

;// Block held in natural (row-major) layout.
;// blkRC = [ element(R, C+1) : element(R, C) ]
;// Used for the loaded input, for the input and output of the column
;// pass, and for the stored result.
blk00           RN  2
blk02           RN  3
blk10           RN  4
blk12           RN  5
blk20           RN  6
blk22           RN  7
blk30           RN  8
blk32           RN  9

;// Block held in transposed layout.
;// xpsRC = [ T(R, C+1) : T(R, C) ], where T is the transpose of the
;// matrix most recently held in blk**.  Used for the input and output
;// of the row pass.
xps00           RN  2
xps10           RN  10
xps20           RN  11
xps30           RN  12
xps02           RN  3
xps12           RN  5
xps22           RN  7
xps32           RN  14

;// Butterfly temporaries of the row pass (e0..e3 above)
rowE0           RN  4
rowE1           RN  6
rowE2           RN  8
rowE3           RN  9

;// Butterfly temporaries of the column pass (e0..e3 above)
colG0           RN  10
colG1           RN  11
colG2           RN  12
colG3           RN  14

;// Both lanes zero; reuses r1 once pSrc has been consumed
zeroPair        RN  1

;// Rounding-stage constants (registers are free after the column pass)
biasPair        RN  11
unbias          RN  12
laneMask        RN  14

;// hi lane: +32.  lo lane: +32 and +32768 (bias to an unsigned value)
kRoundBias      EQU 0x00208020
;// Clears bits 15:10, where the 32-bit ASR #6 drops bits of the hi lane
kSpillMask      EQU 0xffff03ff
;// 32768 >> 6: removes the unsigned bias from the shifted lo lane
kUnbias         EQU 0x200


        ;// No stack workspace is declared for this function

        ;// Function prologue (macro from armCOMM_s.h)
        M_START armVCM4P10_TransformResidual4x4,r11

        ;//------------------------------------------------------------
        ;// Outline:
        ;//   1. load the whole 4x4 block into eight registers
        ;//   2. transpose, so one SIMD op covers two rows at a time
        ;//   3. row pass
        ;//   4. transpose back
        ;//   5. column pass (two columns per SIMD op)
        ;//   6. round and store the whole block with one STMIA
        ;//------------------------------------------------------------

        ;// 1. Load Src[0..15] as eight packed pairs
        LDMIA   pSrc, {blk00,blk02,blk10,blk12,blk20,blk22,blk30,blk32}

        ;// SHADD16 x, zeroPair yields x >> 1 in each lane
        MOV     zeroPair, #0

        ;//------------------------------------------------------------
        ;// 2. Transpose blk** into xps**
        ;//    PKHBT collects the two lo lanes, PKHTB the two hi lanes.
        ;//    A PKHTB always precedes the PKHBT that overwrites one of
        ;//    its sources; keep this order.
        ;//------------------------------------------------------------

        ;// Source columns 0..3 of rows 0 and 1
        PKHTB   xps10, blk10, blk00, ASR #16    ;// [Src5  : Src1 ]
        PKHBT   xps00, blk00, blk10, LSL #16    ;// [Src4  : Src0 ]
        PKHTB   xps30, blk12, blk02, ASR #16    ;// [Src7  : Src3 ]
        PKHBT   xps20, blk02, blk12, LSL #16    ;// [Src6  : Src2 ]

        ;// Source columns 0..3 of rows 2 and 3
        PKHBT   xps02, blk20, blk30, LSL #16    ;// [Src12 : Src8 ]
        PKHTB   xps12, blk30, blk20, ASR #16    ;// [Src13 : Src9 ]
        PKHTB   xps32, blk32, blk22, ASR #16    ;// [Src15 : Src11]
        PKHBT   xps22, blk22, blk32, LSL #16    ;// [Src14 : Src10]

        ;//------------------------------------------------------------
        ;// 3. Row pass
        ;//    Here xps0*, xps1*, xps2*, xps3* are d0, d1, d2, d3 of two
        ;//    source rows (one row per lane).
        ;//------------------------------------------------------------

        ;// Rows 0 (lo lane) and 1 (hi lane)
        SADD16  rowE0, xps00, xps20             ;// e0 = d0 + d2
        SSUB16  rowE1, xps00, xps20             ;// e1 = d0 - d2
        SHADD16 rowE2, xps10, zeroPair          ;// d1 >> 1
        SHADD16 rowE3, xps30, zeroPair          ;// d3 >> 1
        SSUB16  rowE2, rowE2, xps30             ;// e2 = (d1 >> 1) - d3
        SADD16  rowE3, rowE3, xps10             ;// e3 = d1 + (d3 >> 1)
        SADD16  xps00, rowE0, rowE3             ;// f0 = e0 + e3
        SADD16  xps10, rowE1, rowE2             ;// f1 = e1 + e2
        SSUB16  xps20, rowE1, rowE2             ;// f2 = e1 - e2
        SSUB16  xps30, rowE0, rowE3             ;// f3 = e0 - e3

        ;// Rows 2 (lo lane) and 3 (hi lane)
        SADD16  rowE0, xps02, xps22             ;// e0 = d0 + d2
        SSUB16  rowE1, xps02, xps22             ;// e1 = d0 - d2
        SHADD16 rowE2, xps12, zeroPair          ;// d1 >> 1
        SHADD16 rowE3, xps32, zeroPair          ;// d3 >> 1
        SSUB16  rowE2, rowE2, xps32             ;// e2 = (d1 >> 1) - d3
        SADD16  rowE3, rowE3, xps12             ;// e3 = d1 + (d3 >> 1)
        SADD16  xps02, rowE0, rowE3             ;// f0 = e0 + e3
        SADD16  xps12, rowE1, rowE2             ;// f1 = e1 + e2
        SSUB16  xps22, rowE1, rowE2             ;// f2 = e1 - e2
        SSUB16  xps32, rowE0, rowE3             ;// f3 = e0 - e3

        ;//------------------------------------------------------------
        ;// 4. Transpose xps** back into blk**
        ;//    Same packing pattern and ordering constraint as step 2.
        ;//    Afterwards blkRC = [ f(C+1) of row R : f(C) of row R ].
        ;//------------------------------------------------------------

        PKHTB   blk10, xps10, xps00, ASR #16    ;// row 1, columns 0-1
        PKHBT   blk00, xps00, xps10, LSL #16    ;// row 0, columns 0-1
        PKHTB   blk30, xps12, xps02, ASR #16    ;// row 3, columns 0-1
        PKHBT   blk20, xps02, xps12, LSL #16    ;// row 2, columns 0-1

        PKHBT   blk02, xps20, xps30, LSL #16    ;// row 0, columns 2-3
        PKHTB   blk12, xps30, xps20, ASR #16    ;// row 1, columns 2-3
        PKHTB   blk32, xps32, xps22, ASR #16    ;// row 3, columns 2-3
        PKHBT   blk22, xps22, xps32, LSL #16    ;// row 2, columns 2-3

        ;//------------------------------------------------------------
        ;// 5. Column pass
        ;//    blk0c, blk1c, blk2c, blk3c are d0, d1, d2, d3 of two
        ;//    adjacent columns (one column per lane).
        ;//------------------------------------------------------------

        ;// Columns 0 (lo lane) and 1 (hi lane)
        SADD16  colG0, blk00, blk20             ;// e0 = d0 + d2
        SSUB16  colG1, blk00, blk20             ;// e1 = d0 - d2
        SHADD16 colG2, blk10, zeroPair          ;// d1 >> 1
        SHADD16 colG3, blk30, zeroPair          ;// d3 >> 1
        SSUB16  colG2, colG2, blk30             ;// e2 = (d1 >> 1) - d3
        SADD16  colG3, colG3, blk10             ;// e3 = d1 + (d3 >> 1)
        SADD16  blk00, colG0, colG3             ;// f0 = e0 + e3
        SADD16  blk10, colG1, colG2             ;// f1 = e1 + e2
        SSUB16  blk20, colG1, colG2             ;// f2 = e1 - e2
        SSUB16  blk30, colG0, colG3             ;// f3 = e0 - e3

        ;// Columns 2 (lo lane) and 3 (hi lane)
        SADD16  colG0, blk02, blk22             ;// e0 = d0 + d2
        SSUB16  colG1, blk02, blk22             ;// e1 = d0 - d2
        SHADD16 colG2, blk12, zeroPair          ;// d1 >> 1
        SHADD16 colG3, blk32, zeroPair          ;// d3 >> 1
        SSUB16  colG2, colG2, blk32             ;// e2 = (d1 >> 1) - d3
        SADD16  colG3, colG3, blk12             ;// e3 = d1 + (d3 >> 1)
        SADD16  blk02, colG0, colG3             ;// f0 = e0 + e3
        SADD16  blk12, colG1, colG2             ;// f1 = e1 + e2
        SSUB16  blk22, colG1, colG2             ;// f2 = e1 - e2
        SSUB16  blk32, colG0, colG3             ;// f3 = e0 - e3

        ;//------------------------------------------------------------
        ;// 6. Rounding: each lane becomes (x + 32) >> 6
        ;//
        ;// There is no per-lane shift, so each register is shifted as
        ;// one 32-bit value:
        ;//   SADD16 biasPair  hi += 32; lo += 32 + 32768 (now unsigned)
        ;//   ASR #6           hi lane is correct; lo lane has its value
        ;//                    in bits 9:0 and hi-lane bits in 15:10
        ;//   AND laneMask     clears bits 15:10
        ;//   SSUB16 unbias    lo -= 512 (32768 >> 6); hi -= 0
        ;// NOTE(review): this relies on (x + 32) still fitting a signed
        ;// 16-bit lane - confirm the input range guarantees that.
        ;//------------------------------------------------------------

        LDR     biasPair, =kRoundBias
        LDR     laneMask, =kSpillMask
        MOV     unbias, #kUnbias

        ;// Row 0
        SADD16  blk00, blk00, biasPair
        SADD16  blk02, blk02, biasPair
        AND     blk00, laneMask, blk00, ASR #6
        AND     blk02, laneMask, blk02, ASR #6
        SSUB16  blk00, blk00, unbias
        SSUB16  blk02, blk02, unbias

        ;// Row 1
        SADD16  blk10, blk10, biasPair
        SADD16  blk12, blk12, biasPair
        AND     blk10, laneMask, blk10, ASR #6
        AND     blk12, laneMask, blk12, ASR #6
        SSUB16  blk10, blk10, unbias
        SSUB16  blk12, blk12, unbias

        ;// Row 2
        SADD16  blk20, blk20, biasPair
        SADD16  blk22, blk22, biasPair
        AND     blk20, laneMask, blk20, ASR #6
        AND     blk22, laneMask, blk22, ASR #6
        SSUB16  blk20, blk20, unbias
        SSUB16  blk22, blk22, unbias

        ;// Row 3
        SADD16  blk30, blk30, biasPair
        SADD16  blk32, blk32, biasPair
        AND     blk30, laneMask, blk30, ASR #6
        AND     blk32, laneMask, blk32, ASR #6
        SSUB16  blk30, blk30, unbias
        SSUB16  blk32, blk32, unbias

        ;// Store the finished block, row-major, in one transfer
        STMIA   pDst, {blk00,blk02,blk10,blk12,blk20,blk22,blk30,blk32}

        ;// No return value is set explicitly by this code

End

        ;// Function epilogue (macro from armCOMM_s.h)
        M_END

    ENDIF                                       ;// ARM1136JS


;// Guarding implementation by the processor name


    END