armVCM4P10_TransformResidual4x4_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_TransformResidual4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;//
;//
;//
;// Description:
;// Transform Residual 4x4 Coefficients
;//
;//

;//---------------------------------------------------------------------------
;// Syntax : ARM armasm (';' starts a comment, labels/aliases at column 0).
;// Target : built only when the ARM1136JS variant is selected; uses the
;//          ARMv6 packed-halfword instructions PKHBT/PKHTB, SADD16, SSUB16
;//          and SHADD16.
;//
;// armVCM4P10_TransformResidual4x4
;//   In   : r0 = pDst  - receives eight 32-bit words (one STMIA)
;//          r1 = pSrc  - supplies eight 32-bit words (one LDMIA)
;//   Data : every word is treated as two signed 16-bit lanes, i.e. a 4x4
;//          block of 16-bit coefficients, s[0]..s[15] in the notes below.
;//          The notes assume the lower-indexed element of each pair sits in
;//          the low halfword, as the original annotations state (the
;//          little-endian layout) - confirm for the target configuration.
;//   Uses : r1-r12 and r14 as working registers. Saving/restoring is left
;//          to M_START/M_END (defined in armCOMM_s.h, not visible here);
;//          NOTE(review): r14 is overwritten, so this relies on the macros
;//          preserving the return address - confirm against the header.
;//   Note : all eight words are loaded before the first store is issued.
;//          Packed adds/subtracts wrap modulo 2^16 (no saturation).
;//
;// Notation: {a:b} is a 32-bit register with a in bits [31:16] and b in
;//           bits [15:0].
;//---------------------------------------------------------------------------


;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS

;// Import symbols required from other files
;// (For example tables)
;//   - none are imported by this file


;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;// Guarding implementation by the processor name

        IF ARM1136JS

;//---------------------------------------------------------------------------
;// Constants for the final (x + 32) >> 6 stage
;//---------------------------------------------------------------------------

TRES_ROUND_BIAS EQU     0x00208020      ;// hi lane: +32; lo lane: +32 +32768
TRES_LO_MASK    EQU     0xffff03ff      ;// keep hi lane, keep 10 bits of lo lane
TRES_LO_UNBIAS  EQU     0x200           ;// 32768 >> 6, subtracted from lo lane only

;//---------------------------------------------------------------------------
;// Register aliases.
;// Several alias families deliberately share the same physical registers;
;// a family is only written once the values of the family it overlays have
;// been consumed. The instruction order below depends on that.
;//---------------------------------------------------------------------------

;// Arguments
pDst            RN 0
pSrc            RN 1

;// Constant zero pair; overlays pSrc once the block has been loaded
zeroPair        RN 1

;// Packed input block, one row per register pair
in00            RN 2                    ;// {s[1]:s[0]}
in02            RN 3                    ;// {s[3]:s[2]}
in10            RN 4                    ;// {s[5]:s[4]}
in12            RN 5                    ;// {s[7]:s[6]}
in20            RN 6                    ;// {s[9]:s[8]}
in22            RN 7                    ;// {s[11]:s[10]}
in30            RN 8                    ;// {s[13]:s[12]}
in32            RN 9                    ;// {s[15]:s[14]}

;// First transpose (input of the row pass)
trRow00         RN 2
trRow02         RN 3
trRow10         RN 10
trRow12         RN 5
trRow20         RN 11
trRow22         RN 7
trRow30         RN 12
trRow32         RN 14

;// Row-pass butterfly temporaries
e0              RN 4
e1              RN 6
e2              RN 8
e3              RN 9

;// Row-pass results (same registers as the trRow family)
rowOp00         RN 2
rowOp02         RN 3
rowOp10         RN 10
rowOp12         RN 5
rowOp20         RN 11
rowOp22         RN 7
rowOp30         RN 12
rowOp32         RN 14

;// Second transpose (input of the column pass)
trCol00         RN 2
trCol02         RN 3
trCol10         RN 4
trCol12         RN 5
trCol20         RN 6
trCol22         RN 7
trCol30         RN 8
trCol32         RN 9

;// Column-pass butterfly temporaries
g0              RN 10
g1              RN 11
g2              RN 12
g3              RN 14

;// Column-pass results (same registers as the trCol family)
colOp00         RN 2
colOp02         RN 3
colOp10         RN 4
colOp12         RN 5
colOp20         RN 6
colOp22         RN 7
colOp30         RN 8
colOp32         RN 9

;// Scaling-stage scratch (overlays g0..g3 after the column pass)
temp1           RN 10                   ;// declared but not referenced below
roundBias       RN 11                   ;// holds TRES_ROUND_BIAS
loUnbias        RN 12                   ;// holds TRES_LO_UNBIAS
loMask          RN 14                   ;// holds TRES_LO_MASK

;// Final output words (same registers as the colOp family)
out00           RN 2
out02           RN 3
out10           RN 4
out12           RN 5
out20           RN 6
out22           RN 7
out30           RN 8
out32           RN 9


        ;// No stack locals are declared for this function

        ;// Function header
        M_START armVCM4P10_TransformResidual4x4,r11

        ;//-------------------------------------------------------------------
        ;// Outline
        ;//   1. load the 4x4 block into eight registers
        ;//   2. transpose it, so each SIMD lane walks along one input row
        ;//   3. run the 1-D butterfly on the rows
        ;//   4. transpose the intermediate result
        ;//   5. run the same 1-D butterfly on the columns
        ;//   6. round, shift right by 6 and store the block in one go
        ;//-------------------------------------------------------------------

        ;// 1. Fetch the complete block
        LDMIA   pSrc, {in00,in02,in10,in12,in20,in22,in30,in32}

        ;// pSrc is no longer needed: reuse its register as a zero operand.
        ;// SHADD16 x, zeroPair computes (x + 0) >> 1 per lane, which is how
        ;// the arithmetic ">> 1" terms of the butterfly are formed.
        MOV     zeroPair, #0

        ;//-------------------------------------------------------------------
        ;// 2. First transpose: in[][] -> trRow[][]
        ;//    PKHBT d,n,m,LSL #16 -> d = {m.lo : n.lo}
        ;//    PKHTB d,n,m,ASR #16 -> d = {n.hi : m.hi}
        ;//    In each pair the instruction whose destination overlays a
        ;//    still-needed source is issued second.
        ;//-------------------------------------------------------------------

        ;// Top-left quadrant of the transposed block (from in00, in10)
        PKHTB   trRow10, in10, in00, ASR #16    ;// {s[5]:s[1]}
        PKHBT   trRow00, in00, in10, LSL #16    ;// {s[4]:s[0]}

        ;// Bottom-left quadrant (from in02, in12); must run before the
        ;// next pair, which overwrites the registers of in02 and in12
        PKHTB   trRow30, in12, in02, ASR #16    ;// {s[7]:s[3]}
        PKHBT   trRow20, in02, in12, LSL #16    ;// {s[6]:s[2]}

        ;// Top-right quadrant (from in20, in30)
        PKHBT   trRow02, in20, in30, LSL #16    ;// {s[12]:s[8]}
        PKHTB   trRow12, in30, in20, ASR #16    ;// {s[13]:s[9]}

        ;// Bottom-right quadrant (from in22, in32)
        PKHTB   trRow32, in32, in22, ASR #16    ;// {s[15]:s[11]}
        PKHBT   trRow22, in22, in32, LSL #16    ;// {s[14]:s[10]}

        ;//-------------------------------------------------------------------
        ;// 3. Row pass. With d0..d3 the four elements of one input row:
        ;//        e0 = d0 + d2            f0 = e0 + e3
        ;//        e1 = d0 - d2            f1 = e1 + e2
        ;//        e2 = (d1 >> 1) - d3     f2 = e1 - e2
        ;//        e3 = d1 + (d3 >> 1)     f3 = e0 - e3
        ;//    Each instruction handles two input rows at once (one per lane).
        ;//-------------------------------------------------------------------

        ;// Input rows 0 and 1
        SADD16  e0, trRow00, trRow20            ;// e0 = d0 + d2
        SSUB16  e1, trRow00, trRow20            ;// e1 = d0 - d2
        SHADD16 e2, trRow10, zeroPair           ;// d1 >> 1
        SHADD16 e3, trRow30, zeroPair           ;// d3 >> 1 (both halvings issued
                                                ;// before their first use)
        SSUB16  e2, e2, trRow30                 ;// e2 = (d1 >> 1) - d3
        SADD16  e3, e3, trRow10                 ;// e3 = d1 + (d3 >> 1)
        SADD16  rowOp00, e0, e3                 ;// f0
        SADD16  rowOp10, e1, e2                 ;// f1
        SSUB16  rowOp20, e1, e2                 ;// f2
        SSUB16  rowOp30, e0, e3                 ;// f3

        ;// Input rows 2 and 3
        SADD16  e0, trRow02, trRow22            ;// e0 = d0 + d2
        SSUB16  e1, trRow02, trRow22            ;// e1 = d0 - d2
        SHADD16 e2, trRow12, zeroPair           ;// d1 >> 1
        SHADD16 e3, trRow32, zeroPair           ;// d3 >> 1
        SSUB16  e2, e2, trRow32                 ;// e2 = (d1 >> 1) - d3
        SADD16  e3, e3, trRow12                 ;// e3 = d1 + (d3 >> 1)
        SADD16  rowOp02, e0, e3                 ;// f0
        SADD16  rowOp12, e1, e2                 ;// f1
        SSUB16  rowOp22, e1, e2                 ;// f2
        SSUB16  rowOp32, e0, e3                 ;// f3

        ;//-------------------------------------------------------------------
        ;// 4. Second transpose: rowOp[][] -> trCol[][]
        ;//    Same packing pattern and the same ordering constraints as the
        ;//    first transpose.
        ;//-------------------------------------------------------------------

        ;// Top-left quadrant
        PKHTB   trCol10, rowOp10, rowOp00, ASR #16  ;// {rowOp10.hi:rowOp00.hi}
        PKHBT   trCol00, rowOp00, rowOp10, LSL #16  ;// {rowOp10.lo:rowOp00.lo}

        ;// Bottom-left quadrant; reads rowOp02/rowOp12 before the next pair
        ;// overwrites their registers
        PKHTB   trCol30, rowOp12, rowOp02, ASR #16  ;// {rowOp12.hi:rowOp02.hi}
        PKHBT   trCol20, rowOp02, rowOp12, LSL #16  ;// {rowOp12.lo:rowOp02.lo}

        ;// Top-right quadrant
        PKHBT   trCol02, rowOp20, rowOp30, LSL #16  ;// {rowOp30.lo:rowOp20.lo}
        PKHTB   trCol12, rowOp30, rowOp20, ASR #16  ;// {rowOp30.hi:rowOp20.hi}

        ;// Bottom-right quadrant
        PKHTB   trCol32, rowOp32, rowOp22, ASR #16  ;// {rowOp32.hi:rowOp22.hi}
        PKHBT   trCol22, rowOp22, rowOp32, LSL #16  ;// {rowOp32.lo:rowOp22.lo}

        ;//-------------------------------------------------------------------
        ;// 5. Column pass: the same butterfly as step 3, now applied down
        ;//    the columns (g0..g3 play the role of e0..e3).
        ;//-------------------------------------------------------------------

        ;// Columns 0 and 1
        SADD16  g0, trCol00, trCol20            ;// g0 = r0 + r2
        SSUB16  g1, trCol00, trCol20            ;// g1 = r0 - r2
        SHADD16 g2, trCol10, zeroPair           ;// r1 >> 1
        SHADD16 g3, trCol30, zeroPair           ;// r3 >> 1
        SSUB16  g2, g2, trCol30                 ;// g2 = (r1 >> 1) - r3
        SADD16  g3, g3, trCol10                 ;// g3 = r1 + (r3 >> 1)
        SADD16  colOp00, g0, g3                 ;// row 0 = g0 + g3
        SADD16  colOp10, g1, g2                 ;// row 1 = g1 + g2
        SSUB16  colOp20, g1, g2                 ;// row 2 = g1 - g2
        SSUB16  colOp30, g0, g3                 ;// row 3 = g0 - g3

        ;// Columns 2 and 3
        SADD16  g0, trCol02, trCol22            ;// g0 = r0 + r2
        SSUB16  g1, trCol02, trCol22            ;// g1 = r0 - r2
        SHADD16 g2, trCol12, zeroPair           ;// r1 >> 1
        SHADD16 g3, trCol32, zeroPair           ;// r3 >> 1
        SSUB16  g2, g2, trCol32                 ;// g2 = (r1 >> 1) - r3
        SADD16  g3, g3, trCol12                 ;// g3 = r1 + (r3 >> 1)
        SADD16  colOp02, g0, g3                 ;// row 0 = g0 + g3
        SADD16  colOp12, g1, g2                 ;// row 1 = g1 + g2
        SSUB16  colOp22, g1, g2                 ;// row 2 = g1 - g2
        SSUB16  colOp32, g0, g3                 ;// row 3 = g0 - g3

        ;//-------------------------------------------------------------------
        ;// 6. Final value: (colOp[i][j] + 32) >> 6 for both lanes of a word,
        ;//    using one 32-bit shift per word.
        ;//
        ;//    hi lane: the 32-bit ASR #6 sign-extends it correctly, so only
        ;//             the rounding term 32 is added.
        ;//    lo lane: the shift would drag the low 6 bits of the hi lane
        ;//             into bits [15:10] and lose the lo lane's sign, so
        ;//               - 32768 is added with the rounding term, making the
        ;//                 lane a non-negative 16-bit quantity;
        ;//               - after the shift, bits [15:10] are masked off,
        ;//                 leaving a 10-bit unsigned result;
        ;//               - 512 (= 32768 >> 6) is subtracted to undo the bias.
        ;//-------------------------------------------------------------------

        LDR     roundBias, =TRES_ROUND_BIAS
        LDR     loMask,    =TRES_LO_MASK
        MOV     loUnbias,  #TRES_LO_UNBIAS      ;// 2^9, lo lane only

        ;// Output row 0
        SADD16  colOp00, colOp00, roundBias
        SADD16  colOp02, colOp02, roundBias
        AND     colOp00, loMask, colOp00, ASR #6
        AND     colOp02, loMask, colOp02, ASR #6
        SSUB16  out00, colOp00, loUnbias
        SSUB16  out02, colOp02, loUnbias

        ;// Output row 1
        SADD16  colOp10, colOp10, roundBias
        SADD16  colOp12, colOp12, roundBias
        AND     colOp10, loMask, colOp10, ASR #6
        AND     colOp12, loMask, colOp12, ASR #6
        SSUB16  out10, colOp10, loUnbias
        SSUB16  out12, colOp12, loUnbias

        ;// Output row 2
        SADD16  colOp20, colOp20, roundBias
        SADD16  colOp22, colOp22, roundBias
        AND     colOp20, loMask, colOp20, ASR #6
        AND     colOp22, loMask, colOp22, ASR #6
        SSUB16  out20, colOp20, loUnbias
        SSUB16  out22, colOp22, loUnbias

        ;// Output row 3
        SADD16  colOp30, colOp30, roundBias
        SADD16  colOp32, colOp32, roundBias
        AND     colOp30, loMask, colOp30, ASR #6
        AND     colOp32, loMask, colOp32, ASR #6
        SSUB16  out30, colOp30, loUnbias
        SSUB16  out32, colOp32, loUnbias

        ;// Write the whole 4x4 result with a single store-multiple
        STMIA   pDst, {out00,out02,out10,out12,out20,out22,out30,out32}

        ;// No return value is set by this routine

End

        ;// Function tail
        M_END

        ENDIF                                   ;// ARM1136JS


;// End of the processor-specific implementation

        END