1;// 2;// This confidential and proprietary software may be used only as 3;// authorised by a licensing agreement from ARM Limited 4;// (C) COPYRIGHT 2004 ARM Limited 5;// ALL RIGHTS RESERVED 6;// The entire notice above must be reproduced on all authorised 7;// copies and copies may only be made to the extent permitted 8;// by a licensing agreement from ARM Limited. 9;// 10;// IDCT_s.s 11;// 12;// Inverse DCT module 13;// 14;// 15;// ALGORITHM DESCRIPTION 16;// 17;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each 18;// column and then a 1D IDCT for each row. 19;// 20;// The 8-point 1D IDCT is defined by 21;// f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2 22;// 23;// C(u) = 1/sqrt(2) if u=0 or 1 if u!=0 24;// c(u,x) = cos( (2x+1)*u*pi/16 ) 25;// 26;// We compute the 8-point 1D IDCT using the reverse of 27;// the Arai-Agui-Nakajima flow graph which we split into 28;// 5 stages named in reverse order to identify with the 29;// forward DCT. Direct inversion of the forward formulae 30;// in file FDCT_s.s gives: 31;// 32;// IStage 5: j(u) = T(u)*A(u) [ A(u)=4*C(u)*c(u,0) ] 33;// [ A(0) = 2*sqrt(2) 34;// A(u) = 4*cos(u*pi/16) for (u!=0) ] 35;// 36;// IStage 4: i0 = j0 i1 = j4 37;// i3 = (j2+j6)/2 i2 = (j2-j6)/2 38;// i7 = (j5+j3)/2 i4 = (j5-j3)/2 39;// i5 = (j1+j7)/2 i6 = (j1-j7)/2 40;// 41;// IStage 3: h0 = (i0+i1)/2 h1 = (i0-i1)/2 42;// h2 = (i2*sqrt2)-i3 h3 = i3 43;// h4 = cos(pi/8)*i4 + sin(pi/8)*i6 44;// h6 = -sin(pi/8)*i4 + cos(pi/8)*i6 45;// [ The above two lines rotate by -(pi/8) ] 46;// h5 = (i5-i7)/sqrt2 h7 = (i5+i7)/2 47;// 48;// IStage 2: g0 = (h0+h3)/2 g3 = (h0-h3)/2 49;// g1 = (h1+h2)/2 g2 = (h1-h2)/2 50;// g7 = h7 g6 = h6 - h7 51;// g5 = h5 - g6 g4 = h4 - g5 52;// 53;// IStage 1: f0 = (g0+g7)/2 f7 = (g0-g7)/2 54;// f1 = (g1+g6)/2 f6 = (g1-g6)/2 55;// f2 = (g2+g5)/2 f5 = (g2-g5)/2 56;// f3 = (g3+g4)/2 f4 = (g3-g4)/2 57;// 58;// Note that most coefficients are halved 3 times during the 59;// above calculation. 
;// We can rescale the algorithm, dividing
;// the input by 8 to remove the halvings.
;//
;// IStage 5:   j(u) = T(u)*A(u)/8
;//
;// IStage 4:   i0 = j0             i1 = j4
;//             i3 = j2 + j6        i2 = j2 - j6
;//             i7 = j5 + j3        i4 = j5 - j3
;//             i5 = j1 + j7        i6 = j1 - j7
;//
;// IStage 3:   h0 = i0 + i1        h1 = i0 - i1
;//             h2 = (i2*sqrt2)-i3  h3 = i3
;//             h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
;//             h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
;//             h5 = (i5-i7)*sqrt2  h7 = i5 + i7
;//
;// IStage 2:   g0 = h0 + h3        g3 = h0 - h3
;//             g1 = h1 + h2        g2 = h1 - h2
;//             g7 = h7             g6 = h6 - h7
;//             g5 = h5 - g6        g4 = h4 - g5
;//
;// IStage 1:   f0 = g0 + g7        f7 = g0 - g7
;//             f1 = g1 + g6        f6 = g1 - g6
;//             f2 = g2 + g5        f5 = g2 - g5
;//             f3 = g3 + g4        f4 = g3 - g4
;//
;// Note:
;// 1. The scaling by A(u)/8 can often be combined with inverse
;//    quantization. The column and row scalings can be combined.
;// 2. The flowgraph in the AAN paper has h4,g6 negated compared
;//    to the above code but is otherwise identical.
;// 3. The rotation by -pi/8 can be performed using three multiplies
;//    Eg  c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
;//       -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
;// 4. If |T(u)|<=1 then from the IDCT definition,
;//    |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
;//            = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
;//            = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
;//            = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
;//            = (approx)2.64
;//    So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
;//    The table below shows input patterns generating the maximum
;//    value of |f(u)| for input in the range |T(x)|<=1.
M=-1, P=+1 102;// InputPattern Max |f(x)| 103;// PPPPPPPP |f0| = 2.64 104;// PPPMMMMM |f1| = 2.64 105;// PPMMMPPP |f2| = 2.64 106;// PPMMPPMM |f3| = 2.64 107;// PMMPPMMP |f4| = 2.64 108;// PMMPMMPM |f5| = 2.64 109;// PMPPMPMP |f6| = 2.64 110;// PMPMPMPM |f7| = 2.64 111;// Note that this input pattern is the transpose of the 112;// corresponding max input patter for the FDCT. 113 114;// Arguments 115 116pSrc RN 0 ;// source data buffer 117Stride RN 1 ;// destination stride in bytes 118pDest RN 2 ;// destination data buffer 119pScale RN 3 ;// pointer to scaling table 120 121 122 ;// DCT Inverse Macro 123 ;// The DCT code should be parametrized according 124 ;// to the following inputs: 125 ;// $outsize = "u8" : 8-bit unsigned data saturated (0 to +255) 126 ;// "s9" : 16-bit signed data saturated to 9-bit (-256 to +255) 127 ;// "s16" : 16-bit signed data not saturated (max size ~+/-14273) 128 ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment 129 ;// "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment 130 ;// 131 ;// Inputs: 132 ;// pSrc = r0 = Pointer to input data 133 ;// Range is -256 to +255 (9-bit) 134 ;// Stride = r1 = Stride between input lines 135 ;// pDest = r2 = Pointer to output data 136 ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale 137 138 139 140 MACRO 141 M_IDCT $outsize, $inscale, $stride 142 LCLA SHIFT 143 144 145 IF ARM1136JS 146 147;// REGISTER ALLOCATION 148;// This is hard since we have 8 values, 9 free registers and each 149;// butterfly requires a temporary register. We also want to 150;// maintain register order so we can use LDM/STM. The table below 151;// summarises the register allocation that meets all these criteria. 152;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above. 
153;// 154;// r1 a01 g0 h0 155;// r4 b01 f0 g1 h1 i0 156;// r5 a23 f1 g2 i1 157;// r6 b23 f2 g3 h2 i2 158;// r7 a45 f3 h3 i3 159;// r8 b45 f4 g4 h4 i4 160;// r9 a67 f5 g5 h5 i5 161;// r10 b67 f6 g6 h6 i6 162;// r11 f7 g7 h7 i7 163;// 164ra01 RN 1 165rb01 RN 4 166ra23 RN 5 167rb23 RN 6 168ra45 RN 7 169rb45 RN 8 170ra67 RN 9 171rb67 RN 10 172rtmp RN 11 173csPiBy8 RN 12 ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ] 174LoopRR2 RN 14 ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ] 175;// Transpose allocation 176xft RN ra01 177xf0 RN rb01 178xf1 RN ra23 179xf2 RN rb23 180xf3 RN ra45 181xf4 RN rb45 182xf5 RN ra67 183xf6 RN rb67 184xf7 RN rtmp 185;// IStage 1 allocation 186xg0 RN xft 187xg1 RN xf0 188xg2 RN xf1 189xg3 RN xf2 190xgt RN xf3 191xg4 RN xf4 192xg5 RN xf5 193xg6 RN xf6 194xg7 RN xf7 195;// IStage 2 allocation 196xh0 RN xg0 197xh1 RN xg1 198xht RN xg2 199xh2 RN xg3 200xh3 RN xgt 201xh4 RN xg4 202xh5 RN xg5 203xh6 RN xg6 204xh7 RN xg7 205;// IStage 3,4 allocation 206xit RN xh0 207xi0 RN xh1 208xi1 RN xht 209xi2 RN xh2 210xi3 RN xh3 211xi4 RN xh4 212xi5 RN xh5 213xi6 RN xh6 214xi7 RN xh7 215 216 M_STR pDest, ppDest 217 IF "$stride"="s" 218 M_STR Stride, pStride 219 ENDIF 220 M_ADR pDest, pBlk 221 LDR csPiBy8, =0x30fc7642 222 LDR LoopRR2, =0x00005a82 223 224v6_idct_col$_F 225 ;// Load even values 226 LDR xi4, [pSrc], #4 ;// j0 227 LDR xi5, [pSrc, #4*16-4] ;// j4 228 LDR xi6, [pSrc, #2*16-4] ;// j2 229 LDR xi7, [pSrc, #6*16-4] ;// j6 230 231 ;// Scale Even Values 232 IF "$inscale"="s16" ;// 16x16 mul 233SHIFT SETA 12 234 LDR xi0, [pScale], #4 235 LDR xi1, [pScale, #4*16-4] 236 LDR xi2, [pScale, #2*16-4] 237 MOV xit, #1<<(SHIFT-1) 238 SMLABB xi3, xi0, xi4, xit 239 SMLATT xi4, xi0, xi4, xit 240 SMLABB xi0, xi1, xi5, xit 241 SMLATT xi5, xi1, xi5, xit 242 MOV xi3, xi3, ASR #SHIFT 243 PKHBT xi4, xi3, xi4, LSL #(16-SHIFT) 244 LDR xi3, [pScale, #6*16-4] 245 SMLABB xi1, xi2, xi6, xit 246 SMLATT xi6, xi2, xi6, xit 247 MOV xi0, xi0, ASR #SHIFT 248 PKHBT xi5, xi0, xi5, LSL #(16-SHIFT) 249 
SMLABB xi2, xi3, xi7, xit 250 SMLATT xi7, xi3, xi7, xit 251 MOV xi1, xi1, ASR #SHIFT 252 PKHBT xi6, xi1, xi6, LSL #(16-SHIFT) 253 MOV xi2, xi2, ASR #SHIFT 254 PKHBT xi7, xi2, xi7, LSL #(16-SHIFT) 255 ENDIF 256 IF "$inscale"="s32" ;// 32x16 mul 257SHIFT SETA (12+8-16) 258 MOV xit, #1<<(SHIFT-1) 259 LDR xi0, [pScale], #8 260 LDR xi1, [pScale, #0*32+4-8] 261 LDR xi2, [pScale, #4*32-8] 262 LDR xi3, [pScale, #4*32+4-8] 263 SMLAWB xi0, xi0, xi4, xit 264 SMLAWT xi1, xi1, xi4, xit 265 SMLAWB xi2, xi2, xi5, xit 266 SMLAWT xi3, xi3, xi5, xit 267 MOV xi0, xi0, ASR #SHIFT 268 PKHBT xi4, xi0, xi1, LSL #(16-SHIFT) 269 MOV xi2, xi2, ASR #SHIFT 270 PKHBT xi5, xi2, xi3, LSL #(16-SHIFT) 271 LDR xi0, [pScale, #2*32-8] 272 LDR xi1, [pScale, #2*32+4-8] 273 LDR xi2, [pScale, #6*32-8] 274 LDR xi3, [pScale, #6*32+4-8] 275 SMLAWB xi0, xi0, xi6, xit 276 SMLAWT xi1, xi1, xi6, xit 277 SMLAWB xi2, xi2, xi7, xit 278 SMLAWT xi3, xi3, xi7, xit 279 MOV xi0, xi0, ASR #SHIFT 280 PKHBT xi6, xi0, xi1, LSL #(16-SHIFT) 281 MOV xi2, xi2, ASR #SHIFT 282 PKHBT xi7, xi2, xi3, LSL #(16-SHIFT) 283 ENDIF 284 285 ;// Load odd values 286 LDR xi0, [pSrc, #1*16-4] ;// j1 287 LDR xi1, [pSrc, #7*16-4] ;// j7 288 LDR xi2, [pSrc, #5*16-4] ;// j5 289 LDR xi3, [pSrc, #3*16-4] ;// j3 290 291 IF {TRUE} 292 ;// shortcut if odd values 0 293 TEQ xi0, #0 294 TEQEQ xi1, #0 295 TEQEQ xi2, #0 296 TEQEQ xi3, #0 297 BEQ v6OddZero$_F 298 ENDIF 299 300 ;// Store scaled even values 301 STMIA pDest, {xi4, xi5, xi6, xi7} 302 303 ;// Scale odd values 304 IF "$inscale"="s16" 305 ;// Perform AAN Scale 306 LDR xi4, [pScale, #1*16-4] 307 LDR xi5, [pScale, #7*16-4] 308 LDR xi6, [pScale, #5*16-4] 309 SMLABB xi7, xi0, xi4, xit 310 SMLATT xi0, xi0, xi4, xit 311 SMLABB xi4, xi1, xi5, xit 312 SMLATT xi1, xi1, xi5, xit 313 MOV xi7, xi7, ASR #SHIFT 314 PKHBT xi0, xi7, xi0, LSL #(16-SHIFT) 315 LDR xi7, [pScale, #3*16-4] 316 SMLABB xi5, xi2, xi6, xit 317 SMLATT xi2, xi2, xi6, xit 318 MOV xi4, xi4, ASR #SHIFT 319 PKHBT xi1, xi4, xi1, LSL #(16-SHIFT) 
320 SMLABB xi6, xi3, xi7, xit 321 SMLATT xi3, xi3, xi7, xit 322 MOV xi5, xi5, ASR #SHIFT 323 PKHBT xi2, xi5, xi2, LSL #(16-SHIFT) 324 MOV xi6, xi6, ASR #SHIFT 325 PKHBT xi3, xi6, xi3, LSL #(16-SHIFT) 326 ENDIF 327 IF "$inscale"="s32" ;// 32x16 mul 328 LDR xi4, [pScale, #1*32-8] 329 LDR xi5, [pScale, #1*32+4-8] 330 LDR xi6, [pScale, #7*32-8] 331 LDR xi7, [pScale, #7*32+4-8] 332 SMLAWB xi4, xi4, xi0, xit 333 SMLAWT xi5, xi5, xi0, xit 334 SMLAWB xi6, xi6, xi1, xit 335 SMLAWT xi7, xi7, xi1, xit 336 MOV xi4, xi4, ASR #SHIFT 337 PKHBT xi0, xi4, xi5, LSL #(16-SHIFT) 338 MOV xi6, xi6, ASR #SHIFT 339 PKHBT xi1, xi6, xi7, LSL #(16-SHIFT) 340 LDR xi4, [pScale, #5*32-8] 341 LDR xi5, [pScale, #5*32+4-8] 342 LDR xi6, [pScale, #3*32-8] 343 LDR xi7, [pScale, #3*32+4-8] 344 SMLAWB xi4, xi4, xi2, xit 345 SMLAWT xi5, xi5, xi2, xit 346 SMLAWB xi6, xi6, xi3, xit 347 SMLAWT xi7, xi7, xi3, xit 348 MOV xi4, xi4, ASR #SHIFT 349 PKHBT xi2, xi4, xi5, LSL #(16-SHIFT) 350 MOV xi6, xi6, ASR #SHIFT 351 PKHBT xi3, xi6, xi7, LSL #(16-SHIFT) 352 ENDIF 353 354 LDR xit, =0x00010001 ;// rounding constant 355 SADD16 xi5, xi0, xi1 ;// (j1+j7)/2 356 SHADD16 xi5, xi5, xit 357 358 SSUB16 xi6, xi0, xi1 ;// j1-j7 359 SADD16 xi7, xi2, xi3 ;// (j5+j3)/2 360 SHADD16 xi7, xi7, xit 361 362 SSUB16 xi4, xi2, xi3 ;// j5-j3 363 364 SSUB16 xi3, xi5, xi7 ;// (i5-i7)/2 365 366 PKHBT xi0, xi6, xi4, LSL#16 ;// [i4,i6] row a 367 PKHTB xi1, xi4, xi6, ASR#16 ;// [i4,i6] row b 368 369 SMUADX xi2, xi0, csPiBy8 ;// rowa by [c,s] 370 SMUADX xi4, xi1, csPiBy8 ;// rowb by [c,s] 371 SMUSD xi0, xi0, csPiBy8 ;// rowa by [-s,c] 372 SMUSD xi6, xi1, csPiBy8 ;// rowb by [-s,c] 373 374 SMULBB xi1, xi3, LoopRR2 375 SMULTB xi3, xi3, LoopRR2 376 377 PKHTB xh4, xi4, xi2, ASR#16 ;// h4/4 378 PKHTB xh6, xi6, xi0, ASR#16 ;// h6/4 379 SHADD16 xh7, xi5, xi7 ;// (i5+i7)/4 380 381 ;// xi0,xi1,xi2,xi3 now free 382 ;// IStage 4,3, rows 2to3 x1/2 383 384 MOV xi3, xi3, LSL #1 385 PKHTB xh5, xi3, xi1, ASR#15 ;// h5/4 386 LDRD xi0, [pDest, #8] ;// j2,j6 
scaled 387 388 ;// IStage 2, rows4to7 389 SSUB16 xg6, xh6, xh7 390 SSUB16 xg5, xh5, xg6 391 SSUB16 xg4, xh4, xg5 392 393 SSUB16 xi2, xi0, xi1 ;// (j2-j6) 394 395 SHADD16 xi3, xi0, xi1 ;// (j2+j6)/2 396 397 SMULBB xi0, xi2, LoopRR2 398 SMULTB xi2, xi2, LoopRR2 399 400 MOV xi2, xi2, LSL #1 401 PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4 402 403 ;// xi0, xi1 now free 404 ;// IStage 4,3 rows 0to1 x 1/2 405 LDRD xi0, [pDest] ;// j0, j4 scaled 406 SSUB16 xh2, xh2, xi3 407 ADDS LoopRR2, LoopRR2, #2<<29 ;// done two rows 408 409 SHADD16 xh0, xi0, xi1 410 SHSUB16 xh1, xi0, xi1 411 412 ;// IStage 2 rows 0to3 x 1/2 413 SHSUB16 xg2, xh1, xh2 414 SHADD16 xg1, xh1, xh2 415 SHSUB16 xg3, xh0, xh3 416 SHADD16 xg0, xh0, xh3 417 418 ;// IStage 1 all rows 419 SADD16 xf3, xg3, xg4 420 SSUB16 xf4, xg3, xg4 421 SADD16 xf2, xg2, xg5 422 SSUB16 xf5, xg2, xg5 423 SADD16 xf1, xg1, xg6 424 SSUB16 xf6, xg1, xg6 425 SADD16 xf0, xg0, xg7 426 SSUB16 xf7, xg0, xg7 427 428 ;// Transpose, store and loop 429 PKHBT ra01, xf0, xf1, LSL #16 430 PKHTB rb01, xf1, xf0, ASR #16 431 432 PKHBT ra23, xf2, xf3, LSL #16 433 PKHTB rb23, xf3, xf2, ASR #16 434 435 PKHBT ra45, xf4, xf5, LSL #16 436 PKHTB rb45, xf5, xf4, ASR #16 437 438 PKHBT ra67, xf6, xf7, LSL #16 439 STMIA pDest!, {ra01, ra23, ra45, ra67} 440 PKHTB rb67, xf7, xf6, ASR #16 441 STMIA pDest!, {rb01, rb23, rb45, rb67} 442 BCC v6_idct_col$_F 443 444 SUB pSrc, pDest, #(64*2) 445 M_LDR pDest, ppDest 446 IF "$stride"="s" 447 M_LDR pScale, pStride 448 ENDIF 449 B v6_idct_row$_F 450 451v6OddZero$_F 452 SSUB16 xi2, xi6, xi7 ;// (j2-j6) 453 SHADD16 xi3, xi6, xi7 ;// (j2+j6)/2 454 455 SMULBB xi0, xi2, LoopRR2 456 SMULTB xi2, xi2, LoopRR2 457 458 MOV xi2, xi2, LSL #1 459 PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4 460 SSUB16 xh2, xh2, xi3 461 462 ;// xi0, xi1 now free 463 ;// IStage 4,3 rows 0to1 x 1/2 464 465 SHADD16 xh0, xi4, xi5 466 SHSUB16 xh1, xi4, xi5 467 468 ;// IStage 2 rows 0to3 x 1/2 469 SHSUB16 xg2, xh1, xh2 470 SHADD16 xg1, xh1, xh2 471 SHSUB16 
xg3, xh0, xh3 472 SHADD16 xg0, xh0, xh3 473 474 ;// IStage 1 all rows 475 MOV xf3, xg3 476 MOV xf4, xg3 477 MOV xf2, xg2 478 MOV xf5, xg2 479 MOV xf1, xg1 480 MOV xf6, xg1 481 MOV xf0, xg0 482 MOV xf7, xg0 483 484 ;// Transpose 485 PKHBT ra01, xf0, xf1, LSL #16 486 PKHTB rb01, xf1, xf0, ASR #16 487 488 PKHBT ra23, xf2, xf3, LSL #16 489 PKHTB rb23, xf3, xf2, ASR #16 490 491 PKHBT ra45, xf4, xf5, LSL #16 492 PKHTB rb45, xf5, xf4, ASR #16 493 494 PKHBT ra67, xf6, xf7, LSL #16 495 PKHTB rb67, xf7, xf6, ASR #16 496 497 STMIA pDest!, {ra01, ra23, ra45, ra67} 498 ADDS LoopRR2, LoopRR2, #2<<29 ;// done two rows 499 STMIA pDest!, {rb01, rb23, rb45, rb67} 500 501 BCC v6_idct_col$_F 502 SUB pSrc, pDest, #(64*2) 503 M_LDR pDest, ppDest 504 IF "$stride"="s" 505 M_LDR pScale, pStride 506 ENDIF 507 508 509v6_idct_row$_F 510 ;// IStage 4,3, rows4to7 x1/4 511 LDR xit, =0x00010001 ;// rounding constant 512 LDR xi0, [pSrc, #1*16] ;// j1 513 LDR xi1, [pSrc, #7*16] ;// 4*j7 514 LDR xi2, [pSrc, #5*16] ;// j5 515 LDR xi3, [pSrc, #3*16] ;// j3 516 517 SHADD16 xi1, xi1, xit ;// 2*j7 518 SHADD16 xi1, xi1, xit ;// j7 519 520 SHADD16 xi5, xi0, xi1 ;// (j1+j7)/2 521 SSUB16 xi6, xi0, xi1 ;// j1-j7 522 SHADD16 xi7, xi2, xi3 ;// (j5+j3)/2 523 SSUB16 xi4, xi2, xi3 ;// j5-j3 524 525 SSUB16 xi3, xi5, xi7 ;// (i5-i7)/2 526 527 PKHBT xi0, xi6, xi4, LSL#16 ;// [i4,i6] row a 528 PKHTB xi1, xi4, xi6, ASR#16 ;// [i4,i6] row b 529 530 SMUADX xi2, xi0, csPiBy8 ;// rowa by [c,s] 531 SMUADX xi4, xi1, csPiBy8 ;// rowb by [c,s] 532 SMUSD xi0, xi0, csPiBy8 ;// rowa by [-s,c] 533 SMUSD xi6, xi1, csPiBy8 ;// rowb by [-s,c] 534 535 SMULBB xi1, xi3, LoopRR2 536 SMULTB xi3, xi3, LoopRR2 537 538 PKHTB xh4, xi4, xi2, ASR#16 ;// h4/4 539 PKHTB xh6, xi6, xi0, ASR#16 ;// h6/4 540 SHADD16 xh7, xi5, xi7 ;// (i5+i7)/4 541 542 MOV xi3, xi3, LSL #1 543 PKHTB xh5, xi3, xi1, ASR#15 ;// h5/4 544 545 ;// xi0,xi1,xi2,xi3 now free 546 ;// IStage 4,3, rows 2to3 x1/2 547 548 LDR xi0, [pSrc, #2*16] ;// j2 549 LDR xi1, [pSrc, #6*16] ;// 
2*j6 550 551 ;// IStage 2, rows4to7 552 SSUB16 xg6, xh6, xh7 553 SSUB16 xg5, xh5, xg6 554 SSUB16 xg4, xh4, xg5 555 556 SHADD16 xi1, xi1, xit ;// j6 557 SSUB16 xi2, xi0, xi1 ;// (j2-j6) 558 SHADD16 xi3, xi0, xi1 ;// (j2+j6)/2 559 560 SMULBB xi0, xi2, LoopRR2 561 SMULTB xi2, xi2, LoopRR2 562 563 MOV xi2, xi2, LSL #1 564 565 PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4 566 567 ;// xi0, xi1 now free 568 ;// IStage 4,3 rows 0to1 x 1/2 569 LDR xi1, [pSrc, #4*16] ;// j4 570 LDR xi0, [pSrc], #4 ;// j0 571 572 SSUB16 xh2, xh2, xi3 573 ADDS LoopRR2, LoopRR2, #2<<29 ;// done two rows 574 575 ADD xi0, xi0, xit, LSL #2 ;// ensure correct round 576 SHADD16 xh0, xi0, xi1 ;// of DC result 577 SHSUB16 xh1, xi0, xi1 578 579 ;// IStage 2 rows 0to3 x 1/2 580 SHSUB16 xg2, xh1, xh2 581 SHADD16 xg1, xh1, xh2 582 SHSUB16 xg3, xh0, xh3 583 SHADD16 xg0, xh0, xh3 584 585 ;// IStage 1 all rows 586 SHADD16 xf3, xg3, xg4 587 SHSUB16 xf4, xg3, xg4 588 SHADD16 xf2, xg2, xg5 589 SHSUB16 xf5, xg2, xg5 590 SHADD16 xf1, xg1, xg6 591 SHSUB16 xf6, xg1, xg6 592 SHADD16 xf0, xg0, xg7 593 SHSUB16 xf7, xg0, xg7 594 595 ;// Saturate 596 IF ("$outsize"="u8") 597 USAT16 xf0, #8, xf0 598 USAT16 xf1, #8, xf1 599 USAT16 xf2, #8, xf2 600 USAT16 xf3, #8, xf3 601 USAT16 xf4, #8, xf4 602 USAT16 xf5, #8, xf5 603 USAT16 xf6, #8, xf6 604 USAT16 xf7, #8, xf7 605 ENDIF 606 IF ("$outsize"="s9") 607 SSAT16 xf0, #9, xf0 608 SSAT16 xf1, #9, xf1 609 SSAT16 xf2, #9, xf2 610 SSAT16 xf3, #9, xf3 611 SSAT16 xf4, #9, xf4 612 SSAT16 xf5, #9, xf5 613 SSAT16 xf6, #9, xf6 614 SSAT16 xf7, #9, xf7 615 ENDIF 616 617 ;// Transpose to Row, Pack and store 618 IF ("$outsize"="u8") 619 ORR xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ] 620 ORR xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ] 621 ORR xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ] 622 ORR xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ] 623 PKHBT ra01, xf0, xf2, LSL #16 624 PKHTB rb01, xf2, xf0, ASR #16 625 PKHBT ra23, xf4, xf6, LSL #16 626 PKHTB rb23, xf6, xf4, ASR #16 627 STMIA pDest, {ra01, ra23} 
628 IF "$stride"="s" 629 ADD pDest, pDest, pScale 630 STMIA pDest, {rb01, rb23} 631 ADD pDest, pDest, pScale 632 ELSE 633 ADD pDest, pDest, #($stride) 634 STMIA pDest, {rb01, rb23} 635 ADD pDest, pDest, #($stride) 636 ENDIF 637 ENDIF 638 IF ("$outsize"="s9"):LOR:("$outsize"="s16") 639 PKHBT ra01, xf0, xf1, LSL #16 640 PKHTB rb01, xf1, xf0, ASR #16 641 642 PKHBT ra23, xf2, xf3, LSL #16 643 PKHTB rb23, xf3, xf2, ASR #16 644 645 PKHBT ra45, xf4, xf5, LSL #16 646 PKHTB rb45, xf5, xf4, ASR #16 647 648 PKHBT ra67, xf6, xf7, LSL #16 649 PKHTB rb67, xf7, xf6, ASR #16 650 651 STMIA pDest, {ra01, ra23, ra45, ra67} 652 IF "$stride"="s" 653 ADD pDest, pDest, pScale 654 STMIA pDest, {rb01, rb23, rb45, rb67} 655 ADD pDest, pDest, pScale 656 ELSE 657 ADD pDest, pDest, #($stride) 658 STMIA pDest, {rb01, rb23, rb45, rb67} 659 ADD pDest, pDest, #($stride) 660 ENDIF 661 ENDIF 662 663 BCC v6_idct_row$_F 664 ENDIF ;// ARM1136JS 665 666 667 IF CortexA8 668 669Src0 EQU 7 670Src1 EQU 8 671Src2 EQU 9 672Src3 EQU 10 673Src4 EQU 11 674Src5 EQU 12 675Src6 EQU 13 676Src7 EQU 14 677Tmp EQU 15 678 679qXj0 QN Src0.S16 680qXj1 QN Src1.S16 681qXj2 QN Src2.S16 682qXj3 QN Src3.S16 683qXj4 QN Src4.S16 684qXj5 QN Src5.S16 685qXj6 QN Src6.S16 686qXj7 QN Src7.S16 687qXjt QN Tmp.S16 688 689dXj0lo DN (Src0*2).S16 690dXj0hi DN (Src0*2+1).S16 691dXj1lo DN (Src1*2).S16 692dXj1hi DN (Src1*2+1).S16 693dXj2lo DN (Src2*2).S16 694dXj2hi DN (Src2*2+1).S16 695dXj3lo DN (Src3*2).S16 696dXj3hi DN (Src3*2+1).S16 697dXj4lo DN (Src4*2).S16 698dXj4hi DN (Src4*2+1).S16 699dXj5lo DN (Src5*2).S16 700dXj5hi DN (Src5*2+1).S16 701dXj6lo DN (Src6*2).S16 702dXj6hi DN (Src6*2+1).S16 703dXj7lo DN (Src7*2).S16 704dXj7hi DN (Src7*2+1).S16 705dXjtlo DN (Tmp*2).S16 706dXjthi DN (Tmp*2+1).S16 707 708qXi0 QN qXj0 709qXi1 QN qXj4 710qXi2 QN qXj2 711qXi3 QN qXj7 712qXi4 QN qXj5 713qXi5 QN qXjt 714qXi6 QN qXj1 715qXi7 QN qXj6 716qXit QN qXj3 717 718dXi0lo DN dXj0lo 719dXi0hi DN dXj0hi 720dXi1lo DN dXj4lo 721dXi1hi DN dXj4hi 722dXi2lo DN 
dXj2lo 723dXi2hi DN dXj2hi 724dXi3lo DN dXj7lo 725dXi3hi DN dXj7hi 726dXi4lo DN dXj5lo 727dXi4hi DN dXj5hi 728dXi5lo DN dXjtlo 729dXi5hi DN dXjthi 730dXi6lo DN dXj1lo 731dXi6hi DN dXj1hi 732dXi7lo DN dXj6lo 733dXi7hi DN dXj6hi 734dXitlo DN dXj3lo 735dXithi DN dXj3hi 736 737qXh0 QN qXit 738qXh1 QN qXi0 739qXh2 QN qXi2 740qXh3 QN qXi3 741qXh4 QN qXi7 742qXh5 QN qXi5 743qXh6 QN qXi4 744qXh7 QN qXi1 745qXht QN qXi6 746 747dXh0lo DN dXitlo 748dXh0hi DN dXithi 749dXh1lo DN dXi0lo 750dXh1hi DN dXi0hi 751dXh2lo DN dXi2lo 752dXh2hi DN dXi2hi 753dXh3lo DN dXi3lo 754dXh3hi DN dXi3hi 755dXh4lo DN dXi7lo 756dXh4hi DN dXi7hi 757dXh5lo DN dXi5lo 758dXh5hi DN dXi5hi 759dXh6lo DN dXi4lo 760dXh6hi DN dXi4hi 761dXh7lo DN dXi1lo 762dXh7hi DN dXi1hi 763dXhtlo DN dXi6lo 764dXhthi DN dXi6hi 765 766qXg0 QN qXh2 767qXg1 QN qXht 768qXg2 QN qXh1 769qXg3 QN qXh0 770qXg4 QN qXh4 771qXg5 QN qXh5 772qXg6 QN qXh6 773qXg7 QN qXh7 774qXgt QN qXh3 775 776qXf0 QN qXg6 777qXf1 QN qXg5 778qXf2 QN qXg4 779qXf3 QN qXgt 780qXf4 QN qXg3 781qXf5 QN qXg2 782qXf6 QN qXg1 783qXf7 QN qXg0 784qXft QN qXg7 785 786 787qXt0 QN 1.S32 788qXt1 QN 2.S32 789qT0lo QN 1.S32 790qT0hi QN 2.S32 791qT1lo QN 3.S32 792qT1hi QN 4.S32 793qScalelo QN 5.S32 ;// used to read post scale values 794qScalehi QN 6.S32 795qTemp0 QN 5.S32 796qTemp1 QN 6.S32 797 798 799Scale1 EQU 6 800Scale2 EQU 15 801qScale1 QN Scale1.S16 802qScale2 QN Scale2.S16 803dScale1lo DN (Scale1*2).S16 804dScale1hi DN (Scale1*2+1).S16 805dScale2lo DN (Scale2*2).S16 806dScale2hi DN (Scale2*2+1).S16 807 808dCoefs DN 0.S16 ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]} 809InvSqrt2 DN dCoefs[0] ;// 1/sqrt(2) in Q15 810S DN dCoefs[1] ;// Sin(PI/8) in Q15 811C DN dCoefs[2] ;// Cos(PI/8) in Q15 812 813pTemp RN 12 814 815 816 IMPORT armCOMM_IDCTCoef 817 818 VLD1 {qXj0,qXj1}, [pSrc @64]! 819 VLD1 {qXj2,qXj3}, [pSrc @64]! 820 VLD1 {qXj4,qXj5}, [pSrc @64]! 821 VLD1 {qXj6,qXj7}, [pSrc @64]! 
822 823 ;// Load PreScale and multiply with Src 824 ;// IStage 4 825 826 IF "$inscale"="s16" ;// 16X16 Mul 827 M_IDCT_PRESCALE16 828 ENDIF 829 830 IF "$inscale"="s32" ;// 32X32 ,ul 831 M_IDCT_PRESCALE32 832 ENDIF 833 834 ;// IStage 3 835 VQDMULH qXi2, qXi2, InvSqrt2 ;// i2/sqrt(2) 836 VHADD qXh0, qXi0, qXi1 ;// (i0+i1)/2 837 VHSUB qXh1, qXi0, qXi1 ;// (i0-i1)/2 838 VHADD qXh7, qXi5, qXi7 ;// (i5+i7)/4 839 VSUB qXh5, qXi5, qXi7 ;// (i5-i7)/2 840 VQDMULH qXh5, qXh5, InvSqrt2 ;// h5/sqrt(2) 841 VSUB qXh2, qXi2, qXi3 ;// h2, h3 842 843 VMULL qXt0, dXi4lo, C ;// c*i4 844 VMLAL qXt0, dXi6lo, S ;// c*i4+s*i6 845 VMULL qXt1, dXi4hi, C 846 VMLAL qXt1, dXi6hi, S 847 VSHRN dXh4lo, qXt0, #16 ;// h4 848 VSHRN dXh4hi, qXt1, #16 849 850 VMULL qXt0, dXi6lo, C ;// c*i6 851 VMLSL qXt0, dXi4lo, S ;// -s*i4 + c*h6 852 VMULL qXt1, dXi6hi, C 853 VMLSL qXt1, dXi4hi, S 854 VSHRN dXh6lo, qXt0, #16 ;// h6 855 VSHRN dXh6hi, qXt1, #16 856 857 ;// IStage 2 858 VSUB qXg6, qXh6, qXh7 859 VSUB qXg5, qXh5, qXg6 860 VSUB qXg4, qXh4, qXg5 861 VHADD qXg1, qXh1, qXh2 ;// (h1+h2)/2 862 VHSUB qXg2, qXh1, qXh2 ;// (h1-h2)/2 863 VHADD qXg0, qXh0, qXh3 ;// (h0+h3)/2 864 VHSUB qXg3, qXh0, qXh3 ;// (h0-h3)/2 865 866 ;// IStage 1 all rows 867 VADD qXf3, qXg3, qXg4 868 VSUB qXf4, qXg3, qXg4 869 VADD qXf2, qXg2, qXg5 870 VSUB qXf5, qXg2, qXg5 871 VADD qXf1, qXg1, qXg6 872 VSUB qXf6, qXg1, qXg6 873 VADD qXf0, qXg0, qXg7 874 VSUB qXf7, qXg0, qXg7 875 876 ;// Transpose, store and loop 877XTR0 EQU Src5 878XTR1 EQU Tmp 879XTR2 EQU Src6 880XTR3 EQU Src7 881XTR4 EQU Src3 882XTR5 EQU Src0 883XTR6 EQU Src1 884XTR7 EQU Src2 885XTRt EQU Src4 886 887qA0 QN XTR0.S32 ;// for XTRpose 888qA1 QN XTR1.S32 889qA2 QN XTR2.S32 890qA3 QN XTR3.S32 891qA4 QN XTR4.S32 892qA5 QN XTR5.S32 893qA6 QN XTR6.S32 894qA7 QN XTR7.S32 895 896dB0 DN XTR0*2+1 ;// for using VSWP 897dB1 DN XTR1*2+1 898dB2 DN XTR2*2+1 899dB3 DN XTR3*2+1 900dB4 DN XTR4*2 901dB5 DN XTR5*2 902dB6 DN XTR6*2 903dB7 DN XTR7*2 904 905 906 VTRN qXf0, qXf1 907 VTRN qXf2, qXf3 
908 VTRN qXf4, qXf5 909 VTRN qXf6, qXf7 910 VTRN qA0, qA2 911 VTRN qA1, qA3 912 VTRN qA4, qA6 913 VTRN qA5, qA7 914 VSWP dB0, dB4 915 VSWP dB1, dB5 916 VSWP dB2, dB6 917 VSWP dB3, dB7 918 919 920qYj0 QN qXf0 921qYj1 QN qXf1 922qYj2 QN qXf2 923qYj3 QN qXf3 924qYj4 QN qXf4 925qYj5 QN qXf5 926qYj6 QN qXf6 927qYj7 QN qXf7 928qYjt QN qXft 929 930dYj0lo DN (XTR0*2).S16 931dYj0hi DN (XTR0*2+1).S16 932dYj1lo DN (XTR1*2).S16 933dYj1hi DN (XTR1*2+1).S16 934dYj2lo DN (XTR2*2).S16 935dYj2hi DN (XTR2*2+1).S16 936dYj3lo DN (XTR3*2).S16 937dYj3hi DN (XTR3*2+1).S16 938dYj4lo DN (XTR4*2).S16 939dYj4hi DN (XTR4*2+1).S16 940dYj5lo DN (XTR5*2).S16 941dYj5hi DN (XTR5*2+1).S16 942dYj6lo DN (XTR6*2).S16 943dYj6hi DN (XTR6*2+1).S16 944dYj7lo DN (XTR7*2).S16 945dYj7hi DN (XTR7*2+1).S16 946dYjtlo DN (XTRt*2).S16 947dYjthi DN (XTRt*2+1).S16 948 949qYi0 QN qYj0 950qYi1 QN qYj4 951qYi2 QN qYj2 952qYi3 QN qYj7 953qYi4 QN qYj5 954qYi5 QN qYjt 955qYi6 QN qYj1 956qYi7 QN qYj6 957qYit QN qYj3 958 959dYi0lo DN dYj0lo 960dYi0hi DN dYj0hi 961dYi1lo DN dYj4lo 962dYi1hi DN dYj4hi 963dYi2lo DN dYj2lo 964dYi2hi DN dYj2hi 965dYi3lo DN dYj7lo 966dYi3hi DN dYj7hi 967dYi4lo DN dYj5lo 968dYi4hi DN dYj5hi 969dYi5lo DN dYjtlo 970dYi5hi DN dYjthi 971dYi6lo DN dYj1lo 972dYi6hi DN dYj1hi 973dYi7lo DN dYj6lo 974dYi7hi DN dYj6hi 975dYitlo DN dYj3lo 976dYithi DN dYj3hi 977 978qYh0 QN qYit 979qYh1 QN qYi0 980qYh2 QN qYi2 981qYh3 QN qYi3 982qYh4 QN qYi7 983qYh5 QN qYi5 984qYh6 QN qYi4 985qYh7 QN qYi1 986qYht QN qYi6 987 988dYh0lo DN dYitlo 989dYh0hi DN dYithi 990dYh1lo DN dYi0lo 991dYh1hi DN dYi0hi 992dYh2lo DN dYi2lo 993dYh2hi DN dYi2hi 994dYh3lo DN dYi3lo 995dYh3hi DN dYi3hi 996dYh4lo DN dYi7lo 997dYh4hi DN dYi7hi 998dYh5lo DN dYi5lo 999dYh5hi DN dYi5hi 1000dYh6lo DN dYi4lo 1001dYh6hi DN dYi4hi 1002dYh7lo DN dYi1lo 1003dYh7hi DN dYi1hi 1004dYhtlo DN dYi6lo 1005dYhthi DN dYi6hi 1006 1007qYg0 QN qYh2 1008qYg1 QN qYht 1009qYg2 QN qYh1 1010qYg3 QN qYh0 1011qYg4 QN qYh4 1012qYg5 QN qYh5 1013qYg6 QN qYh6 1014qYg7 QN qYh7 
1015qYgt QN qYh3 1016 1017qYf0 QN qYg6 1018qYf1 QN qYg5 1019qYf2 QN qYg4 1020qYf3 QN qYgt 1021qYf4 QN qYg3 1022qYf5 QN qYg2 1023qYf6 QN qYg1 1024qYf7 QN qYg0 1025qYft QN qYg7 1026 1027 VRSHR qYj7, qYj7, #2 1028 VRSHR qYj6, qYj6, #1 1029 1030 VHADD qYi5, qYj1, qYj7 ;// i5 = (j1+j7)/2 1031 VSUB qYi6, qYj1, qYj7 ;// i6 = j1-j7 1032 VHADD qYi3, qYj2, qYj6 ;// i3 = (j2+j6)/2 1033 VSUB qYi2, qYj2, qYj6 ;// i2 = j2-j6 1034 VHADD qYi7, qYj5, qYj3 ;// i7 = (j5+j3)/2 1035 VSUB qYi4, qYj5, qYj3 ;// i4 = j5-j3 1036 1037 VQDMULH qYi2, qYi2, InvSqrt2 ;// i2/sqrt(2) 1038 ;// IStage 4,3 rows 0to1 x 1/2 1039 1040 MOV pTemp, #0x4 ;// ensure correct round 1041 VDUP qScale1, pTemp ;// of DC result 1042 VADD qYi0, qYi0, qScale1 1043 1044 VHADD qYh0, qYi0, qYi1 ;// (i0+i1)/2 1045 VHSUB qYh1, qYi0, qYi1 ;// (i0-i1)/2 1046 1047 VHADD qYh7, qYi5, qYi7 ;// (i5+i7)/4 1048 VSUB qYh5, qYi5, qYi7 ;// (i5-i7)/2 1049 VSUB qYh2, qYi2, qYi3 ;// h2, h3 1050 VQDMULH qYh5, qYh5, InvSqrt2 ;// h5/sqrt(2) 1051 1052 VMULL qXt0, dYi4lo, C ;// c*i4 1053 VMLAL qXt0, dYi6lo, S ;// c*i4+s*i6 1054 VMULL qXt1, dYi4hi, C 1055 VMLAL qXt1, dYi6hi, S 1056 VSHRN dYh4lo, qXt0, #16 ;// h4 1057 VSHRN dYh4hi, qXt1, #16 1058 1059 VMULL qXt0, dYi6lo, C ;// c*i6 1060 VMLSL qXt0, dYi4lo, S ;// -s*i4 + c*h6 1061 VMULL qXt1, dYi6hi, C 1062 VMLSL qXt1, dYi4hi, S 1063 VSHRN dYh6lo, qXt0, #16 ;// h6 1064 VSHRN dYh6hi, qXt1, #16 1065 1066 VSUB qYg6, qYh6, qYh7 1067 VSUB qYg5, qYh5, qYg6 1068 VSUB qYg4, qYh4, qYg5 1069 1070 ;// IStage 2 rows 0to3 x 1/2 1071 VHADD qYg1, qYh1, qYh2 ;// (h1+h2)/2 1072 VHSUB qYg2, qYh1, qYh2 ;// (h1-h2)/2 1073 VHADD qYg0, qYh0, qYh3 ;// (h0+h3)/2 1074 VHSUB qYg3, qYh0, qYh3 ;// (h0-h3)/2 1075 1076 1077 ;// IStage 1 all rows 1078 VHADD qYf3, qYg3, qYg4 1079 VHSUB qYf4, qYg3, qYg4 1080 VHADD qYf2, qYg2, qYg5 1081 VHSUB qYf5, qYg2, qYg5 1082 VHADD qYf1, qYg1, qYg6 1083 VHSUB qYf6, qYg1, qYg6 1084 VHADD qYf0, qYg0, qYg7 1085 VHSUB qYf7, qYg0, qYg7 1086 1087YTR0 EQU Src0 1088YTR1 EQU Src4 1089YTR2 EQU Src1 
1090YTR3 EQU Src2 1091YTR4 EQU Src7 1092YTR5 EQU Src5 1093YTR6 EQU Tmp 1094YTR7 EQU Src6 1095YTRt EQU Src3 1096 1097qC0 QN YTR0.S32 ;// for YTRpose 1098qC1 QN YTR1.S32 1099qC2 QN YTR2.S32 1100qC3 QN YTR3.S32 1101qC4 QN YTR4.S32 1102qC5 QN YTR5.S32 1103qC6 QN YTR6.S32 1104qC7 QN YTR7.S32 1105 1106dD0 DN YTR0*2+1 ;// for using VSWP 1107dD1 DN YTR1*2+1 1108dD2 DN YTR2*2+1 1109dD3 DN YTR3*2+1 1110dD4 DN YTR4*2 1111dD5 DN YTR5*2 1112dD6 DN YTR6*2 1113dD7 DN YTR7*2 1114 1115 VTRN qYf0, qYf1 1116 VTRN qYf2, qYf3 1117 VTRN qYf4, qYf5 1118 VTRN qYf6, qYf7 1119 VTRN qC0, qC2 1120 VTRN qC1, qC3 1121 VTRN qC4, qC6 1122 VTRN qC5, qC7 1123 VSWP dD0, dD4 1124 VSWP dD1, dD5 1125 VSWP dD2, dD6 1126 VSWP dD3, dD7 1127 1128 1129dYf0U8 DN YTR0*2.U8 1130dYf1U8 DN YTR1*2.U8 1131dYf2U8 DN YTR2*2.U8 1132dYf3U8 DN YTR3*2.U8 1133dYf4U8 DN YTR4*2.U8 1134dYf5U8 DN YTR5*2.U8 1135dYf6U8 DN YTR6*2.U8 1136dYf7U8 DN YTR7*2.U8 1137 1138 ;// 1139 ;// Do saturation if outsize is other than S16 1140 ;// 1141 1142 IF ("$outsize"="u8") 1143 ;// Output range [0-255] 1144 VQMOVN dYf0U8, qYf0 1145 VQMOVN dYf1U8, qYf1 1146 VQMOVN dYf2U8, qYf2 1147 VQMOVN dYf3U8, qYf3 1148 VQMOVN dYf4U8, qYf4 1149 VQMOVN dYf5U8, qYf5 1150 VQMOVN dYf6U8, qYf6 1151 VQMOVN dYf7U8, qYf7 1152 ENDIF 1153 1154 IF ("$outsize"="s9") 1155 ;// Output range [-256 to +255] 1156 VQSHL qYf0, qYf0, #16-9 1157 VQSHL qYf1, qYf1, #16-9 1158 VQSHL qYf2, qYf2, #16-9 1159 VQSHL qYf3, qYf3, #16-9 1160 VQSHL qYf4, qYf4, #16-9 1161 VQSHL qYf5, qYf5, #16-9 1162 VQSHL qYf6, qYf6, #16-9 1163 VQSHL qYf7, qYf7, #16-9 1164 1165 VSHR qYf0, qYf0, #16-9 1166 VSHR qYf1, qYf1, #16-9 1167 VSHR qYf2, qYf2, #16-9 1168 VSHR qYf3, qYf3, #16-9 1169 VSHR qYf4, qYf4, #16-9 1170 VSHR qYf5, qYf5, #16-9 1171 VSHR qYf6, qYf6, #16-9 1172 VSHR qYf7, qYf7, #16-9 1173 ENDIF 1174 1175 ;// Store output depending on the Stride size 1176 IF "$stride"="s" 1177 VST1 qYf0, [pDest @64], Stride 1178 VST1 qYf1, [pDest @64], Stride 1179 VST1 qYf2, [pDest @64], Stride 1180 VST1 qYf3, 
[pDest @64], Stride 1181 VST1 qYf4, [pDest @64], Stride 1182 VST1 qYf5, [pDest @64], Stride 1183 VST1 qYf6, [pDest @64], Stride 1184 VST1 qYf7, [pDest @64] 1185 ELSE 1186 IF ("$outsize"="u8") 1187 VST1 dYf0U8, [pDest @64], #8 1188 VST1 dYf1U8, [pDest @64], #8 1189 VST1 dYf2U8, [pDest @64], #8 1190 VST1 dYf3U8, [pDest @64], #8 1191 VST1 dYf4U8, [pDest @64], #8 1192 VST1 dYf5U8, [pDest @64], #8 1193 VST1 dYf6U8, [pDest @64], #8 1194 VST1 dYf7U8, [pDest @64] 1195 ELSE 1196 ;// ("$outsize"="s9") or ("$outsize"="s16") 1197 VST1 qYf0, [pDest @64], #16 1198 VST1 qYf1, [pDest @64], #16 1199 VST1 qYf2, [pDest @64], #16 1200 VST1 qYf3, [pDest @64], #16 1201 VST1 qYf4, [pDest @64], #16 1202 VST1 qYf5, [pDest @64], #16 1203 VST1 qYf6, [pDest @64], #16 1204 VST1 qYf7, [pDest @64] 1205 ENDIF 1206 1207 ENDIF 1208 1209 1210 1211 ENDIF ;// CortexA8 1212 1213 1214 1215 MEND 1216 1217 ;// Scale TWO input rows with TWO rows of 16 bit scale values 1218 ;// 1219 ;// This macro is used by M_IDCT_PRESCALE16 to pre-scale one row 1220 ;// input (Eight input values) with one row of scale values. Also 1221 ;// Loads next scale values from pScale, if $LastRow flag is not set. 
1222 ;// 1223 ;// Input Registers: 1224 ;// 1225 ;// $dAlo - Input D register with first four S16 values of row n 1226 ;// $dAhi - Input D register with next four S16 values of row n 1227 ;// $dBlo - Input D register with first four S16 values of row n+1 1228 ;// $dBhi - Input D register with next four S16 values of row n+1 1229 ;// pScale - Pointer to next row of scale values 1230 ;// qT0lo - Temporary scratch register 1231 ;// qT0hi - Temporary scratch register 1232 ;// qT1lo - Temporary scratch register 1233 ;// qT1hi - Temporary scratch register 1234 ;// dScale1lo - Scale value of row n 1235 ;// dScale1hi - Scale value of row n 1236 ;// dScale2lo - Scale value of row n+1 1237 ;// dScale2hi - Scale value of row n+1 1238 ;// 1239 ;// Input Flag 1240 ;// 1241 ;// $LastRow - Flag to indicate whether current row is last row 1242 ;// 1243 ;// Output Registers: 1244 ;// 1245 ;// $dAlo - Scaled output values (first four S16 of row n) 1246 ;// $dAhi - Scaled output values (next four S16 of row n) 1247 ;// $dBlo - Scaled output values (first four S16 of row n+1) 1248 ;// $dBhi - Scaled output values (next four S16 of row n+1) 1249 ;// qScale1 - Scale values for next row 1250 ;// qScale2 - Scale values for next row+1 1251 ;// pScale - Pointer to next row of scale values 1252 ;// 1253 MACRO 1254 M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow 1255 VMULL qT0lo, $dAlo, dScale1lo 1256 VMULL qT0hi, $dAhi, dScale1hi 1257 VMULL qT1lo, $dBlo, dScale2lo 1258 VMULL qT1hi, $dBhi, dScale2hi 1259 IF "$LastRow"="0" 1260 VLD1 qScale1, [pScale], #16 ;// Load scale for row n+1 1261 VLD1 qScale2, [pScale], #16 ;// Load scale for row n+2 1262 ENDIF 1263 VQRSHRN $dAlo, qT0lo, #12 1264 VQRSHRN $dAhi, qT0hi, #12 1265 VQRSHRN $dBlo, qT1lo, #12 1266 VQRSHRN $dBhi, qT1hi, #12 1267 MEND 1268 1269 ;// Scale 8x8 block input values with 16 bit scale values 1270 ;// 1271 ;// This macro is used to pre-scale block of 8x8 input. 1272 ;// This also do the Ist stage transformations of IDCT. 
;//
;// Input Registers:
;//
;//     dXjnlo          - n th input D register with first four S16 values
;//     dXjnhi          - n th input D register with next four S16 values
;//     qXjn            - n th input Q register with eight S16 values
;//     pScale          - Pointer to scale values
;//
;// Output Registers:
;//
;//     qXin            - n th output Q register with eight S16 output values of 1st stage
;//
        MACRO
        M_IDCT_PRESCALE16
        VLD1        qScale1, [pScale], #16      ;// Load pre-scale for row 0
        VLD1        qScale2, [pScale], #16      ;// Load pre-scale for row 1
        ;// Scale all eight rows, two at a time; every call except the last
        ;// (flag = 1) also prefetches the next two rows of scale values.
        M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0 ;// Pre-scale rows 0 & 1
        M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0 ;// Pre-scale rows 2 & 3
        M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0 ;// Pre-scale rows 4 & 5
        M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1 ;// Pre-scale rows 6 & 7 (last)
        ;// IStage 4 butterflies (see the flow-graph description in the file
        ;// header): i5/i6 from j1/j7, i3/i2 from j2/j6, i7/i4 from j5/j3.
        ;// VHADD halves the sums; the load of the AAN coefficient table is
        ;// interleaved with the arithmetic to hide its latency.
        VHADD       qXi5, qXj1, qXj7        ;// (j1+j7)/2
        VSUB        qXi6, qXj1, qXj7        ;// j1-j7
        LDR         pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants
        VHADD       qXi3, qXj2, qXj6        ;// (j2+j6)/2
        VSUB        qXi2, qXj2, qXj6        ;// j2-j6
        VLDR        dCoefs, [pSrc]          ;// Load DCT inverse AAN constants
        VHADD       qXi7, qXj5, qXj3        ;// (j5+j3)/2
        VSUB        qXi4, qXj5, qXj3        ;// j5-j3
        MEND


;// Scale 8x8 block input values with 32 bit scale values
;//
;// This macro is used to pre-scale a block of 8x8 input values.
;// It also performs the 1st stage transformation of the IDCT.
;//
;// Input Registers:
;//
;//     dXjnlo          - n th input D register with first four S16 values
;//     dXjnhi          - n th input D register with next four S16 values
;//     qXjn            - n th input Q register with eight S16 values
;//     pScale          - Pointer to 32bit scale values in Q23 format
;//
;// Output Registers:
;//
;//     dXinlo          - n th output D register with first four S16 output values of 1st stage
;//     dXinhi          - n th output D register with next four S16 output values of 1st stage
;//
        MACRO
        M_IDCT_PRESCALE32

;// Register map: scale factors live in Q0-Q3, widened source rows in Q4-Q7.
;// Only rows processed together (1&7, 2&6, 3&5) need distinct registers, so
;// the per-row names below are aliases onto this small working set.
qScale0lo       QN 0.S32
qScale0hi       QN 1.S32
qScale1lo       QN 2.S32
qScale1hi       QN 3.S32
qScale2lo       QN qScale1lo
qScale2hi       QN qScale1hi
qScale3lo       QN qScale1lo
qScale3hi       QN qScale1hi
qScale4lo       QN qScale1lo
qScale4hi       QN qScale1hi
qScale5lo       QN qScale0lo
qScale5hi       QN qScale0hi
qScale6lo       QN qScale0lo
qScale6hi       QN qScale0hi
qScale7lo       QN qScale0lo
qScale7hi       QN qScale0hi

qSrc0lo         QN 4.S32
qSrc0hi         QN 5.S32
qSrc1lo         QN 6.S32
qSrc1hi         QN 7.S32                ;// FIX: was "Src4.S32" (invalid register
                                        ;// specifier); Q7 completes the Q4-Q7
                                        ;// source set and cannot alias qSrc0/
                                        ;// qSrc1lo, which are live at the same
                                        ;// time (rows 1&7, 3&5 pairings below).
qSrc2lo         QN qSrc0lo
qSrc2hi         QN qSrc0hi
qSrc3lo         QN qSrc0lo
qSrc3hi         QN qSrc0hi
qSrc4lo         QN qSrc0lo
qSrc4hi         QN qSrc0hi
qSrc5lo         QN qSrc1lo
qSrc5hi         QN qSrc1hi
qSrc6lo         QN qSrc1lo
qSrc6hi         QN qSrc1hi
qSrc7lo         QN qSrc0lo
qSrc7hi         QN qSrc0hi

qRes17lo        QN qScale0lo
qRes17hi        QN qScale0hi
qRes26lo        QN qScale0lo
qRes26hi        QN qScale0hi
qRes53lo        QN qScale0lo
qRes53hi        QN qScale0hi

        ;// Each scale row is 8 S32 values = 32 bytes.  pScale walks forward
        ;// over rows 0,1,2,3,4 while pTemp walks backward over rows 7,6,5.
        ADD         pTemp, pScale, #4*8*7   ;// Address of pScale[7]

        ;// Row 0
        ;// VSHLL positions the S16 input so that VQRDMULH against the Q23
        ;// scale leaves the scaled S16 result in the low half of each S32
        ;// lane, ready for VMOVN.
        VLD1        {qScale0lo, qScale0hi}, [pScale]!
        VSHLL       qSrc0lo, dXj0lo, #(12-1)
        VSHLL       qSrc0hi, dXj0hi, #(12-1)
        VLD1        {qScale1lo, qScale1hi}, [pScale]!
        VQRDMULH    qSrc0lo, qScale0lo, qSrc0lo
        VQRDMULH    qSrc0hi, qScale0hi, qSrc0hi
        VLD1        {qScale7lo, qScale7hi}, [pTemp]!
        VSHLL       qSrc1lo, dXj1lo, #(12-1)
        VSHLL       qSrc1hi, dXj1hi, #(12-1)
        ;// i0 must be extracted before qSrc7 (an alias of qSrc0) is written.
        VMOVN       dXi0lo, qSrc0lo         ;// Output i0
        VMOVN       dXi0hi, qSrc0hi
        VSHLL       qSrc7lo, dXj7lo, #(12-1)
        VSHLL       qSrc7hi, dXj7hi, #(12-1)
        ;// Undo the 32-byte VLD1 writeback and step back one more scale row,
        ;// so pTemp descends 7 -> 6 -> 5.
        SUB         pTemp, pTemp, #((16*2)+(4*8*1))
        VQRDMULH    qSrc1lo, qScale1lo, qSrc1lo
        VQRDMULH    qSrc1hi, qScale1hi, qSrc1hi
        VQRDMULH    qSrc7lo, qScale7lo, qSrc7lo
        VQRDMULH    qSrc7hi, qScale7hi, qSrc7hi
        VLD1        {qScale2lo, qScale2hi}, [pScale]!

        ;// Row 1 & 7: IStage 4 butterfly producing i5 and i6.
        VHADD       qRes17lo, qSrc1lo, qSrc7lo  ;// (j1+j7)/2
        VHADD       qRes17hi, qSrc1hi, qSrc7hi  ;// (j1+j7)/2
        VMOVN       dXi5lo, qRes17lo        ;// Output i5
        VMOVN       dXi5hi, qRes17hi
        VSUB        qRes17lo, qSrc1lo, qSrc7lo  ;// j1-j7
        VSUB        qRes17hi, qSrc1hi, qSrc7hi  ;// j1-j7
        VMOVN       dXi6lo, qRes17lo        ;// Output i6
        VMOVN       dXi6hi, qRes17hi
        VSHLL       qSrc2lo, dXj2lo, #(12-1)
        VSHLL       qSrc2hi, dXj2hi, #(12-1)
        VLD1        {qScale6lo, qScale6hi}, [pTemp]!
        VSHLL       qSrc6lo, dXj6lo, #(12-1)
        VSHLL       qSrc6hi, dXj6hi, #(12-1)
        SUB         pTemp, pTemp, #((16*2)+(4*8*1))
        VQRDMULH    qSrc2lo, qScale2lo, qSrc2lo
        VQRDMULH    qSrc2hi, qScale2hi, qSrc2hi
        VQRDMULH    qSrc6lo, qScale6lo, qSrc6lo
        VQRDMULH    qSrc6hi, qScale6hi, qSrc6hi
        VLD1        {qScale3lo, qScale3hi}, [pScale]!

        ;// Row 2 & 6: IStage 4 butterfly producing i3 and i2.
        VHADD       qRes26lo, qSrc2lo, qSrc6lo  ;// (j2+j6)/2
        VHADD       qRes26hi, qSrc2hi, qSrc6hi  ;// (j2+j6)/2
        VMOVN       dXi3lo, qRes26lo        ;// Output i3
        VMOVN       dXi3hi, qRes26hi
        VSUB        qRes26lo, qSrc2lo, qSrc6lo  ;// j2-j6
        VSUB        qRes26hi, qSrc2hi, qSrc6hi  ;// j2-j6
        VMOVN       dXi2lo, qRes26lo        ;// Output i2
        VMOVN       dXi2hi, qRes26hi
        VSHLL       qSrc3lo, dXj3lo, #(12-1)
        VSHLL       qSrc3hi, dXj3hi, #(12-1)
        VLD1        {qScale5lo, qScale5hi}, [pTemp]!
        VSHLL       qSrc5lo, dXj5lo, #(12-1)
        VSHLL       qSrc5hi, dXj5hi, #(12-1)
        VQRDMULH    qSrc3lo, qScale3lo, qSrc3lo
        VQRDMULH    qSrc3hi, qScale3hi, qSrc3hi
        VQRDMULH    qSrc5lo, qScale5lo, qSrc5lo
        VQRDMULH    qSrc5hi, qScale5hi, qSrc5hi

        ;// Row 3 & 5: IStage 4 butterfly producing i7 and i4.
        VHADD       qRes53lo, qSrc5lo, qSrc3lo  ;// (j5+j3)/2
        VHADD       qRes53hi, qSrc5hi, qSrc3hi  ;// (j5+j3)/2
        ;// Rewind pSrc by 64 bytes to re-address source row 4 (its register
        ;// copy was clobbered above) - NOTE(review): offset depends on where
        ;// the enclosing transform macro left pSrc; confirm against caller.
        SUB         pSrc, pSrc, #16*2*2
        VMOVN       dXi7lo, qRes53lo        ;// Output i7
        VMOVN       dXi7hi, qRes53hi
        VSUB        qRes53lo, qSrc5lo, qSrc3lo  ;// j5-j3
        VSUB        qRes53hi, qSrc5hi, qSrc3hi  ;// j5-j3
        VLD1        qXj4, [pSrc @64]        ;// Reload row 4 input
        VMOVN       dXi4lo, qRes53lo        ;// Output i4
        VMOVN       dXi4hi, qRes53hi
        VSHLL       qSrc4lo, dXj4lo, #(12-1)
        VSHLL       qSrc4hi, dXj4hi, #(12-1)
        VLD1        {qScale4lo, qScale4hi}, [pScale]
        LDR         pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants
        VQRDMULH    qSrc4lo, qScale4lo, qSrc4lo
        VQRDMULH    qSrc4hi, qScale4hi, qSrc4hi
        VLDR        dCoefs, [pSrc]          ;// Load DCT inverse AAN constants
        ;// Row 4: no butterfly partner at this stage (i1 = j4).
        VMOVN       dXi1lo, qSrc4lo         ;// Output i1
        VMOVN       dXi1hi, qSrc4hi

        MEND

        END