ih264_resi_trans_quant_a9.s revision 8d3d303c7942ced6a987a52db8977d768dc3605f
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@/**
@*******************************************************************************
@* @file
@*  ih264_resi_trans_quant_a9.s
@*
@* @brief
@*  Contains function definitions for residue computation, forward transform
@*  and quantization
@*
@* @author
@*  Ittiam
@*
@* @par List of Functions:
@*  ih264_resi_trans_quant_4x4_a9
@*  ih264_resi_trans_quant_8x8_a9
@*  ih264_resi_trans_quant_chroma_4x4_a9
@*  ih264_hadamard_quant_4x4_a9
@*  ih264_hadamard_quant_2x2_uv_a9
@*
@* @remarks
@*  None
@*
@*******************************************************************************

.text
.p2align 2
@*****************************************************************************
@*
@* Function Name     : ih264_resi_trans_quant_4x4_a9
@* Description       : This function performs residue computation, the 4x4
@*                     core forward transform (cf4) of H.264 and quantization
@*
@* Arguments         : R0 :pointer to src buffer
@                      R1 :pointer to pred buffer
@                      R2 :pointer to dst buffer
@                      R3 :source stride
@                      STACK : pred stride,
@                              dst stride,
@                              pointer to scaling matrix,
@                              pointer to threshold matrix,
@                              qbits,
@                              rounding factor,
@                              pointer to store nnz
@                              pointer to store non quantized dc value
@ Values Returned    : NONE
@
@ Register Usage     :
@ Stack Usage        : 40 bytes
@ Cycles             : Around
@ Interruptibility   : Interruptible
@
@ Known Limitations
@   \Assumptions     :
@
@ Revision History   :
@       DD MM YYYY   Author(s)   Changes
@       1  12 2013   100633      First version
@       20 1  2014   100633      Changed the API, Optimization
@
@*****************************************************************************

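@ For reference, a C-level prototype sketch for the routine that follows,
@ inferred from the register and stack usage documented above (parameter names
@ follow the style of the hadamard prototypes later in this file and are
@ assumptions, not taken from a header):
@
@ ih264_resi_trans_quant_4x4_a9(UWORD8 *pu1_src, UWORD8 *pu1_pred,
@                               WORD16 *pi2_out, WORD32 src_strd,
@                               WORD32 pred_strd,
@                               const UWORD16 *pu2_scale_matrix,
@                               const UWORD16 *pu2_threshold_matrix,
@                               UWORD32 u4_qbits, UWORD32 u4_round_factor,
@                               UWORD8 *pu1_nnz, WORD16 *pi2_alt_dc_addr)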
    .global ih264_resi_trans_quant_4x4_a9
ih264_resi_trans_quant_4x4_a9:

    @R0 :pointer to src buffer
    @R1 :pointer to pred buffer
    @R2 :pointer to dst buffer
    @R3 :source stride
    @STACK :pred stride
    @       scale matrix,
    @       threshold matrix
    @       qbits
    @       round factor
    @       nnz

    push {r4-r12, lr}          @push all the variables first

    add r11, sp, #40           @r11 points to the stack arguments (10 registers = 40 bytes pushed)
    ldmfd r11, {r4-r10}        @load the stack arguments into registers

    @R0 :pointer to src buffer
    @R1 :pointer to pred buffer
    @R2 :pointer to dst buffer
    @R3 :source stride
    @R4 :pred stride
    @R5 :scale matrix,
    @R6 :threshold matrix
    @R7 :qbits
    @R8 :round factor
    @R9 :nnz

    vpush {d8-d15}

    mov r11, #0
    sub r7, r11, r7            @negate the qbit value so that vshl performs a right shift

    @------------Function loading done----------------;

    vld1.u8 d30, [r0], r3      @load first 8 pix src row 1

    vld1.u8 d31, [r1], r4      @load first 8 pix pred row 1

    vld1.u8 d28, [r0], r3      @load first 8 pix src row 2

    vld1.u8 d29, [r1], r4      @load first 8 pix pred row 2

    vld1.u8 d26, [r0], r3      @load first 8 pix src row 3

    vld1.u8 d27, [r1], r4      @load first 8 pix pred row 3
    vsubl.u8 q0, d30, d31      @find residue row 1

    vld1.u8 d24, [r0], r3      @load first 8 pix src row 4

    vld1.u8 d25, [r1], r4      @load first 8 pix pred row 4
    vsubl.u8 q1, d28, d29      @find residue row 2

    vsubl.u8 q2, d26, d27      @find residue row 3
    vsubl.u8 q3, d24, d25      @find residue row 4

    vtrn.16 d0, d2             @T12
    vtrn.16 d4, d6             @T23
    vtrn.32 d0, d4             @T13
    vtrn.32 d2, d6             @T14

    vadd.s16 d8 , d0, d6       @x0 = x4+x7
    vadd.s16 d9 , d2, d4       @x1 = x5+x6
    vsub.s16 d10, d2, d4       @x2 = x5-x6
    vsub.s16 d11, d0, d6       @x3 = x4-x7

    vshl.s16 d12, d10, #1      @U_SHIFT(x2,1,shft)
    vshl.s16 d13, d11, #1      @U_SHIFT(x3,1,shft)

    vadd.s16 d14, d8, d9       @x4 = x0 + x1;
    vsub.s16 d16, d8, d9       @x6 = x0 - x1;
    vadd.s16 d15, d13, d10     @x5 = U_SHIFT(x3,1,shft) + x2;
    vsub.s16 d17, d11, d12     @x7 = x3 - U_SHIFT(x2,1,shft);

    @taking transpose again so as to do the vertical transform
    vtrn.16 d14, d15           @T12
    vtrn.16 d16, d17           @T23
    vtrn.32 d14, d16           @T13
    vtrn.32 d15, d17           @T24

    @let us do vertical transform
    @same code as horiz
    vadd.s16 d18, d14, d17     @x0 = x4+x7
    vadd.s16 d19, d15, d16     @x1 = x5+x6
    vsub.s16 d20, d15, d16     @x2 = x5-x6
    vsub.s16 d21, d14, d17     @x3 = x4-x7

    vshl.s16 d22, d20, #1      @U_SHIFT(x2,1,shft)
    vshl.s16 d23, d21, #1      @U_SHIFT(x3,1,shft)

    vdup.s32 q4, r8            @Load rounding value row 1

    vadd.s16 d24, d18, d19     @x5 = x0 + x1;
    vsub.s16 d26, d18, d19     @x7 = x0 - x1;
    vadd.s16 d25, d23, d20     @x6 = U_SHIFT(x3,1,shft) + x2;
    vsub.s16 d27, d21, d22     @x8 = x3 - U_SHIFT(x2,1,shft);
    vdup.s32 q10, r7           @Load qbit values

    vst1.s16 d24[0], [r10]     @Store the dc value to alternate dc address

@core transform is done for 4x8 block 1
    vld1.s16 {q14-q15}, [r5]   @load the scaling values

    vabs.s16 q0, q12           @Abs val of row 1 blk 1

    vabs.s16 q1, q13           @Abs val of row 2 blk 1

    vmov.s32 q5, q4            @copy round fact for row 2

    vmov.s32 q6, q4            @copy round fact for row 3
    vclt.s16 q2, q12, #0       @Get the sign of row 1 blk 1

    vmov.s32 q7, q4            @copy round fact for row 4
    vclt.s16 q3, q13, #0       @Get the sign of row 2 blk 1

    vmlal.s16 q4, d0, d28      @Multiply and add row 1
    vmlal.s16 q5, d1, d29      @Multiply and add row 2
    vmlal.s16 q6, d2, d30      @Multiply and add row 3
    vmlal.s16 q7, d3, d31      @Multiply and add row 4

    vshl.s32 q11, q4, q10      @Shift row 1
    vshl.s32 q12, q5, q10      @Shift row 2
    vshl.s32 q13, q6, q10      @Shift row 3
    vshl.s32 q14, q7, q10      @Shift row 4

    vmovn.s32 d30, q11         @Narrow row 1
    vmovn.s32 d31, q12         @Narrow row 2
    vmovn.s32 d0 , q13         @Narrow row 3
    vmovn.s32 d1 , q14         @Narrow row 4

    vneg.s16 q1, q15           @Get negative
    vneg.s16 q4, q0            @Get negative

    vceq.s16 q5, q15, #0       @I compare with zero row 1 and 2 blk 1
    vceq.s16 q6, q0 , #0       @I compare with zero row 3 and 4 blk 1

    vbsl.s16 q2, q1, q15       @Restore sign of row 1 and 2
    vbsl.s16 q3, q4, q0        @Restore sign of row 3 and 4

    vmovn.u16 d14, q5          @I Narrow the comparison for row 1 and 2 blk 1
    vmovn.u16 d15, q6          @I Narrow the comparison for row 1 and 2 blk 2

    vshr.u8 q8, q7, #7         @I Reduce comparison bit to a single bit row 1 and 2 blk 1 and 2 [ keep the value for later use ]

    vpadd.u8 d18, d16, d17     @I pair add nnz 1
    vpadd.u8 d20, d18, d19     @I Pair add nnz 2
    vpadd.u8 d22, d20, d21     @I Pair add nnz 3
    vpadd.u8 d24, d22, d23     @I Pair add nnz 4
    vst1.s16 {q2-q3}, [r2]     @Store blk

    vmov.u8 d25, #16           @I Get max nnz
    vsub.u8 d26, d25, d24      @I invert current nnz

    vst1.u8 d26[0], [r9]       @I Write nnz

    vpop {d8-d15}
    pop {r4-r12, pc}


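@ The vabs/vmlal/vshl/vbsl sequence in the routine above corresponds to the
@ usual scalar quantization, sketched here with the stack-argument names used
@ elsewhere in this file (w is one transformed residue, i its raster index):
@
@     q          = (ABS(w) * pu2_scale_matrix[i] + u4_round_factor) >> u4_qbits;
@     pi2_dst[i] = (w < 0) ? -q : q;
@
@ nnz is then written as 16 minus the number of zero coefficients.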
@*****************************************************************************
@*
@* Function Name     : ih264_resi_trans_quant_chroma_4x4_a9
@* Description       : This function does residue calculation, forward transform
@*                     and quantization for 4x4 chroma block.
@*
@* Arguments         : R0 :pointer to src buffer
@                      R1 :pointer to pred buffer
@                      R2 :pointer to dst buffer
@                      R3 :source stride
@                      STACK : pred stride,
@                              dst stride,
@                              pointer to scaling matrix,
@                              pointer to threshold matrix,
@                              qbits,
@                              rounding factor,
@                              pointer to store nnz
@                              pointer to store unquantized dc values
@ Values Returned    : NONE
@
@ Register Usage     :
@ Stack Usage        : 40 bytes
@ Cycles             : Around
@ Interruptibility   : Interruptible
@
@ Known Limitations
@   \Assumptions     :
@
@ Revision History   :
@       DD MM YYYY   Author(s)   Changes
@       11 2  2015   100664      First version
@
@*****************************************************************************

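@ For reference, a C-level prototype sketch for the chroma routine that
@ follows, inferred from the register and stack usage documented above
@ (parameter names are assumptions in the style of this file):
@
@ ih264_resi_trans_quant_chroma_4x4_a9(UWORD8 *pu1_src, UWORD8 *pu1_pred,
@                                      WORD16 *pi2_out, WORD32 src_strd,
@                                      WORD32 pred_strd,
@                                      const UWORD16 *pu2_scale_matrix,
@                                      const UWORD16 *pu2_threshold_matrix,
@                                      UWORD32 u4_qbits, UWORD32 u4_round_factor,
@                                      UWORD8 *pu1_nnz, WORD16 *pi2_alt_dc_addr)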
    .global ih264_resi_trans_quant_chroma_4x4_a9
ih264_resi_trans_quant_chroma_4x4_a9:

    @R0 :pointer to src buffer
    @R1 :pointer to pred buffer
    @R2 :pointer to dst buffer
    @R3 :source stride
    @STACK :pred stride
    @       scale matrix,
    @       threshold matrix
    @       qbits
    @       round factor
    @       nnz
    @       pu1_dc_alt_addr
    push {r4-r12, lr}              @push all the variables first

    add r11, sp, #40               @r11 points to the stack arguments (10 registers = 40 bytes pushed)
    ldmfd r11, {r4-r10}            @load the stack arguments into registers

    @R0 :pointer to src buffer
    @R1 :pointer to pred buffer
    @R2 :pointer to dst buffer
    @R3 :source stride
    @R4 :pred stride
    @R5 :scale matrix,
    @R6 :threshold matrix
    @R7 :qbits
    @R8 :round factor
    @R9 :nnz
    vpush {d8-d15}
    mov r11, #0
    sub r7, r11, r7                @negate the qbit value so that vshl performs a right shift

    @------------Function loading done----------------;

    vld2.u8 {d10, d11}, [r0], r3   @load first 8 pix src row 1

    vld2.u8 {d11, d12}, [r1], r4   @load first 8 pix pred row 1

    vld2.u8 {d28, d29}, [r0], r3   @load first 8 pix src row 2

    vld2.u8 {d29, d30}, [r1], r4   @load first 8 pix pred row 2

    vld2.u8 {d25, d26}, [r0], r3   @load first 8 pix src row 3

    vld2.u8 {d26, d27}, [r1], r4   @load first 8 pix pred row 3
    vsubl.u8 q0, d10, d11          @find residue row 1

    vld2.u8 {d22, d23}, [r0], r3   @load first 8 pix src row 4

    vld2.u8 {d23, d24}, [r1], r4   @load first 8 pix pred row 4
    vsubl.u8 q1, d28, d29          @find residue row 2

    vsubl.u8 q2, d25, d26          @find residue row 3
    vsubl.u8 q3, d22, d23          @find residue row 4

    vtrn.16 d0, d2                 @T12
    vtrn.16 d4, d6                 @T23
    vtrn.32 d0, d4                 @T13
    vtrn.32 d2, d6                 @T14

    vadd.s16 d8 , d0, d6           @x0 = x4+x7
    vadd.s16 d9 , d2, d4           @x1 = x5+x6
    vsub.s16 d10, d2, d4           @x2 = x5-x6
    vsub.s16 d11, d0, d6           @x3 = x4-x7

    vshl.s16 d12, d10, #1          @U_SHIFT(x2,1,shft)
    vshl.s16 d13, d11, #1          @U_SHIFT(x3,1,shft)

    vadd.s16 d14, d8, d9           @x4 = x0 + x1;
    vsub.s16 d16, d8, d9           @x6 = x0 - x1;
    vadd.s16 d15, d13, d10         @x5 = U_SHIFT(x3,1,shft) + x2;
    vsub.s16 d17, d11, d12         @x7 = x3 - U_SHIFT(x2,1,shft);

    @taking transpose again so as to do the vertical transform
    vtrn.16 d14, d15               @T12
    vtrn.16 d16, d17               @T23
    vtrn.32 d14, d16               @T13
    vtrn.32 d15, d17               @T24

    @let us do vertical transform
    @same code as horiz
    vadd.s16 d18, d14, d17         @x0 = x4+x7
    vadd.s16 d19, d15, d16         @x1 = x5+x6
    vsub.s16 d20, d15, d16         @x2 = x5-x6
    vsub.s16 d21, d14, d17         @x3 = x4-x7

    vshl.s16 d22, d20, #1          @U_SHIFT(x2,1,shft)
    vshl.s16 d23, d21, #1          @U_SHIFT(x3,1,shft)

    vdup.s32 q4, r8                @Load rounding value row 1

    vadd.s16 d24, d18, d19         @x5 = x0 + x1;
    vsub.s16 d26, d18, d19         @x7 = x0 - x1;
    vadd.s16 d25, d23, d20         @x6 = U_SHIFT(x3,1,shft) + x2;
    vsub.s16 d27, d21, d22         @x8 = x3 - U_SHIFT(x2,1,shft);
    vdup.s32 q10, r7               @Load qbit values

    vst1.s16 d24[0], [r10]         @Store unquantized dc value to dc alternate address

@core transform is done for 4x8 block 1
    vld1.s16 {q14-q15}, [r5]       @load the scaling values

    vabs.s16 q0, q12               @Abs val of row 1 blk 1

    vabs.s16 q1, q13               @Abs val of row 2 blk 1

    vmov.s32 q5, q4                @copy round fact for row 2

    vmov.s32 q6, q4                @copy round fact for row 3
    vclt.s16 q2, q12, #0           @Get the sign of row 1 blk 1

    vmov.s32 q7, q4                @copy round fact for row 4
    vclt.s16 q3, q13, #0           @Get the sign of row 2 blk 1

    vmlal.s16 q4, d0, d28          @Multiply and add row 1
    vmlal.s16 q5, d1, d29          @Multiply and add row 2
    vmlal.s16 q6, d2, d30          @Multiply and add row 3
    vmlal.s16 q7, d3, d31          @Multiply and add row 4

    vshl.s32 q11, q4, q10          @Shift row 1
    vshl.s32 q12, q5, q10          @Shift row 2
    vshl.s32 q13, q6, q10          @Shift row 3
    vshl.s32 q14, q7, q10          @Shift row 4

    vmovn.s32 d30, q11             @Narrow row 1
    vmovn.s32 d31, q12             @Narrow row 2
    vmovn.s32 d0 , q13             @Narrow row 3
    vmovn.s32 d1 , q14             @Narrow row 4

    vneg.s16 q1, q15               @Get negative
    vneg.s16 q4, q0                @Get negative

    vceq.s16 q5, q15, #0           @I compare with zero row 1 and 2 blk 1
    vceq.s16 q6, q0 , #0           @I compare with zero row 3 and 4 blk 1

    vbsl.s16 q2, q1, q15           @Restore sign of row 1 and 2
    vbsl.s16 q3, q4, q0            @Restore sign of row 3 and 4

    vmovn.u16 d14, q5              @I Narrow the comparison for row 1 and 2 blk 1
    vmovn.u16 d15, q6              @I Narrow the comparison for row 1 and 2 blk 2

    vshr.u8 q8, q7, #7             @I Reduce comparison bit to a single bit row 1 and 2 blk 1 and 2 [ keep the value for later use ]

    vpadd.u8 d18, d16, d17         @I pair add nnz 1
    vpadd.u8 d20, d18, d19         @I Pair add nnz 2
    vpadd.u8 d22, d20, d21         @I Pair add nnz 3
    vpadd.u8 d24, d22, d23         @I Pair add nnz 4
    vst1.s16 {q2-q3}, [r2]         @Store blk

    vmov.u8 d25, #16               @I Get max nnz
    vsub.u8 d26, d25, d24          @I invert current nnz

    vst1.u8 d26[0], [r9]           @I Write nnz

    vpop {d8-d15}
    pop {r4-r12, pc}


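@ Note on the chroma routine above: the vld2 loads split even and odd bytes of
@ each row, and the residue is formed only from the even lanes, so with an
@ interleaved chroma buffer (as the de-interleaving loads suggest) only the
@ plane addressed by the src/pred pointers is transformed; the transform and
@ quantization steps themselves are identical to the luma 4x4 routine above.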
@*****************************************************************************
@*
@* Function Name     : ih264_hadamard_quant_4x4_a9
@* Description       : This function does forward hadamard transform and
@*                     quantization for luma dc block
@*
@* Arguments         : R0 :pointer to src buffer
@                      R1 :pointer to dst buffer
@                      R2 :pu2_scale_matrix
@                      R3 :pu2_threshold_matrix
@                      STACK : u4_qbits
@                              u4_round_factor
@                              pu1_nnz
@ Values Returned    : NONE
@
@ Register Usage     :
@ Stack Usage        : 0 bytes
@ Cycles             : Around
@ Interruptibility   : Interruptible
@
@ Known Limitations
@   \Assumptions     :
@
@ Revision History   :
@       DD MM YYYY   Author(s)   Changes
@       20 2  2015   100633      First version
@
@*****************************************************************************
@ih264_hadamard_quant_4x4_a9(WORD16 *pi2_src, WORD16 *pi2_dst,
@                            const UWORD16 *pu2_scale_matrix,
@                            const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
@                            UWORD32 u4_round_factor, UWORD8 *pu1_nnz
@                            )
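@ A scalar sketch of what the routine below computes (taken from its inline
@ comments; pi2_src holds the 16 unquantized luma DC values): a 4x4 Hadamard
@ butterfly is applied horizontally and vertically, each result of the second
@ stage is halved, e.g.
@
@     i4_value = (x0 + x1) >> 1;
@
@ and each i4_value is then quantized with pu2_scale_matrix[0], u4_round_factor
@ and u4_qbits in the same way as the residue routines above, with pu1_nnz[0]
@ written as 16 minus the number of zero outputs.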
    .global ih264_hadamard_quant_4x4_a9
ih264_hadamard_quant_4x4_a9:

@Register usage
@   r0 : src
@   r1 : dst
@   r2 : *pu2_scale_matrix
@   r3 : *pu2_threshold_matrix

    vld4.s16 {d0, d1, d2, d3}, [r0]!   @Load 4x4 block
    vpush {d8-d15}

    vld1.u16 d30[0], [r2]          @load pu2_scale_matrix[0]

    vaddl.s16 q3, d0, d3           @x0 = x4 + x7;
    vaddl.s16 q4, d1, d2           @x1 = x5 + x6;
    vsubl.s16 q5, d1, d2           @x2 = x5 - x6;
    vsubl.s16 q6, d0, d3           @x3 = x4 - x7;

    vdup.u16 d30, d30[0]           @pu2_scale_matrix[0]

    vadd.s32 q7, q3, q4            @pi2_dst[0] = x0 + x1;
    vadd.s32 q8, q6, q5            @pi2_dst[1] = x3 + x2;
    add r3, sp, #68                @Get address of u4_round_factor
    vsub.s32 q9, q3, q4            @pi2_dst[2] = x0 - x1;
    vsub.s32 q10, q6, q5           @pi2_dst[3] = x3 - x2;

    vtrn.s32 q7, q8                @transpose 4x4 block
    vtrn.s32 q9, q10
    vld1.s32 d0[0], [r3]           @load u4_round_factor
    vswp d15, d18
    vswp d17, d20

    add r3, sp, #64                @Get address of u4_qbits
    vadd.s32 q11, q7, q10          @x0 = x4 + x7;
    vadd.s32 q12, q8, q9           @x1 = x5 + x6;
    vld1.s32 d31[0], [r3]          @load u4_qbits
    vsub.s32 q13, q8, q9           @x2 = x5 - x6;
    vsub.s32 q14, q7, q10          @x3 = x4 - x7;

    vdup.s32 q7, d0[0]             @u4_round_factor

    vadd.s32 q0, q11, q12          @(x0 + x1)
    vadd.s32 q1, q14, q13          @(x3 + x2)
    vsub.s32 q2, q11, q12          @(x0 - x1)
    vsub.s32 q3, q14, q13          @(x3 - x2)

    vdup.s32 q11, d31[0]           @u4_qbits

    vshrn.s32 d0, q0, #1           @i4_value = (x0 + x1) >> 1;
    vshrn.s32 d1, q1, #1           @i4_value = (x3 + x2) >> 1;
    vshrn.s32 d2, q2, #1           @i4_value = (x0 - x1) >> 1;
    vshrn.s32 d3, q3, #1           @i4_value = (x3 - x2) >> 1;

    vabs.s16 q5, q0
    vabs.s16 q6, q1

    vmov.s32 q8, q7                @Get the round fact
    vmov.s32 q9, q7
    vmov.s32 q10, q7

    vclt.s16 q3, q0, #0            @get the sign row 1,2
    vclt.s16 q4, q1, #0

    vneg.s32 q11, q11              @-u4_qbits, so vshl performs a right shift

    vmlal.u16 q7, d10, d30
    vmlal.u16 q8, d11, d30
    vmlal.u16 q9, d12, d30
    vmlal.u16 q10, d13, d30

    vshl.u32 q7, q7, q11
    vshl.u32 q8, q8, q11
    vshl.u32 q9, q9, q11
    vshl.u32 q10, q10, q11

    vqmovn.u32 d22, q7
    vqmovn.u32 d23, q8
    vqmovn.u32 d24, q9
    vqmovn.u32 d25, q10

    vneg.s16 q13, q11
    vneg.s16 q14, q12

    vbsl.s16 q3, q13, q11
    vbsl.s16 q4, q14, q12

    vceq.s16 q5, q11, #0
    vceq.s16 q6, q12, #0

    vst1.s16 {q3}, [r1]!

    vshrn.u16 d14, q5, #8
    vshrn.u16 d15, q6, #8

    ldr r3, [sp, #72]              @Load *pu1_nnz

    vshr.u8 q7, q7, #7

    vst1.s16 {q4}, [r1]!

    vadd.u8 d16, d14, d15
    vmov.u8 d20, #16
    vpadd.u8 d17, d16, d16
    vpadd.u8 d18, d17, d17
    vpadd.u8 d19, d18, d18
    vsub.u8 d20, d20, d19
    vst1.u8 d20[0], [r3]

    vpop {d8-d15}
    bx lr



@*****************************************************************************
@*
@* Function Name     : ih264_hadamard_quant_2x2_uv_a9
@* Description       : This function does forward hadamard transform and
@*                     quantization for dc block of chroma for both planes
@*
@* Arguments         : R0 :pointer to src buffer
@                      R1 :pointer to dst buffer
@                      R2 :pu2_scale_matrix
@                      R3 :pu2_threshold_matrix
@                      STACK : u4_qbits
@                              u4_round_factor
@                              pu1_nnz
@ Values Returned    : NONE
@
@ Register Usage     :
@ Stack Usage        : 0 bytes
@ Cycles             : Around
@ Interruptibility   : Interruptible
@
@ Known Limitations
@   \Assumptions     :
@
@ Revision History   :
@       DD MM YYYY   Author(s)   Changes
@       20 2  2015   100633      First version
@
@*****************************************************************************
@ ih264_hadamard_quant_2x2_uv_a9(WORD16 *pi2_src, WORD16 *pi2_dst,
@                                const UWORD16 *pu2_scale_matrix,
@                                const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
@                                UWORD32 u4_round_factor, UWORD8 *pu1_nnz
@                                )

    .global ih264_hadamard_quant_2x2_uv_a9
ih264_hadamard_quant_2x2_uv_a9:

    vpush {d8-d15}
    vld2.s16 {d0-d1}, [r0]         @load src

    add r3, sp, #68                @Get address of u4_round_factor

    vaddl.s16 q3, d0, d1           @x0 = x4 + x5;, x2 = x6 + x7;
    vld1.u16 d30[0], [r2]          @load pu2_scale_matrix[0]
    vsubl.s16 q4, d0, d1           @x1 = x4 - x5; x3 = x6 - x7;

    add r0, sp, #64                @Get address of u4_qbits
    vld1.s32 d28[0], [r3]          @load u4_round_factor
    vtrn.s32 q3, q4                @q1 -> x0 x1, q2 -> x2 x3

    vadd.s32 q0, q3, q4            @ (x0 + x2) (x1 + x3) (y0 + y2); (y1 + y3);
    vld1.s32 d24[0], [r0]          @load u4_qbits
    vsub.s32 q1, q3, q4            @ (x0 - x2) (x1 - x3) (y0 - y2); (y1 - y3);

    vdup.u16 d30, d30[0]           @pu2_scale_matrix

    vabs.s32 q2, q0
    vabs.s32 q3, q1

    vdup.s32 q14, d28[0]           @u4_round_factor

    vmovl.u16 q15, d30             @pu2_scale_matrix

    vclt.s32 q4, q0, #0            @get the sign row 1,2
    vdup.s32 q12, d24[0]           @u4_qbits
    vclt.s32 q5, q1, #0

    vqmovn.u32 d8, q4
    vqmovn.s32 d9, q5

    vmov.s32 q13, q14              @Get the round fact
    vneg.s32 q12, q12              @-u4_qbits, so vshl performs a right shift

    vmla.u32 q13, q2, q15
    vmla.u32 q14, q3, q15

    vshl.u32 q13, q13, q12         @>>qbit
    vshl.u32 q14, q14, q12         @>>qbit

    vqmovn.u32 d10, q13
    vqmovn.u32 d11, q14

    vneg.s16 q6, q5

    vbsl.s16 q4, q6, q5            @*sign

    vtrn.s32 d8, d9

    vceq.s16 q7, q4, #0            @Compute nnz

    vshrn.u16 d14, q7, #8          @reduce nnz comparison to 1 bit

    ldr r3, [sp, #72]              @Load *pu1_nnz
    vshr.u8 d14, d14, #7           @reduce nnz comparison to 1 bit
    vmov.u8 d20, #4                @Since we add zeros, we need to subtract from 4 to get nnz
    vpadd.u8 d17, d14, d14         @Sum up nnz

    vst1.s16 {q4}, [r1]!           @Store the block

    vpadd.u8 d17, d17, d17         @Sum up nnz
    vsub.u8 d20, d20, d17          @4 - numzeros
    vst1.u16 d20[0], [r3]          @store nnz

    vpop {d8-d15}
    bx lr
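@ For reference, the 2x2 Hadamard applied above to the four DC values of one
@ chroma plane (a sketch; c0..c3 denote that plane's DCs) is
@
@     y0 = c0 + c1 + c2 + c3
@     y1 = c0 - c1 + c2 - c3
@     y2 = c0 + c1 - c2 - c3
@     y3 = c0 - c1 - c2 + c3
@
@ and each y is quantized with pu2_scale_matrix[0], u4_round_factor and
@ u4_qbits in the same way as the routines above; two nnz bytes, each 4 minus
@ the zero count of one plane, are then stored to pu1_nnz.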