@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_filters_dc.s
@*
@* @brief
@*  contains function definitions for intra prediction dc filtering.
@*  functions are coded using neon intrinsics and can be compiled using rvct
@*
@* @author
@*  akshaya mukund
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  luma intra prediction filter for dc input
@*
@* @par description:
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the planar coefficients
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  type of filtering
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_luma_dc(uword8 *pu1_ref,
@                              word32 src_strd,
@                              uword8 *pu1_dst,
@                              word32 dst_strd,
@                              word32 nt,
@                              word32 mode)
@
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #40
@   nt
@   mode
@   pi1_coeff

.text
.align 4

.globl ihevc_intra_pred_luma_dc_a9q

.type ihevc_intra_pred_luma_dc_a9q, %function

ihevc_intra_pred_luma_dc_a9q:

    stmfd       sp!, {r4-r12, r14}      @stack stores the values of the arguments

    ldr         r4, [sp, #40]           @loads nt

@********** testing
    @mov        r6, #128
    @b          prologue_cpy_32
@********** testing

    mov         r11, #2                 @mov #2 to r11 (to be used to add to 2dc_val & 3dc_val)
    mov         r9, #0
    vmov        d17, r11, r9

    clz         r5, r4

    add         r6, r0, r4              @&src[nt]
    rsb         r5, r5, #32             @log2nt
    add         r7, r0, r4, lsl #1      @&src[2nt]

    add         r8, r7, #1              @&src[2nt+1]
    mvn         r5, r5
    add         r5, r5, #1
    vdup.32     d8, r5

    ldrb        r14, [r8]
    vshl.i64    d8, d8, #32

    sub         r9, r7, #1              @&src[2nt-1]
    vshr.s64    d8, d8, #32

    mov         r7, r8                  @r7 also stores 2nt+1

    ldrb        r12, [r9]
    add         r14, r14, r12           @src[2nt+1] + src[2nt-1]
    add         r14, r14, r11           @src[2nt+1] + src[2nt-1] + 2

    cmp         r4, #4
    beq         dc_4

    mov         r10, r4                 @nt

add_loop:
    vld1.s8     d0, [r6]!               @load from src[nt]
    mov         r5, #0
    vld1.s8     d1, [r8]!               @load from src[2nt+1]

    vpaddl.u8   d2, d0

    vmov        d6, r4, r5              @store nt to accumulate
    vpaddl.u8   d3, d1

    vld1.s8     d0, [r6]!               @load from src[nt] (extra load for 8)

    vld1.s8     d1, [r8]!               @load from src[2nt+1] (extra load for 8)
    vadd.u16    d4, d2, d3

    vpaddl.u16  d5, d4

    vpadal.u32  d6, d5                  @accumulate all inp into d6 (end for nt==8)

    subs        r10, #8
    beq         epil_add_loop

core_loop_add:
    vpaddl.u8   d2, d0
    subs        r10, #8
    vpaddl.u8   d3, d1

    vadd.u16    d4, d2, d3
    vld1.s8     d0, [r6]!               @load from src[nt] (extra load for 16)

    vpaddl.u16  d5, d4
    vld1.s8     d1, [r8]!               @load from src[2nt+1] (extra load for 16)

    vpadal.u32  d6, d5                  @accumulate all inp into d6
    bne         core_loop_add

epil_add_loop:

    vshl.s64    d9, d6, d8              @(dc_val) shr by log2nt+1
    cmp         r4, #32

    vmov        d28, r14, r5            @src[2nt+1]+2+src[2nt-1] moved to d28
    moveq       r6, #128

    vdup.8      d16, d9[0]              @dc_val
    vshl.s64    d13, d9, #1             @2*dc

    beq         prologue_cpy_32

    vadd.i64    d14, d13, d28           @src[2nt+1]+2+src[2nt-1]+2dc_val
    movne       r6, #0                  @nt

    vshr.u16    d15, d14, #2            @final dst[0]'s value in d15[0]
    movne       r10, r4

    vadd.i64    d11, d13, d9            @3*dc
    sub         r12, r3, r3, lsl #3     @-7*strd

    vadd.i64    d11, d11, d17           @3*dc + 2
    add         r12, r12, #8            @offset after one 8x8 block (-7*strd + 8)

    vdup.16     q12, d11[0]             @3*dc + 2 (moved to all lanes)
    sub         r0, r3, r4              @strd - nt

prologue_col:
    @0th column and 0-7 rows done here
    @r8 and r9 (2nt+1+col 2nt-1-row)

    mov         r8, r7                  @&src[2nt+1]

    add         r0, r0, #8              @strd - nt + 8
    vld1.s8     d0, [r8]!               @col 1::7 load (prol)
    sub         r9, r9, #7              @&src[2nt-1-row]

    vld1.s8     d1, [r9]                @row 7::1 (0 also) load (prol)
    sub         r9, r9, #8

    vmovl.u8    q10, d0

    vld1.s8     d6, [r8]                @col 8::15 load (prol extra)
    vadd.i16    q10, q10, q12           @col 1::7 add 3dc+2 (prol)

    vmovl.u8    q11, d1
    vqshrun.s16 d2, q10, #2             @columns shr2 movn (prol)

    vmovl.u8    q13, d6
    vadd.i16    q11, q11, q12           @row 1::7 add 3dc+2 (prol)

    vmov.i64    d19, #0x00000000000000ff
    vqshrun.s16 d3, q11, #2             @rows shr2 movn (prol)

    vbsl        d19, d15, d2            @first row with dst[0]
    vadd.i16    q13, q13, q12           @col 8::15 add 3dc+2 (prol extra)

    vrev64.8    d3, d3

    vst1.8      d19, [r2], r3           @store row 0 (prol)
    vshr.s64    d3, d3, #8              @row 0 shift (prol) (first value to be ignored)

    vmov.i64    d20, #0x00000000000000ff @byte mask row 1 (prol)

loop_again_col_row:

    vbsl        d20, d3, d16            @row 1 (prol)

    vmov.i64    d21, #0x00000000000000ff @byte mask row 2 (prol)
    vshr.s64    d3, d3, #8              @row 1 shift (prol)

    vst1.8      d20, [r2], r3           @store row 1 (prol)
    vqshrun.s16 d4, q13, #2             @columns shr2 movn (prol extra)

    vbsl        d21, d3, d16            @row 2 (prol)

    vmov.i64    d20, #0x00000000000000ff @byte mask row 3 (prol)
    vshr.s64    d3, d3, #8              @row 2 shift (prol)

    vst1.8      d21, [r2], r3           @store row 2 (prol)

    vbsl        d20, d3, d16            @row 3 (prol)

    vmov.i64    d21, #0x00000000000000ff @byte mask row 4 (prol)
    vshr.s64    d3, d3, #8              @row 3 shift (prol)

    vst1.8      d20, [r2], r3           @store row 3 (prol)

    vbsl        d21, d3, d16            @row 4 (prol)

    vmov.i64    d20, #0x00000000000000ff @byte mask row 5 (prol)
    vshr.s64    d3, d3, #8              @row 4 shift (prol)

    vst1.8      d21, [r2], r3           @store row 4 (prol)

    vbsl        d20, d3, d16            @row 5 (prol)

    vmov.i64    d21, #0x00000000000000ff @byte mask row 6 (prol)
    vshr.s64    d3, d3, #8              @row 5 shift (prol)

    vst1.8      d20, [r2], r3           @store row 5 (prol)

    vld1.s8     d1, [r9]                @row 8::15 load (prol extra)

    vbsl        d21, d3, d16            @row 6 (prol)

    vmovl.u8    q11, d1

    vmov.i64    d20, #0x00000000000000ff @byte mask row 7 (prol)
    vshr.s64    d3, d3, #8              @row 6 shift (prol)

    vst1.8      d21, [r2], r3           @store row 6 (prol)

    vbsl        d20, d3, d16            @row 7 (prol)
    vadd.i16    q11, q11, q12           @row 8::15 add 3dc+2 (prol extra)

    vshr.s64    d3, d3, #8              @row 7 shift (prol)
    vst1.8      d20, [r2], r12          @store row 7 (prol)

    subs        r10, r10, #8            @counter for cols

    beq         end_func
    blt         copy_16

    vmov.i64    d20, #0x00000000000000ff @byte mask row 9 (prol)
    vqshrun.s16 d3, q11, #2             @rows shr2 movn (prol)

    vrev64.8    d3, d3

    vst1.8      d4, [r2], r3            @store 2nd col (for 16x16)

    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r0           @go to next row for 16

    vbsl        d20, d3, d16            @row 9 (prol)
    subs        r10, r10, #8

    vst1.8      d20, [r2], r3           @store row 9 (prol)
    vshr.s64    d3, d3, #8              @row 9 shift (prol)

    vmov.i64    d20, #0x00000000000000ff @byte mask row 9 (prol)

    b           loop_again_col_row

copy_16:
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2]

    b           end_func

prologue_cpy_32:
    mov         r9, #128
    @sub        r7, r3, #-24
    add         r5, r2, r3
    add         r8, r5, r3
    add         r10, r8, r3
    vdup.8      q10, d16[0]
    lsl         r6, r3, #2
    add         r6, r6, #0xfffffff0

    vst1.8      {d20,d21}, [r2]!
    vst1.8      {d20,d21}, [r5]!
    vst1.8      {d20,d21}, [r8]!
    vst1.8      {d20,d21}, [r10]!

    vst1.8      {d20,d21}, [r2], r6
    vst1.8      {d20,d21}, [r5], r6
    vst1.8      {d20,d21}, [r8], r6
    vst1.8      {d20,d21}, [r10], r6

    sub         r9, r9, #32             @32x32 prol/epil counter dec

kernel_copy:
    vst1.8      {d20,d21}, [r2]!
    vst1.8      {d20,d21}, [r5]!
    vst1.8      {d20,d21}, [r8]!
    vst1.8      {d20,d21}, [r10]!

    vst1.8      {d20,d21}, [r2], r6
    vst1.8      {d20,d21}, [r5], r6
    vst1.8      {d20,d21}, [r8], r6
    vst1.8      {d20,d21}, [r10], r6

    subs        r9, r9, #32

    vst1.8      {d20,d21}, [r2]!
    vst1.8      {d20,d21}, [r5]!
    vst1.8      {d20,d21}, [r8]!
    vst1.8      {d20,d21}, [r10]!

    vst1.8      {d20,d21}, [r2], r6
    vst1.8      {d20,d21}, [r5], r6
    vst1.8      {d20,d21}, [r8], r6
    vst1.8      {d20,d21}, [r10], r6

    bne         kernel_copy

epilogue_copy:
    vst1.8      {d20,d21}, [r2]!
    vst1.8      {d20,d21}, [r5]!
    vst1.8      {d20,d21}, [r8]!
    vst1.8      {d20,d21}, [r10]!

    vst1.8      {d20,d21}, [r2]
    vst1.8      {d20,d21}, [r5]
    vst1.8      {d20,d21}, [r8]
    vst1.8      {d20,d21}, [r10]

    b           end_func

dc_4:
    vld1.s8     d0, [r6]!               @load from src[nt]
    vld1.s8     d1, [r8]!               @load from src[2nt+1]

    vpaddl.u8   d2, d0
    mov         r5, #0
    vmov        d6, r4, r5              @store nt to accumulate
    vpaddl.u8   d3, d1

    vadd.u16    d4, d2, d3

    vpaddl.u16  d5, d4
    vmov.i64    d30, #0x00000000ffffffff

    vand        d5, d5, d30

    vmov        d28, r14, r5            @src[2nt+1]+2+src[2nt-1] moved to d28
    vadd.i64    d6, d6, d5              @accumulate all inp into d6 (end for nt==8)

    vshl.s64    d9, d6, d8              @(dc_val) shr by log2nt+1
    mov         r8, r7                  @&src[2nt+1]

    vshl.s64    d13, d9, #1             @2*dc
    sub         r9, r9, #3              @&src[2nt-1-row]

    vdup.8      d16, d9[0]              @dc_val
    vadd.i64    d14, d13, d28           @src[2nt+1]+2+src[2nt-1]+2dc_val

    vshr.u16    d15, d14, #2            @final dst[0]'s value in d15[0]
    sub         r12, r3, r3, lsl #2     @-3*strd
    vadd.i64    d11, d13, d9            @3*dc

    vadd.i64    d11, d11, d17           @3*dc + 2
    add         r12, r12, #4            @offset after one 4x4 block (-3*strd + 4)

    vdup.16     q12, d11[0]             @3*dc + 2 (moved to all lanes)
    sub         r0, r3, r4              @strd - nt

    vld1.s8     d0, [r8]                @col 1::3 load (prol)
    vld1.s8     d1, [r9]                @row 3::1 (0 also) load (prol)

    vmovl.u8    q10, d0

    vmovl.u8    q11, d1
    vadd.i16    q10, q10, q12           @col 1::7 add 3dc+2 (prol)

    vadd.i16    q11, q11, q12           @row 1::7 add 3dc+2 (prol)

    vmov.i64    d19, #0x00000000000000ff
    vqshrun.s16 d2, q10, #2             @columns shr2 movn (prol)

    vmov.i64    d20, #0x00000000000000ff @byte mask row 1 (prol)
    vqshrun.s16 d3, q11, #2             @rows shr2 movn (prol)

    vbsl        d19, d15, d2            @first row with dst[0]

    vrev64.8    d3, d3

    vst1.32     d19[0], [r2], r3        @store row 0 (prol)
    vshr.s64    d3, d3, #40             @row 0 shift (prol) (first value to be ignored)

    vmov.i64    d21, #0x00000000000000ff @byte mask row 2 (prol)

    vbsl        d20, d3, d16            @row 1 (prol)
    vshr.s64    d3, d3, #8              @row 1 shift (prol)

    vst1.32     d20[0], [r2], r3        @store row 1 (prol)

    vbsl        d21, d3, d16            @row 2 (prol)

    vmov.i64    d20, #0x00000000000000ff @byte mask row 3 (prol)

    vshr.s64    d3, d3, #8              @row 2 shift (prol)
    vst1.32     d21[0], [r2], r3        @store row 2 (prol)

    vbsl        d20, d3, d16            @row 3 (prol)
    vst1.32     d20[0], [r2]            @store row 3 (prol)

epilogue_end:
end_func:
    ldmfd       sp!, {r4-r12, r15}      @reload the registers from sp
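
@ for reference, a rough c-level sketch of the dc prediction computed above
@ (illustrative only: the function name and loop structure are assumptions,
@ not the library's actual c model; uword8 and word32 are the library
@ typedefs for unsigned char and signed int).
@
@ void intra_pred_luma_dc_sketch(const uword8 *pu1_ref, uword8 *pu1_dst,
@                                word32 dst_strd, word32 nt)
@ {
@     word32 two_nt = 2 * nt, log2nt = 0, sum = nt, dc, i, j;
@
@     while ((1 << log2nt) < nt)          /* nt is a power of two (4..32) */
@         log2nt++;
@
@     for (i = 0; i < nt; i++)            /* nt left + nt top neighbours */
@         sum += pu1_ref[nt + i] + pu1_ref[two_nt + 1 + i];
@     dc = sum >> (log2nt + 1);           /* rounded average = dc_val */
@
@     for (i = 0; i < nt; i++)            /* fill the block with dc_val */
@         for (j = 0; j < nt; j++)
@             pu1_dst[i * dst_strd + j] = (uword8)dc;
@
@     if (nt < 32)                        /* boundary filtering for 4/8/16 */
@     {
@         pu1_dst[0] = (pu1_ref[two_nt - 1] + 2 * dc + pu1_ref[two_nt + 1] + 2) >> 2;
@         for (j = 1; j < nt; j++)        /* first row */
@             pu1_dst[j] = (pu1_ref[two_nt + 1 + j] + 3 * dc + 2) >> 2;
@         for (i = 1; i < nt; i++)        /* first column */
@             pu1_dst[i * dst_strd] = (pu1_ref[two_nt - 1 - i] + 3 * dc + 2) >> 2;
@     }
@ }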