@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@******************************************************************************
@* @file
@*  ihevc_inter_pred_luma_horz.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded using neon intrinsics and can be compiled using
@*  rvct
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*
@*  - ihevc_inter_pred_luma_horz()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
@/* include reconstruction */
@

@/**
@*******************************************************************************
@*
@* @brief
@*  interprediction luma filter for horizontal input
@*
@* @par description:
@*  applies a horizontal filter with coefficients pointed to by 'pi1_coeff' to
@*  the elements pointed by 'pu1_src' and writes to the location pointed by
@*  'pu1_dst'  the output is downshifted by 6 and clipped to 8 bits
@*  assumptions : the function is optimized considering the fact width is
@*  multiple of 4 or 8
@* and height as multiple of 2.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_horz (
@    uword8 *pu1_src,
@    uword8 *pu1_dst,
@    word32 src_strd,
@    word32 dst_strd,
@    word8 *pi1_coeff,
@    word32 ht,
@    word32 wd )

@**************variables vs registers*****************************************
@   r0 => *pu1_src
@   r1 => *pu1_dst
@   r2 =>  src_strd
@   r3 =>  dst_strd
@   r4 => *pi1_coeff
@   r5 =>  ht
@   r6 =>  wd

.text
.align 4




.globl ihevc_inter_pred_luma_horz_a9q

.type ihevc_inter_pred_luma_horz_a9q, %function

@ Additional register usage inside the function:
@   r4  = pi1_coeff, then second-row source pointer
@   r5  = columns remaining in the current row pair
@   r6  = second-row destination pointer
@   r7  = saved pu1_dst (for the residual passes)
@   r8  = 2*dst_strd - wd,  r9 = 2*src_strd - wd (row-pair pointer advance)
@   r10 = wd, r11 = 1 (post-increment step for the sliding-window loads)
@   r12 = working source pointer (pu1_src - 3), r14 = rows remaining
@ d24..d31 hold the absolute values of the 8 filter taps; each tap's sign is
@ applied via the vmlal (accumulate-add) / vmlsl (accumulate-subtract) choice.
@ NOTE(review): the add/sub pattern (taps 1,3,4,6 add; 0,2,5,7 subtract) looks
@ like the HEVC 8-tap luma filter sign layout - confirm against
@ ihevc_inter_pred_filters.c before relying on it.

ihevc_inter_pred_luma_horz_a9q:

    stmfd sp!, {r4-r12, r14}    @stack stores the values of the arguments
    @str r1,[sp,#-4]
    @ mov r7,#8192
start_loop_count:
    @ ldr r1,[sp,#-4]


    @10 registers (40 bytes) were pushed, so the stacked args start at sp+40
    ldr r4,[sp,#40]             @loads pi1_coeff
    ldr r8,[sp,#44]             @loads ht
    ldr r10,[sp,#48]            @loads wd

    vld1.8 {d0},[r4]            @coeff = vld1_s8(pi1_coeff)
    mov r11,#1                  @1-byte step for the sliding-window source loads
    subs r14,r8,#0              @r14 = ht (flags also check for ht == 0)

    vabs.s8 d2,d0               @coeffabs = vabs_s8(coeff): keep tap magnitudes

    @ble end_loops


    vdup.8 d24,d2[0]            @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub r12,r0,#3               @pu1_src - 3 (filter reads 3 pixels to the left)
    vdup.8 d25,d2[1]            @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add r4,r12,r2               @pu1_src_tmp2_8 = pu1_src + src_strd (row 1)
    vdup.8 d26,d2[2]            @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
    rsb r9,r10,r2,lsl #1        @2*src_strd - wd
    vdup.8 d27,d2[3]            @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
    rsb r8,r10,r3,lsl #1        @2*dst_strd - wd
    vdup.8 d28,d2[4]            @coeffabs_4 = vdup_lane_u8(coeffabs, 4)

    vdup.8 d29,d2[5]            @coeffabs_5 = vdup_lane_u8(coeffabs, 5)
    @ tst r10,#7                @checks wd for multiples
    vdup.8 d30,d2[6]            @coeffabs_6 = vdup_lane_u8(coeffabs, 6)
    vdup.8 d31,d2[7]            @coeffabs_7 = vdup_lane_u8(coeffabs, 7)

    mov r7,r1                   @save pu1_dst for the residual-column passes

    @dispatch on width: <=4 -> 4-wide loop; 24 -> 16-wide now + 8-wide
    @residual; >=16 -> 16-wide loop; 12 -> 8-wide now + 4-wide residual;
    @otherwise 8-wide loop
    cmp r10,#4
    ble outer_loop_4

    cmp r10,#24
    moveq r10,#16               @wd 24: do 16 columns here, 8 in the residual
    addeq r8,#8                 @keep 2*strd - wd consistent with wd = 16
    addeq r9,#8

    cmp r10,#16
    bge outer_loop_16

    cmp r10,#12
    addeq r8,#4                 @wd 12: do 8 columns here, 4 in the residual
    addeq r9,#4
    b outer_loop_8


@residual pass for wd == 24: redo the rightmost 8 columns with the 8-wide loop
outer_loop8_residual:
    sub r12,r0,#3               @pu1_src - 3
    mov r1,r7                   @restore the saved pu1_dst
    mov r14,#32                 @NOTE(review): ht is hard-coded to 32 for the
                                @wd == 24 case - confirm with callers
    add r1,#16                  @skip the 16 columns already written
    add r12,#16
    mov r10,#8
    add r8,#8                   @restore 2*dst_strd - wd for wd = 8
    add r9,#8

@8-wide path: two rows per outer iteration, 8 output pixels per inner iteration
outer_loop_8:

    add r6,r1,r3                @pu1_dst + dst_strd (row-1 destination)
    add r4,r12,r2               @pu1_src + src_strd (row-1 source)
    subs r5,r10,#0              @r5 = wd (checks wd)

    ble end_inner_loop_8

inner_loop_8:
    @eight 1-byte-stepped loads build the sliding window src[0_0]..src[0_7];
    @this replaces the vext-based extraction kept below for reference
    vld1.u32 {d0},[r12],r11     @vector load pu1_src
    vld1.u32 {d1},[r12],r11
    vld1.u32 {d2},[r12],r11
    vld1.u32 {d3},[r12],r11





    @ vext.u8 d2,d0,d1,#2      @vector extract of src[0_2]
    @ vext.u8 d3,d0,d1,#3      @vector extract of src[0_3]
    @ vext.u8 d4,d0,d1,#4      @vector extract of src[0_4]
    @ vext.u8 d5,d0,d1,#5      @vector extract of src[0_5]
    @ vext.u8 d6,d0,d1,#6      @vector extract of src [0_6]
    @ vext.u8 d7,d0,d1,#7      @vector extract of src[0_7]
    @ vext.u8 d1,d0,d1,#1      @vector extract of src[0_1]
    @ vext.u8 d14,d12,d13,#2

    @vext.u8 d15,d12,d13,#3    @vector extract of src[0_3]
    @ vext.u8 d16,d12,d13,#4   @vector extract of src[0_4]
    @ vext.u8 d17,d12,d13,#5   @vector extract of src[0_5]
    @vext.u8 d18,d12,d13,#6    @vector extract of src[0_6]
    @vext.u8 d19,d12,d13,#7    @vector extract of src[0_7]
    @vext.u8 d13,d12,d13,#1    @vector extract of src[0_1]
    vld1.u32 {d4},[r12],r11
    vmull.u8 q4,d1,d25          @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vld1.u32 {d5},[r12],r11
    vmlal.u8 q4,d3,d27          @mul_res = vmlal_u8(src[0_3], coeffabs_3)
    vld1.u32 {d6},[r12],r11
    vmlsl.u8 q4,d0,d24          @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    vld1.u32 {d7},[r12],r11
    vmlsl.u8 q4,d2,d26          @mul_res = vmlsl_u8(src[0_2], coeffabs_2)
    vld1.u32 {d12},[r4],r11     @row-1 loads interleaved with row-0 arithmetic
    vmlal.u8 q4,d4,d28          @mul_res = vmlal_u8(src[0_4], coeffabs_4)
    vld1.u32 {d13},[r4],r11
    vmlsl.u8 q4,d5,d29          @mul_res = vmlsl_u8(src[0_5], coeffabs_5)
    vld1.u32 {d14},[r4],r11
    vmlal.u8 q4,d6,d30          @mul_res = vmlal_u8(src[0_6], coeffabs_6)
    vld1.u32 {d15},[r4],r11
    vmlsl.u8 q4,d7,d31          @mul_res = vmlsl_u8(src[0_7], coeffabs_7)
    vld1.u32 {d16},[r4],r11     @vector load pu1_src + src_strd

    vmull.u8 q5,d15,d27         @row 1: mul_res = vmull_u8(src[1_3], coeffabs_3)
    vld1.u32 {d17},[r4],r11
    vmlsl.u8 q5,d14,d26         @mul_res = vmlsl_u8(src[1_2], coeffabs_2)
    vld1.u32 {d18},[r4],r11
    vmlal.u8 q5,d16,d28         @mul_res = vmlal_u8(src[1_4], coeffabs_4)
    vld1.u32 {d19},[r4],r11     @vector load pu1_src + src_strd
    vmlsl.u8 q5,d17,d29         @mul_res = vmlsl_u8(src[1_5], coeffabs_5)
    vqrshrun.s16 d20,q4,#6      @round, shift by 6 and saturating-narrow row 0
    vmlal.u8 q5,d18,d30         @mul_res = vmlal_u8(src[1_6], coeffabs_6)
    vmlsl.u8 q5,d19,d31         @mul_res = vmlsl_u8(src[1_7], coeffabs_7)
    vst1.8 {d20},[r1]!          @store the row-0 result to pu1_dst
    vmlsl.u8 q5,d12,d24         @mul_res = vmlsl_u8(src[1_0], coeffabs_0)
    vmlal.u8 q5,d13,d25         @mul_res = vmlal_u8(src[1_1], coeffabs_1)



    vqrshrun.s16 d8,q5,#6       @round, shift by 6 and saturating-narrow row 1
    subs r5,r5,#8               @decrement the wd loop
    vst1.8 {d8},[r6]!           @store the row-1 result to pu1_dst + dst_strd
    cmp r5,#4
    bgt inner_loop_8

end_inner_loop_8:
    subs r14,r14,#2             @decrement ht by the 2 rows just completed
    add r12,r12,r9              @advance the src pointer by 2*src_strd - wd
    add r1,r1,r8                @advance the dst pointer by 2*dst_strd - wd
    bgt outer_loop_8


    ldr r10,[sp,#48]            @loads wd
    cmp r10,#12

    beq outer_loop4_residual    @wd 12: the rightmost 4 columns are still owed


end_loops:

    ldmfd sp!,{r4-r12,r15}      @restore registers and return (pc <- saved lr)


@16-wide path: two rows per iteration, software-pipelined so the loads for the
@next 16-column block overlap the multiply-accumulates of the current one
outer_loop_16:
    str r0, [sp, #-4]!          @spill r0/r7 - no free registers in this path
    str r7, [sp, #-4]!

    add r6,r1,r3                @pu1_dst + dst_strd
    add r4,r12,r2               @pu1_src + src_strd
    and r0, r12, #31            @src offset within a 32-byte line (only used by
                                @the disabled alignment check below)
    sub r5,r10,#0               @r5 = wd
    @ble end_loops1
    pld [r12, r2, lsl #1]       @prefetch two strides ahead of each row
    vld1.u32 {q0},[r12],r11     @vector load pu1_src
    pld [r4, r2, lsl #1]
    vld1.u32 {q1},[r12],r11
    vld1.u32 {q2},[r12],r11
    vld1.u32 {q3},[r12],r11
    vld1.u32 {q6},[r12],r11
    vmull.u8 q4,d2,d25          @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vld1.u32 {q7},[r12],r11
    vmlal.u8 q4,d6,d27          @mul_res = vmlal_u8(src[0_3], coeffabs_3)
    vld1.u32 {q8},[r12],r11
    vmlsl.u8 q4,d0,d24          @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    vld1.u32 {q9},[r12],r11
    vmlsl.u8 q4,d4,d26          @mul_res = vmlsl_u8(src[0_2], coeffabs_2)
    vmlal.u8 q4,d12,d28         @mul_res = vmlal_u8(src[0_4], coeffabs_4)
    vmlsl.u8 q4,d14,d29         @mul_res = vmlsl_u8(src[0_5], coeffabs_5)
    vmlal.u8 q4,d16,d30         @mul_res = vmlal_u8(src[0_6], coeffabs_6)
    vmlsl.u8 q4,d18,d31         @mul_res = vmlsl_u8(src[0_7], coeffabs_7)


inner_loop_16:


    subs r5,r5,#16
    vmull.u8 q10,d3,d25         @row 0, high 8 columns

    add r12,#8                  @net advance: 8 loads stepped +8, plus this +8
                                @= 16 columns consumed
    vmlsl.u8 q10,d1,d24

    subeq r14,r14,#2            @row pair complete when the width is exhausted
    vmlal.u8 q10,d7,d27

    vld1.u32 {q0},[r4],r11      @vector load pu1_src + src_strd (row 1)
    vmlsl.u8 q10,d5,d26

    vld1.u32 {q1},[r4],r11
    vmlal.u8 q10,d13,d28

    vld1.u32 {q2},[r4],r11
    vmlal.u8 q10,d17,d30

    vld1.u32 {q3},[r4],r11
    vmlsl.u8 q10,d15,d29

    vld1.u32 {q6},[r4],r11
    vmlsl.u8 q10,d19,d31

    vld1.u32 {q7},[r4],r11
    vqrshrun.s16 d8,q4,#6       @round/narrow row 0, low 8 columns

    vld1.u32 {q8},[r4],r11
    vmull.u8 q5,d2,d25          @row 1, low 8: mul_res = vmull_u8(src[1_1], coeffabs_1)

    vld1.u32 {q9},[r4],r11
    vmlal.u8 q5,d6,d27          @mul_res = vmlal_u8(src[1_3], coeffabs_3)

    add r4,#8
    vmlsl.u8 q5,d0,d24          @mul_res = vmlsl_u8(src[1_0], coeffabs_0)

    addeq r12,r12,r9            @end of row pair: advance src by 2*src_strd - wd
    vmlsl.u8 q5,d4,d26          @mul_res = vmlsl_u8(src[1_2], coeffabs_2)

    addeq r4,r12,r2             @and re-derive the row-1 pointer
    vqrshrun.s16 d9,q10,#6      @round/narrow row 0, high 8 columns

    vmlal.u8 q5,d12,d28         @mul_res = vmlal_u8(src[1_4], coeffabs_4)

@   and r7, r12, #31
    vmlsl.u8 q5,d14,d29         @mul_res = vmlsl_u8(src[1_5], coeffabs_5)

    vmlal.u8 q5,d16,d30         @mul_res = vmlal_u8(src[1_6], coeffabs_6)

    vmlsl.u8 q5,d18,d31         @mul_res = vmlsl_u8(src[1_7], coeffabs_7)

    vmull.u8 q11,d3,d25         @row 1, high 8 columns

    vmlsl.u8 q11,d1,d24

    vst1.8 {q4},[r1]!           @store 16 row-0 pixels to pu1_dst
    vmlal.u8 q11,d7,d27

    addeq r1,r1,r8              @end of row pair: advance dst by 2*dst_strd - wd
    vqrshrun.s16 d10,q5,#6      @round/narrow row 1, low 8 columns

@   cmp r7, r0
    vmlsl.u8 q11,d5,d26

    pld [r12, r2, lsl #2]
    vmlal.u8 q11,d13,d28

    pld [r4, r2, lsl #2]
    vmlal.u8 q11,d17,d30

@   mov r0, r7
    vmlsl.u8 q11,d15,d29

    cmp r14,#0                  @all row pairs done?
    vmlsl.u8 q11,d19,d31

    beq epilog_16
    @pipeline prologue for the next iteration: reload row 0 and start its MACs
    vld1.u32 {q0},[r12],r11     @vector load pu1_src
    vld1.u32 {q1},[r12],r11
    vld1.u32 {q2},[r12],r11
    vld1.u32 {q3},[r12],r11
    vld1.u32 {q6},[r12],r11
    vqrshrun.s16 d11,q11,#6     @round/narrow row 1, high 8 columns
    vmull.u8 q4,d2,d25          @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vld1.u32 {q7},[r12],r11
    vmlal.u8 q4,d6,d27          @mul_res = vmlal_u8(src[0_3], coeffabs_3)
    vld1.u32 {q8},[r12],r11
    vmlsl.u8 q4,d0,d24          @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    vld1.u32 {q9},[r12],r11
    vmlsl.u8 q4,d4,d26          @mul_res = vmlsl_u8(src[0_2], coeffabs_2)
    vmlal.u8 q4,d12,d28         @mul_res = vmlal_u8(src[0_4], coeffabs_4)
    cmp r5,#0
    vmlsl.u8 q4,d14,d29         @mul_res = vmlsl_u8(src[0_5], coeffabs_5)
    moveq r5,r10                @row pair finished: reset the width counter
    vmlal.u8 q4,d16,d30         @mul_res = vmlal_u8(src[0_6], coeffabs_6)
    vst1.8 {q5},[r6]!           @store 16 row-1 pixels to pu1_dst + dst_strd
    vmlsl.u8 q4,d18,d31         @mul_res = vmlsl_u8(src[0_7], coeffabs_7)
    addeq r6,r1,r3              @pu1_dst + dst_strd for the next row pair
    b inner_loop_16


@pipeline epilogue: finish and store the last row-1 result left in flight
epilog_16:
    vqrshrun.s16 d11,q11,#6     @round/narrow the final row-1 high 8 columns
    vst1.8 {q5},[r6]!           @store the final 16 row-1 pixels

    ldr r7, [sp], #4            @unspill r7/r0
    ldr r0, [sp], #4
    ldr r10,[sp,#48]            @loads wd
    cmp r10,#24

    beq outer_loop8_residual    @wd 24: the rightmost 8 columns are still owed



end_loops1:

    ldmfd sp!,{r4-r12,r15}      @restore registers and return (pc <- saved lr)


@residual pass for wd == 12: redo the rightmost 4 columns with the 4-wide loop
outer_loop4_residual:
    sub r12,r0,#3               @pu1_src - 3
    mov r1,r7                   @restore the saved pu1_dst
    add r1,#8                   @skip the 8 columns already written
    mov r10,#4
    add r12,#8
    mov r14,#16                 @NOTE(review): ht is hard-coded to 16 for the
                                @wd == 12 case - confirm with callers
    add r8,#4                   @restore 2*dst_strd - wd for wd = 4
    add r9,#4

@4-wide path: two rows per iteration; the rows are vzip'ed into one register
@pair so a single set of 8-lane multiplies filters both rows at once
outer_loop_4:
    add r6,r1,r3                @pu1_dst + dst_strd
    add r4,r12,r2               @pu1_src + src_strd

    subs r5,r10,#0              @r5 = wd (checks wd)
    ble end_inner_loop_4

inner_loop_4:
    vld1.u32 {d0},[r12],r11     @vector load pu1_src (sliding window, step 1)
    vld1.u32 {d1},[r12],r11
    vld1.u32 {d2},[r12],r11
    vld1.u32 {d3},[r12],r11
    vld1.u32 {d4},[r12],r11
    vld1.u32 {d5},[r12],r11
    vld1.u32 {d6},[r12],r11
    vld1.u32 {d7},[r12],r11
    @add r12,r12,#4             @increment the input pointer
    sub r12,r12,#4              @8 one-byte steps advanced 8, but only 4
                                @outputs are produced: rewind by 4
    @vext.u8 d2,d0,d1,#2       @vector extract of src[0_2]
    @vext.u8 d3,d0,d1,#3       @vector extract of src[0_3]
    @vext.u8 d4,d0,d1,#4       @vector extract of src[0_4]

    @vext.u8 d5,d0,d1,#5       @vector extract of src[0_5]
    @vext.u8 d6,d0,d1,#6       @vector extract of src[0_6]
    @vext.u8 d7,d0,d1,#7       @vector extract of src[0_7]
    @vext.u8 d1,d0,d1,#1       @vector extract of src[0_1]
    vld1.u32 {d12},[r4],r11     @vector load pu1_src + src_strd (row 1)
    vld1.u32 {d13},[r4],r11
    vzip.32 d0,d12              @zip rows 0 and 1 into one register so a single
                                @multiply covers both 4-pixel rows
    vld1.u32 {d14},[r4],r11
    vzip.32 d1,d13
    vld1.u32 {d15},[r4],r11
    vzip.32 d2,d14
    vld1.u32 {d16},[r4],r11
    vzip.32 d3,d15
    vld1.u32 {d17},[r4],r11
    vzip.32 d4,d16
    vld1.u32 {d18},[r4],r11
    vzip.32 d5,d17
    vld1.u32 {d19},[r4],r11
    sub r4,r4,#4                @rewind the row-1 pointer the same way
    @ add r4,r4,#4              @increment the input pointer
    @ vext.u8 d14,d12,d13,#2   @vector extract of src[0_2]
    @ vext.u8 d15,d12,d13,#3   @vector extract of src[0_3]
    @ vext.u8 d16,d12,d13,#4   @vector extract of src[0_4]
    @ vext.u8 d17,d12,d13,#5   @vector extract of src[0_5]
    @ vext.u8 d18,d12,d13,#6   @vector extract of src[0_6]
    @ vext.u8 d19,d12,d13,#7   @vector extract of src[0_7]
    @vext.u8 d13,d12,d13,#1    @vector extract of src[0_1]







    vzip.32 d6,d18
    vzip.32 d7,d19

    @one 8-lane MAC chain filters both zipped rows at the same time
    vmull.u8 q4,d1,d25
    vmlsl.u8 q4,d0,d24
    vmlsl.u8 q4,d2,d26
    vmlal.u8 q4,d3,d27
    vmlal.u8 q4,d4,d28
    vmlsl.u8 q4,d5,d29
    vmlal.u8 q4,d6,d30
    vmlsl.u8 q4,d7,d31

    vqrshrun.s16 d8,q4,#6       @round, shift by 6 and saturating-narrow
    vst1.32 {d8[0]},[r1]!       @store the row-0 half of the result register
    vst1.32 {d8[1]},[r6]!       @store the row-1 half of the result register
    subs r5,r5,#4               @decrement the wd by 4
    bgt inner_loop_4

end_inner_loop_4:
    subs r14,r14,#2             @decrement ht by the 2 rows just completed
    add r12,r12,r9              @advance the src pointer by 2*src_strd - wd
    add r1,r1,r8                @advance the dst pointer by 2*dst_strd - wd
    bgt outer_loop_4
    @subs r7,r7,#1
    @ bgt start_loop_count

    ldmfd sp!,{r4-r12,r15}      @restore registers and return (pc <- saved lr)