///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_inter_pred_chroma_horz_neon.s
//*
//* //brief
//*  contains function definitions for inter prediction interpolation.
//*  functions are coded using neon intrinsics and can be compiled using
//*  rvct
//*
//* //author
//*  yogeswaran rs / akshaya mukund
//*
//* //par list of functions:
//*  - ihevc_inter_pred_chroma_horz_av8
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//*  chroma interprediction filter for horizontal input
//*
//* //par description:
//*  applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
//*  to the elements pointed by 'pu1_src' and writes to the location pointed
//*  by 'pu1_dst'. the output is rounded, downshifted by 6 and clipped to
//*  8 bits (sqrshrun #6).
//*  each row holds 2 * wd bytes; the four filter taps are read 2 bytes apart
//*  starting at pu1_src - 2.
//*  the code uses the absolute values of the first four coefficients and
//*  applies the fixed sign pattern (-, +, +, -) to them.
//*  NOTE(review): this presumes the coefficient table has that sign pattern -
//*  confirm against the caller's filter tables.
//*  assumptions : the function is optimized considering the fact width is
//*  multiple of 2,4 or 8. if width is 2, then height should be multiple of 2.
//*  width 4,8 is optimized further
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] dst_strd
//*  integer destination stride
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients (8 bytes are loaded, lanes 0..3
//*  are used)
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_inter_pred_chroma_horz(uword8 *pu1_src,
//                                  uword8 *pu1_dst,
//                                  word32 src_strd,
//                                  word32 dst_strd,
//                                  word8 *pi1_coeff,
//                                  word32 ht,
//                                  word32 wd)
//**************variables vs registers*****************************************
//x0  => *pu1_src
//x1  => *pu1_dst
//x2  => src_strd          (sign-extended from w2 at entry)
//x3  => dst_strd          (sign-extended from w3 at entry)
//x4  => *pi1_coeff        (reused as second-row source pointer afterwards)
//x5  => ht on entry, then 2 * wd (bytes per row)
//x6  => wd on entry, then scratch / second-row destination pointer
//x7  => ht, later per-row byte counter in the 8/4 byte paths
//x10 => wd, later per-row byte counter in the 16 byte path
//x14 => remaining rows (8/4 byte paths) or remaining bytes (16 byte path)
//v24..v27 => |coeff[0]| .. |coeff[3]| broadcast to all byte lanes

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_horz_av8

.type ihevc_inter_pred_chroma_horz_av8, %function

ihevc_inter_pred_chroma_horz_av8:

    // v9-v15 are used below and their low halves are callee-saved.
    // d8 is not used by this function; it is only pushed together with d15
    // so that every push is a 16-byte stp and sp stays 16-byte aligned.
    stp         d9,d10,[sp,#-16]!
    stp         d11,d12,[sp,#-16]!
    stp         d13,d14,[sp,#-16]!
    stp         d8,d15,[sp,#-16]!
    stp         x19, x20,[sp,#-16]!

    // The word32 arguments are used in 64-bit address arithmetic below, so
    // make their upper halves well defined first.
    sxtw        x2,w2                       //src_strd
    sxtw        x3,w3                       //dst_strd
    sxtw        x7,w5                       //ht
    sxtw        x10,w6                      //wd

    ld1         {v0.8b},[x4]                //coeff = vld1_s8(pi1_coeff)
    subs        x14,x7,#0                   //x14 = ht, sets flags for ht <= 0
    abs         v2.8b, v0.8b                //vabs_s8(coeff)
    mov         x11,#2                      //byte distance between two taps
    ble         end_loops                   //nothing to do for ht <= 0

    dup         v24.8b, v2.b[0]             //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub         x12,x0,#2                   //pu1_src - 2 (position of tap 0)
    dup         v25.8b, v2.b[1]             //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add         x4,x12,x2                   //second row: pu1_src - 2 + src_strd
    dup         v26.8b, v2.b[2]             //coeffabs_2 = vdup_lane_u8(coeffabs, 2)

    tst         x10,#3                      //is wd a multiple of 4 ?
    lsl         x5, x10, #1                 //x5 = 2 * wd (bytes per row)

    dup         v27.8b, v2.b[3]             //coeffabs_3 = vdup_lane_u8(coeffabs, 3)

    bne         outer_loop_4                //wd not a multiple of 4 : 4 bytes at a time
    cmp         x10,#12
    beq         skip_16                     //wd == 12 : 24 bytes per row, use the 8 byte paths

    cmp         x10,#8
    bge         outer_loop_16               //wd >= 8 : 16 bytes x 2 rows at a time
skip_16:
    tst         x7,#3                       //is ht a multiple of 4 ?

    sub         x9,x0,#2                    //pu1_src - 2 for the ht_4 path
    beq         outer_loop_ht_4             //ht multiple of 4 : 8 bytes x 4 rows

    b           outer_loop_8                //otherwise : 8 bytes x 2 rows


//-----------------------------------------------------------------------------
// 16 bytes x 2 rows per step, software pipelined.
//   v30/v28 : row 0 accumulators (bytes 0-7 / 8-15)
//   v22/v20 : row 1 accumulators (bytes 0-7 / 8-15)
//   x12/x4  : row 0 / row 1 source pointers, x19 : pointer + 8 helper
//   x1/x13  : destination pointers for bytes 0-7 / 8-15
//-----------------------------------------------------------------------------
outer_loop_16:
    mov         x10,x5                      //x10 = bytes left in the current row
    mul         x14, x14 , x10              //x14 = ht * 2wd = total bytes to produce

    sub         x20,x3,#16
    neg         x6, x20                     //x6 = 16 - dst_strd : row 1 -> next 16 bytes of row 0

    add         x4,x12,x2
    mov         x9,#10                      //last tap load advances by 10 : 2+2+2+10 = 16
    sub         x20,x5,x3,lsl #1
    neg         x8, x20                     //x8  = 2*dst_strd - 2wd : dst end of row -> next row pair
    sub         x20,x5,x2,lsl #1
    neg         x15, x20                    //x15 = 2*src_strd - 2wd : src end of row -> next row pair
    add         x20,x12, x2 , lsl #1
    prfm        PLDL1KEEP,[x20]

    add         x19,x12,#8
    ld1         { v0.2s},[x12],x11          //row 0 tap 0, bytes 0-7
    ld1         { v1.2s},[x19],x11          //row 0 tap 0, bytes 8-15
    add         x20,x4, x2 , lsl #1
    prfm        PLDL1KEEP,[x20]

    ld1         { v2.2s},[x12],x11          //row 0 tap 1
    ld1         { v3.2s},[x19],x11

    ld1         { v4.2s},[x12],x11          //row 0 tap 2
    ld1         { v5.2s},[x19],x11

    ld1         { v6.2s},[x12],x9           //row 0 tap 3, x12 now 16 bytes further
    ld1         { v7.2s},[x19],x9

    add         x19,x4,#8
    umull       v30.8h, v2.8b, v25.8b       //acc  = tap1 * coeffabs_1
    ld1         { v29.2s},[x4],x11          //row 1 tap 0, bytes 0-7
    ld1         { v9.2s},[x19],x11          //row 1 tap 0, bytes 8-15

    umlsl       v30.8h, v0.8b, v24.8b       //acc -= tap0 * coeffabs_0

    ld1         { v10.2s},[x4],x11          //row 1 tap 1
    ld1         { v11.2s},[x19],x11

    umlal       v30.8h, v4.8b, v26.8b       //acc += tap2 * coeffabs_2

    ld1         { v12.2s},[x4],x11          //row 1 tap 2
    ld1         { v13.2s},[x19],x11

    umlsl       v30.8h, v6.8b, v27.8b       //acc -= tap3 * coeffabs_3

    ld1         { v14.2s},[x4],x9           //row 1 tap 3
    ld1         { v15.2s},[x19],x9

    umull       v28.8h, v3.8b, v25.8b       //same filter for row 0 bytes 8-15
    umlsl       v28.8h, v1.8b, v24.8b
    umlal       v28.8h, v5.8b, v26.8b
    umlsl       v28.8h, v7.8b, v27.8b

    cmp         x14,#32                     //only one 16x2 block in total ?
    beq         epilog_end
    sub         x14, x14,#64                //first and last block are handled outside the loop

inner_loop_16:
    // x20 still holds the last address computed before this point; these
    // two prefetches are hints only and do not affect results.
    prfm        PLDL1KEEP,[x20]
    prfm        PLDL1KEEP,[x20]

    subs        x10,x10,#16                 //eq : the block just loaded ended a row

    umull       v22.8h, v10.8b, v25.8b      //row 1 bytes 0-7 : acc = tap1 * coeffabs_1

    add         x20,x12,x15
    csel        x12, x20, x12,eq            //row done : src row 0 jumps two source rows down
    add         x20,x12,x2
    csel        x4, x20, x4,eq              //row done : src row 1 = src row 0 + src_strd
    umlsl       v22.8h, v29.8b, v24.8b      //acc -= tap0 * coeffabs_0

    add         x20,x12, x2 , lsl #2
    prfm        PLDL1KEEP,[x20]
    sqrshrun    v30.8b, v30.8h,#6           //row 0 bytes 0-7 : round, shift, saturate

    add         x19,x12,#8
    ld1         { v0.2s},[x12],x11          //next block, row 0 tap 0
    ld1         { v1.2s},[x19],x11

    sqrshrun    v31.8b, v28.8h,#6           //row 0 bytes 8-15

    ld1         { v2.2s},[x12],x11          //next block, row 0 tap 1
    ld1         { v3.2s},[x19],x11
    umlal       v22.8h, v12.8b, v26.8b      //acc += tap2 * coeffabs_2

    ld1         { v4.2s},[x12],x11          //next block, row 0 tap 2
    ld1         { v5.2s},[x19],x11
    umlsl       v22.8h, v14.8b, v27.8b      //acc -= tap3 * coeffabs_3

    ld1         { v6.2s},[x12],x9           //next block, row 0 tap 3
    ld1         { v7.2s},[x19],x9
    umull       v20.8h, v11.8b, v25.8b      //row 1 bytes 8-15

    add         x20,x4, x2 , lsl #2
    prfm        PLDL1KEEP,[x20]
    umlsl       v20.8h, v9.8b, v24.8b

    add         x13,x1,#8
    st1         { v30.4h}, [x1],x3          //store row 0 bytes 0-7, x1 -> row 1
    st1         { v31.4h}, [x13],x3         //store row 0 bytes 8-15, x13 -> row 1
    umlal       v20.8h, v13.8b, v26.8b

    add         x19,x4,#8
    ld1         { v29.2s},[x4],x11          //next block, row 1 tap 0
    ld1         { v9.2s},[x19],x11
    umlsl       v20.8h, v15.8b, v27.8b

    ld1         { v10.2s},[x4],x11          //next block, row 1 tap 1
    ld1         { v11.2s},[x19],x11
    umull       v30.8h, v2.8b, v25.8b       //next block, row 0 bytes 0-7

    ld1         { v12.2s},[x4],x11          //next block, row 1 tap 2
    ld1         { v13.2s},[x19],x11
    umlsl       v30.8h, v0.8b, v24.8b

    ld1         { v14.2s},[x4],x9           //next block, row 1 tap 3
    ld1         { v15.2s},[x19],x9          //x19 is recomputed before its next use
    umlal       v30.8h, v4.8b, v26.8b

    cmp         x10,#0                      //re-establish eq = "row finished" for the csel's below
    sqrshrun    v22.8b, v22.8h,#6           //row 1 bytes 0-7
    sqrshrun    v23.8b, v20.8h,#6           //row 1 bytes 8-15

    umlsl       v30.8h, v6.8b, v27.8b

    csel        x10, x5, x10,eq             //row done : reload the byte counter with 2wd
    umull       v28.8h, v3.8b, v25.8b       //next block, row 0 bytes 8-15

    st1         { v22.4h},[x1],x6           //store row 1, x1 -> row 0 + 16
    st1         { v23.4h},[x13],x6
    umlsl       v28.8h, v1.8b, v24.8b

    add         x20,x1,x8
    csel        x1, x20, x1,eq              //row done : dst jumps two destination rows down
    umlal       v28.8h, v5.8b, v26.8b

    subs        x14,x14,#32                 //one 16x2 block finished
    umlsl       v28.8h, v7.8b, v27.8b

    bgt         inner_loop_16

    add         x14,x14,#64
    cmp         x14,#32
    beq         epilog_end                  //the block already loaded is the last one

epilog:
    // Finish the block in flight and load + start the final block.
    sqrshrun    v30.8b, v30.8h,#6
    sqrshrun    v31.8b, v28.8h,#6

    add         x13,x1,#8
    st1         { v30.4h}, [x1],x3
    st1         { v31.4h}, [x13],x3

    umull       v22.8h, v10.8b, v25.8b      //row 1 bytes 0-7
    umlsl       v22.8h, v29.8b, v24.8b
    subs        x10,x10,#16                 //eq : the block in flight ended a row
    umlal       v22.8h, v12.8b, v26.8b
    add         x20,x12,x15
    csel        x12, x20, x12,eq            //row done : src row 0 jumps two source rows down
    umlsl       v22.8h, v14.8b, v27.8b
    csel        x10, x5, x10,eq             //row done : reload the byte counter with 2wd

    add         x20,x12,x2
    csel        x4, x20, x4,eq              //row done : src row 1 = src row 0 + src_strd
    umull       v20.8h, v11.8b, v25.8b      //row 1 bytes 8-15

    add         x19,x12,#8
    ld1         { v0.2s},[x12],x11          //last block, row 0 tap 0
    ld1         { v1.2s},[x19],x11

    umlsl       v20.8h, v9.8b, v24.8b
    ld1         { v2.2s},[x12],x11          //last block, row 0 tap 1
    ld1         { v3.2s},[x19],x11
    umlal       v20.8h, v13.8b, v26.8b

    ld1         { v4.2s},[x12],x11          //last block, row 0 tap 2
    ld1         { v5.2s},[x19],x11

    umlsl       v20.8h, v15.8b, v27.8b
    ld1         { v6.2s},[x12],x9           //last block, row 0 tap 3
    ld1         { v7.2s},[x19],x9
    umull       v30.8h, v2.8b, v25.8b       //last block, row 0 bytes 0-7

    add         x19,x4,#8
    ld1         { v29.2s},[x4],x11          //last block, row 1 tap 0
    ld1         { v9.2s},[x19],x11
    umlsl       v30.8h, v0.8b, v24.8b
    ld1         { v10.2s},[x4],x11          //last block, row 1 tap 1
    ld1         { v11.2s},[x19],x11
    umlal       v30.8h, v4.8b, v26.8b

    umlsl       v30.8h, v6.8b, v27.8b

    ld1         { v12.2s},[x4],x11          //last block, row 1 tap 2
    ld1         { v13.2s},[x19],x11
    umull       v28.8h, v3.8b, v25.8b       //last block, row 0 bytes 8-15
    ld1         { v14.2s},[x4],x9           //last block, row 1 tap 3
    ld1         { v15.2s},[x19],x9
    umlsl       v28.8h, v1.8b, v24.8b
    sqrshrun    v22.8b, v22.8h,#6
    sqrshrun    v23.8b, v20.8h,#6

    st1         { v22.4h},[x1],x6           //store row 1, x1 -> row 0 + 16
    st1         { v23.4h},[x13],x6
    umlal       v28.8h, v5.8b, v26.8b

    umlsl       v28.8h, v7.8b, v27.8b
    add         x20,x1,x8
    csel        x1, x20, x1,eq              //flags still from the subs above : row done -> next row pair

epilog_end:
    // Drain the pipeline: the final block is fully loaded, row 0 is
    // accumulated; compute row 1 and store both rows.
    sqrshrun    v30.8b, v30.8h,#6
    sqrshrun    v31.8b, v28.8h,#6

    umull       v22.8h, v10.8b, v25.8b      //row 1 bytes 0-7
    umlsl       v22.8h, v29.8b, v24.8b
    umlal       v22.8h, v12.8b, v26.8b
    umlsl       v22.8h, v14.8b, v27.8b

    umull       v20.8h, v11.8b, v25.8b      //row 1 bytes 8-15
    umlsl       v20.8h, v9.8b, v24.8b
    umlal       v20.8h, v13.8b, v26.8b
    umlsl       v20.8h, v15.8b, v27.8b
    sqrshrun    v22.8b, v22.8h,#6
    sqrshrun    v23.8b, v20.8h,#6

    add         x13,x1,#8

    st1         { v30.4h}, [x1],x3          //row 0
    st1         { v31.4h}, [x13],x3

    st1         { v22.4h},[x1]              //row 1
    st1         { v23.4h},[x13]

    b           end_loops


//-----------------------------------------------------------------------------
// 8 bytes x 2 rows per step.
//   x12/x4 : row 0 / row 1 source pointers, x1/x6 : row 0 / row 1 destination
//-----------------------------------------------------------------------------
outer_loop_8:
    add         x6,x1,x3                    //pu1_dst + dst_strd
    mov         x7,x5                       //x7 = bytes left in the row (2wd)
    add         x4,x12,x2                   //pu1_src + src_strd

inner_loop_8:
    ld1         {v0.2s},[x12],x11           //row 0 tap 0
    ld1         {v1.2s},[x12],x11           //row 0 tap 1
    ld1         {v2.2s},[x12],x11           //row 0 tap 2
    ld1         {v3.2s},[x12],x11           //row 0 tap 3, x12 now 8 bytes further

    umull       v29.8h, v1.8b, v25.8b       //acc  = tap1 * coeffabs_1
    umlsl       v29.8h, v0.8b, v24.8b       //acc -= tap0 * coeffabs_0
    umlal       v29.8h, v2.8b, v26.8b       //acc += tap2 * coeffabs_2
    umlsl       v29.8h, v3.8b, v27.8b       //acc -= tap3 * coeffabs_3

    ld1         {v4.2s},[x4],x11            //row 1 tap 0
    ld1         {v5.2s},[x4],x11            //row 1 tap 1
    ld1         {v6.2s},[x4],x11            //row 1 tap 2
    ld1         {v7.2s},[x4],x11            //row 1 tap 3

    umull       v10.8h, v5.8b, v25.8b       //row 1, same filter
    umlsl       v10.8h, v4.8b, v24.8b
    sqrshrun    v29.8b, v29.8h,#6           //right shift and saturating narrow result 1
    umlal       v10.8h, v6.8b, v26.8b
    umlsl       v10.8h, v7.8b, v27.8b

    st1         {v29.8b},[x1],#8            //store row 0

    sqrshrun    v10.8b, v10.8h,#6           //right shift and saturating narrow result 2
    subs        x7,x7,#8                    //decrement the wd loop
    st1         {v10.8b},[x6],#8            //store row 1
    bgt         inner_loop_8

    sub         x12,x12,x5                  //back to the start of the row ...
    subs        x14,x14,#2                  //decrement the ht loop
    sub         x1,x1,x5
    add         x12,x12,x2,lsl #1           //... and two rows down
    add         x1,x1,x3,lsl #1
    bgt         outer_loop_8
    b           end_loops


//-----------------------------------------------------------------------------
// 8 bytes x 4 rows per step, software pipelined (ht is a multiple of 4).
//   x9  : source column base (pu1_src - 2 + column offset)
//   x12 : running source pointer, walks down the 4 rows
//   x4  : running destination pointer, x1 : destination column base
//   rows (1)..(4) use v0-v3, v4-v7, v14-v17, v18-v21 as taps 0..3
//-----------------------------------------------------------------------------
outer_loop_ht_4:

    mov         x7,x5                       //x7 = bytes left in the row (2wd)

prologue_ht_4:

inner_loop_ht_4:

    mov         x12,x9
    mov         x4,x1

    sub         x8, x2, #6                  //tap 3 -> tap 0 of the next row (3 taps * 2 already added)

    ld1         {v0.2s},[x12],x11           //(1) tap 0
    ld1         {v1.2s},[x12],x11           //(1) tap 1
    ld1         {v2.2s},[x12],x11           //(1) tap 2
    ld1         {v3.2s},[x12],x8            //(1) tap 3, then down one row

    ld1         {v4.2s},[x12],x11           //(2) tap 0
    ld1         {v5.2s},[x12],x11           //(2) tap 1
    ld1         {v6.2s},[x12],x11           //(2) tap 2
    ld1         {v7.2s},[x12],x8            //(2) tap 3, then down one row

    ld1         {v14.2s},[x12],x11          //(3) tap 0
    umull       v29.8h, v1.8b, v25.8b       //(1) acc  = tap1 * coeffabs_1

    ld1         {v15.2s},[x12],x11          //(3) tap 1
    umlsl       v29.8h, v0.8b, v24.8b       //(1) acc -= tap0 * coeffabs_0

    ld1         {v16.2s},[x12],x11          //(3) tap 2
    umlal       v29.8h, v2.8b, v26.8b       //(1) acc += tap2 * coeffabs_2

    ld1         {v17.2s},[x12],x8           //(3) tap 3, then down one row
    umlsl       v29.8h, v3.8b, v27.8b       //(1) acc -= tap3 * coeffabs_3

    umull       v10.8h, v5.8b, v25.8b       //(2) acc  = tap1 * coeffabs_1

    ld1         {v18.2s},[x12],x11          //(4) tap 0
    umlsl       v10.8h, v4.8b, v24.8b       //(2) acc -= tap0 * coeffabs_0

    ld1         {v19.2s},[x12],x11          //(4) tap 1
    umlal       v10.8h, v6.8b, v26.8b       //(2) acc += tap2 * coeffabs_2

    ld1         {v20.2s},[x12],x11          //(4) tap 2
    umlsl       v10.8h, v7.8b, v27.8b       //(2) acc -= tap3 * coeffabs_3

    ld1         {v21.2s},[x12],x2           //(4) tap 3 (x12 is reloaded from x9 before reuse)
    sqrshrun    v29.8b, v29.8h,#6           //(1) right shift and saturating narrow

    add         x9,x9,#8                    //next 8 byte column

    subs        x7,x7,#8                    //(prologue) decrement the wd loop
    beq         epilogue

core_loop:
    mov         x12,x9

    ld1         {v0.2s},[x12],x11           //(1_1) tap 0
    umull       v12.8h, v15.8b, v25.8b      //(3) acc  = tap1 * coeffabs_1

    ld1         {v1.2s},[x12],x11           //(1_1) tap 1
    umlsl       v12.8h, v14.8b, v24.8b      //(3) acc -= tap0 * coeffabs_0

    ld1         {v2.2s},[x12],x11           //(1_1) tap 2
    umlal       v12.8h, v16.8b, v26.8b      //(3) acc += tap2 * coeffabs_2

    ld1         {v3.2s},[x12],x8            //(1_1) tap 3, then down one row
    umlsl       v12.8h, v17.8b, v27.8b      //(3) acc -= tap3 * coeffabs_3

    st1         {v29.8b},[x4],x3            //(1) store the result pu1_dst
    sqrshrun    v10.8b, v10.8h,#6           //(2) right shift and saturating narrow

    ld1         {v4.2s},[x12],x11           //(2_1) tap 0
    umull       v22.8h, v19.8b, v25.8b      //(4) acc  = tap1 * coeffabs_1

    ld1         {v5.2s},[x12],x11           //(2_1) tap 1
    umlsl       v22.8h, v18.8b, v24.8b      //(4) acc -= tap0 * coeffabs_0

    ld1         {v6.2s},[x12],x11           //(2_1) tap 2
    umlal       v22.8h, v20.8b, v26.8b      //(4) acc += tap2 * coeffabs_2

    ld1         {v7.2s},[x12],x8            //(2_1) tap 3, then down one row
    umlsl       v22.8h, v21.8b, v27.8b      //(4) acc -= tap3 * coeffabs_3

    st1         {v10.8b},[x4],x3            //(2) store the result pu1_dst
    sqrshrun    v12.8b, v12.8h,#6           //(3) right shift and saturating narrow

    ld1         {v14.2s},[x12],x11          //(3_1) tap 0
    umull       v29.8h, v1.8b, v25.8b       //(1_1) acc  = tap1 * coeffabs_1

    ld1         {v15.2s},[x12],x11          //(3_1) tap 1
    umlsl       v29.8h, v0.8b, v24.8b       //(1_1) acc -= tap0 * coeffabs_0

    ld1         {v16.2s},[x12],x11          //(3_1) tap 2
    umlal       v29.8h, v2.8b, v26.8b       //(1_1) acc += tap2 * coeffabs_2

    ld1         {v17.2s},[x12],x8           //(3_1) tap 3, then down one row
    umlsl       v29.8h, v3.8b, v27.8b       //(1_1) acc -= tap3 * coeffabs_3

    st1         {v12.8b},[x4],x3            //(3) store the result pu1_dst
    sqrshrun    v22.8b, v22.8h,#6           //(4) right shift and saturating narrow

    add         x9,x9,#8                    //next 8 byte column

    umull       v10.8h, v5.8b, v25.8b       //(2_1) acc  = tap1 * coeffabs_1
    ld1         {v18.2s},[x12],x11          //(4_1) tap 0

    ld1         {v19.2s},[x12],x11          //(4_1) tap 1
    umlsl       v10.8h, v4.8b, v24.8b       //(2_1) acc -= tap0 * coeffabs_0

    ld1         {v20.2s},[x12],x11          //(4_1) tap 2
    umlal       v10.8h, v6.8b, v26.8b       //(2_1) acc += tap2 * coeffabs_2

    ld1         {v21.2s},[x12],x2           //(4_1) tap 3
    umlsl       v10.8h, v7.8b, v27.8b       //(2_1) acc -= tap3 * coeffabs_3

    add         x1,x1,#8                    //destination column base moves with the source

    subs        x7,x7,#8                    //decrement the wd loop

    st1         {v22.8b},[x4], x3           //(4) store the result pu1_dst
    sqrshrun    v29.8b, v29.8h,#6           //(1_1) right shift and saturating narrow

    mov         x4, x1                      //restart the destination walk at the new column

    bgt         core_loop                   //loopback (flags from the subs above)

epilogue:
    umull       v12.8h, v15.8b, v25.8b      //(3) acc  = tap1 * coeffabs_1
    umlsl       v12.8h, v14.8b, v24.8b      //(3) acc -= tap0 * coeffabs_0
    umlal       v12.8h, v16.8b, v26.8b      //(3) acc += tap2 * coeffabs_2
    umlsl       v12.8h, v17.8b, v27.8b      //(3) acc -= tap3 * coeffabs_3

    st1         {v29.8b},[x4],x3            //(1) store the result pu1_dst
    sqrshrun    v10.8b, v10.8h,#6           //(2) right shift and saturating narrow

    umull       v22.8h, v19.8b, v25.8b      //(4) acc  = tap1 * coeffabs_1
    umlsl       v22.8h, v18.8b, v24.8b      //(4) acc -= tap0 * coeffabs_0
    umlal       v22.8h, v20.8b, v26.8b      //(4) acc += tap2 * coeffabs_2
    umlsl       v22.8h, v21.8b, v27.8b      //(4) acc -= tap3 * coeffabs_3

    st1         {v10.8b},[x4],x3            //(2) store the result pu1_dst
    sqrshrun    v12.8b, v12.8h,#6           //(3) right shift and saturating narrow

    st1         {v12.8b},[x4],x3            //(3) store the result pu1_dst

    add         x1,x1,#8                    //account for the final column

    sqrshrun    v22.8b, v22.8h,#6           //(4) right shift and saturating narrow

    st1         {v22.8b},[x4], x3           //(4) store the result pu1_dst

    sub         x9,x9,x5                    //back to the start of the row ...
    subs        x14,x14,#4                  //decrement the ht loop
    sub         x1,x1,x5
    add         x9,x9,x2,lsl #2             //... and four rows down
    add         x1,x1,x3,lsl #2
    bgt         outer_loop_ht_4
    b           end_loops


//-----------------------------------------------------------------------------
// 4 bytes x 2 rows per step (wd not a multiple of 4).
// The two rows are packed into one vector so that a single multiply chain
// filters both.
//-----------------------------------------------------------------------------
outer_loop_4:
    add         x6,x1,x3                    //pu1_dst + dst_strd
    mov         x7,x5                       //x7 = bytes left in the row (2wd)
    add         x4,x12,x2                   //pu1_src + src_strd

inner_loop_4:
    ld1         {v20.2s},[x12],x11          //row 0 tap 0
    ld1         {v21.2s},[x12],x11          //row 0 tap 1
    ld1         {v22.2s},[x12],x11          //row 0 tap 2
    ld1         {v23.2s},[x12]              //row 0 tap 3

    sub         x12,x12,#2                  //net advance of 4 bytes (3 * 2 - 2)
    ld1         {v16.2s},[x4],x11           //row 1 tap 0
    ld1         {v17.2s},[x4],x11           //row 1 tap 1
    ld1         {v18.2s},[x4],x11           //row 1 tap 2
    ld1         {v19.2s},[x4]               //row 1 tap 3

    sub         x4,x4,#2                    //net advance of 4 bytes

    zip1        v0.2s, v20.2s, v16.2s       //lane 0 = row 0 bytes 0-3, lane 1 = row 1 bytes 0-3
    zip1        v1.2s, v21.2s, v17.2s
    zip1        v2.2s, v22.2s, v18.2s
    zip1        v3.2s, v23.2s, v19.2s

    umull       v29.8h, v1.8b, v25.8b       //acc  = tap1 * coeffabs_1 (both rows at once)
    umlsl       v29.8h, v0.8b, v24.8b       //acc -= tap0 * coeffabs_0
    umlal       v29.8h, v2.8b, v26.8b       //acc += tap2 * coeffabs_2
    umlsl       v29.8h, v3.8b, v27.8b       //acc -= tap3 * coeffabs_3

    sqrshrun    v29.8b, v29.8h,#6           //narrow right shift and saturating the result
    st1         {v29.s}[0],[x1],#4          //row 0 result is in the lower 32 bits
    subs        x7,x7,#4                    //decrement the wd by 4

    st1         {v29.s}[1],[x6],#4          //row 1 result is in the upper 32 bits

    bgt         inner_loop_4

    sub         x12,x12,x5                  //back to the start of the row ...
    subs        x14,x14,#2                  //decrement the ht by 2
    sub         x1,x1,x5
    add         x12,x12,x2,lsl #1           //... and two rows down
    add         x1,x1,x3,lsl #1
    bgt         outer_loop_4

end_loops:

    // Restore in the reverse order of the pushes at entry.
    ldp         x19, x20,[sp],#16
    ldp         d8,d15,[sp],#16             //d8 only travels along with d15, see entry
    ldp         d13,d14,[sp],#16
    ldp         d11,d12,[sp],#16
    ldp         d9,d10,[sp],#16
    ret