// ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s revision 25e8adb631df325607216ad6f3d6638442d9f453
//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//* ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
//*
//* @brief
//* Contains function definitions for inter prediction interpolation.
//*
//* @author
//* Mohit
//*
//* @par List of Functions:
//*
//* - ih264_inter_pred_luma_horz_hpel_vert_hpel_av8()
//*
//* @remarks
//* None
//*
//*******************************************************************************
//*/

//void ih264_inter_pred_luma_horz_hpel_vert_hpel(UWORD8 *pu1_src,
//                                               UWORD8 *pu1_dst,
//                                               WORD32 src_strd,
//                                               WORD32 dst_strd,
//                                               WORD32 ht,
//                                               WORD32 wd,
//                                               UWORD8* pu1_tmp,
//                                               UWORD32 dydx)
//
// What the code below does:
//   * The source pointer is moved back by 2 rows and 2 columns.
//   * Every output row is produced from six consecutive source rows:
//       - a vertical 6-tap pass (r0 + r5 + 20*(r2 + r3) - 5*(r1 + r4)) kept in
//         16-bit lanes (uaddl / mla / mls),
//       - a horizontal 6-tap pass with the same weights over those 16-bit
//         values, accumulated in 32-bit lanes (ext / saddl / smlal / smlsl),
//       - a rounding, saturating right shift by 10 (sqrshrun #10) followed by
//         a saturating narrow to 8 bits (uqxtn).
//   * wd == 4 and wd == 8 have dedicated loops; any other width takes the
//     16-wide path.
//   * Every loop emits 4 rows per pass and repeats while (ht -= 4) > 0.

//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
// x2 => src_strd
// x3 => dst_strd
// x4 => ht
// x5 => wd
// x6 => *pu1_tmp (not read by this routine)
// x7 => dydx     (not read by this routine; x7 is overwritten below)


.text
.p2align 2
.include "ih264_neon_macros.s"



    .global ih264_inter_pred_luma_horz_hpel_vert_hpel_av8

ih264_inter_pred_luma_horz_hpel_vert_hpel_av8:

    //store register values to stack
    push_v_regs                         // macro from ih264_neon_macros.s (presumably saves the callee-saved SIMD registers - confirm there)
    stp       x19, x20, [sp, #-16]!

    // The WORD32 arguments arrive in w2..w5. AAPCS64 leaves bits [63:32] of
    // those registers unspecified, and they are used below as 64-bit address
    // offsets and loop counters, so sign-extend them first.
    sxtw      x2, w2                    // src_strd
    sxtw      x3, w3                    // dst_strd
    sxtw      x4, w4                    // ht
    sxtw      x5, w5                    // wd

    sub       x0, x0, x2, lsl #1        //pu1_src-2*src_strd
    sub       x0, x0, #2                //pu1_src-2

    // Coefficients for the 4-wide and 8-wide loops. v25/v27 are also used as
    // scratch for packed output rows in those loops and are re-seeded from
    // v24/v26 there.
    movi      v26.8h, #0x14             // Filter coeff 20 into v26
    movi      v24.8h, #0x5              // Filter coeff 5 into v24
    movi      v27.8h, #0x14             // Filter coeff 20 into v27
    movi      v25.8h, #0x5              // Filter coeff 5 into v25
    mov       x7, #0x20                 // x7 is not read again in this routine
    mov       x8, #0x30                 // x8 is not read again in this routine
    subs      x12, x5, #4               //if wd=4 branch to loop_4
    beq       loop_4_start

    subs      x12, x5, #8               //if wd=8 branch to loop_8
    beq       loop_8_start

    //when wd=16
    // The 16-wide loop uses v24/v26 as temporaries, so it keeps its own copies
    // of the coefficients in v28 (20) and v30 (5).
    movi      v28.8h, #0x14             // Filter coeff 20 into v28
    movi      v30.8h, #0x5              // Filter coeff 5 into v30
    // Each row is fetched as 16 + 8 bytes; the first load post-increments by
    // 16, so the second one advances by src_strd - 16 to land on the next row.
    sub       x2, x2, #16
    ld1       {v0.2s, v1.2s}, [x0], #16 // Vector load from src[0_0]
    ld1       {v12.2s}, [x0], x2        // Vector load from src[0_0]
    ld1       {v2.2s, v3.2s}, [x0], #16 // Vector load from src[1_0]
    ld1       {v13.2s}, [x0], x2        // Vector load from src[1_0]
    ld1       {v4.2s, v5.2s}, [x0], #16 // Vector load from src[2_0]
    ld1       {v14.2s}, [x0], x2        // Vector load from src[2_0]
    ld1       {v6.2s, v7.2s}, [x0], #16 // Vector load from src[3_0]
    ld1       {v15.2s}, [x0], x2        // Vector load from src[3_0]
    ld1       {v8.2s, v9.2s}, [x0], #16 // Vector load from src[4_0]
    ld1       {v16.2s}, [x0], x2        // Vector load from src[4_0]
loop_16:

    ld1       {v10.2s, v11.2s}, [x0], #16 // Vector load from src[5_0]
    ld1       {v17.2s}, [x0], x2        // Vector load from src[5_0]

//ROW_1
    // Vertical pass: v18 = columns 0..7, v20 = columns 8..15,
    // v22 = columns 16..23 (16-bit lanes).
    uaddl     v20.8h, v4.8b, v6.8b
    uaddl     v18.8h, v0.8b, v10.8b
    uaddl     v22.8h, v2.8b, v8.8b
    mla       v18.8h, v20.8h , v28.8h
    uaddl     v24.8h, v5.8b, v7.8b
    uaddl     v20.8h, v1.8b, v11.8b
    uaddl     v26.8h, v3.8b, v9.8b
    mla       v20.8h, v24.8h , v28.8h
    uaddl     v24.8h, v14.8b, v15.8b
    mls       v18.8h, v22.8h , v30.8h
    uaddl     v22.8h, v12.8b, v17.8b
    mls       v20.8h, v26.8h , v30.8h
    uaddl     v26.8h, v13.8b, v16.8b
    mla       v22.8h, v24.8h , v28.8h
    mls       v22.8h, v26.8h , v30.8h

    // Horizontal pass for output columns 0..7 (taps taken from v18:v20).
    ext       v24.16b, v18.16b , v20.16b , #4
    ext       v26.16b, v18.16b , v20.16b , #6

    ext       v23.16b, v18.16b , v20.16b , #10
    add       v0.8h, v24.8h , v26.8h
    ext       v24.16b, v18.16b , v20.16b , #2
    ext       v26.16b, v18.16b , v20.16b , #8
    add       v24.8h, v24.8h , v26.8h

    saddl     v26.4s, v18.4h, v23.4h
    smlal     v26.4s, v0.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v23.4s, v18.8h, v23.8h
    smlal2    v23.4s, v0.8h, v28.8h
    smlsl2    v23.4s, v24.8h, v30.8h

    sqrshrun  v18.4h, v26.4s, #10
    sqrshrun  v19.4h, v23.4s, #10


    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.2s[1], v19.2s[0]

    // Horizontal pass for output columns 8..15 (taps taken from v20:v22).
    ext       v24.16b, v20.16b , v22.16b , #4
    ext       v26.16b, v20.16b , v22.16b , #6
    ext       v0.16b, v20.16b , v22.16b , #10

    add       v25.8h, v24.8h , v26.8h
    ext       v24.16b, v20.16b , v22.16b , #2
    ext       v26.16b, v20.16b , v22.16b , #8
    add       v24.8h, v24.8h , v26.8h

    saddl     v26.4s, v0.4h, v20.4h
    smlal     v26.4s, v25.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v22.4s, v0.8h, v20.8h
    smlal2    v22.4s, v25.8h, v28.8h
    smlsl2    v22.4s, v24.8h, v30.8h

    sqrshrun  v19.4h, v26.4s, #10
    sqrshrun  v25.4h, v22.4s, #10

    uaddl     v24.8h, v7.8b, v9.8b      // first vertical sums of the next row are interleaved from here on



    uqxtn     v19.8b, v19.8h
    uqxtn     v25.8b, v25.8h
    mov       v19.2s[1], v25.2s[0]

    uaddl     v22.8h, v4.8b, v10.8b
    ld1       {v0.2s, v1.2s}, [x0], #16 // Vector load from src[6_0]


    ld1       {v12.2s}, [x0], x2        // Vector load from src[6_0]
    uaddl     v20.8h, v6.8b, v8.8b
    uaddl     v26.8h, v5.8b, v11.8b
    st1       {v18.2s, v19.2s}, [x1], x3 // store row 0


//ROW_2


    uaddl     v18.8h, v2.8b, v0.8b

    mla       v18.8h, v20.8h , v28.8h

    uaddl     v20.8h, v3.8b, v1.8b

    mla       v20.8h, v24.8h , v28.8h
    uaddl     v24.8h, v15.8b, v16.8b
    mls       v18.8h, v22.8h , v30.8h
    uaddl     v22.8h, v13.8b, v12.8b
    mls       v20.8h, v26.8h , v30.8h
    uaddl     v26.8h, v14.8b, v17.8b
    mla       v22.8h, v24.8h , v28.8h
    mls       v22.8h, v26.8h , v30.8h

    ext       v24.16b, v18.16b , v20.16b , #4
    ext       v26.16b, v18.16b , v20.16b , #6

    ext       v23.16b, v18.16b , v20.16b , #10
    add       v2.8h, v24.8h , v26.8h
    ext       v24.16b, v18.16b , v20.16b , #2
    ext       v26.16b, v18.16b , v20.16b , #8
    add       v24.8h, v24.8h , v26.8h

    saddl     v26.4s, v18.4h, v23.4h
    smlal     v26.4s, v2.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v23.4s, v18.8h, v23.8h
    smlal2    v23.4s, v2.8h, v28.8h
    smlsl2    v23.4s, v24.8h, v30.8h

    sqrshrun  v18.4h, v26.4s, #10
    sqrshrun  v19.4h, v23.4s, #10



    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.2s[1], v19.2s[0]

    ext       v24.16b, v20.16b , v22.16b , #4
    ext       v26.16b, v20.16b , v22.16b , #6
    ext       v2.16b, v20.16b , v22.16b , #10

    add       v25.8h, v24.8h , v26.8h
    ext       v24.16b, v20.16b , v22.16b , #2
    ext       v26.16b, v20.16b , v22.16b , #8
    add       v24.8h, v24.8h , v26.8h

    saddl     v26.4s, v2.4h, v20.4h
    smlal     v26.4s, v25.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v22.4s, v2.8h, v20.8h
    smlal2    v22.4s, v25.8h, v28.8h
    smlsl2    v22.4s, v24.8h, v30.8h

    sqrshrun  v19.4h, v26.4s, #10
    sqrshrun  v25.4h, v22.4s, #10
    uaddl     v24.8h, v9.8b, v11.8b

    uqxtn     v19.8b, v19.8h
    uqxtn     v25.8b, v25.8h
    mov       v19.2s[1], v25.2s[0]


    uaddl     v22.8h, v6.8b, v0.8b
    ld1       {v2.2s, v3.2s}, [x0], #16 // Vector load from src[7_0]


    ld1       {v13.2s}, [x0], x2        // Vector load from src[7_0]
    uaddl     v20.8h, v8.8b, v10.8b
    uaddl     v26.8h, v7.8b, v1.8b
    st1       {v18.2s, v19.2s}, [x1], x3 // store row 1

//ROW_3


    uaddl     v18.8h, v4.8b, v2.8b

    mla       v18.8h, v20.8h , v28.8h

    uaddl     v20.8h, v5.8b, v3.8b

    mla       v20.8h, v24.8h , v28.8h
    uaddl     v24.8h, v16.8b, v17.8b
    mls       v18.8h, v22.8h , v30.8h
    uaddl     v22.8h, v14.8b, v13.8b
    mls       v20.8h, v26.8h , v30.8h
    uaddl     v26.8h, v15.8b, v12.8b
    mla       v22.8h, v24.8h , v28.8h
    mls       v22.8h, v26.8h , v30.8h

    ext       v24.16b, v18.16b , v20.16b , #4
    ext       v26.16b, v18.16b , v20.16b , #6

    ext       v23.16b, v18.16b , v20.16b , #10
    add       v4.8h, v24.8h , v26.8h
    ext       v24.16b, v18.16b , v20.16b , #2
    ext       v26.16b, v18.16b , v20.16b , #8
    add       v24.8h, v24.8h , v26.8h

    saddl     v26.4s, v18.4h, v23.4h
    smlal     v26.4s, v4.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v23.4s, v18.8h, v23.8h
    smlal2    v23.4s, v4.8h, v28.8h
    smlsl2    v23.4s, v24.8h, v30.8h

    sqrshrun  v18.4h, v26.4s, #10
    sqrshrun  v19.4h, v23.4s, #10


    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.2s[1], v19.2s[0]


    ext       v24.16b, v20.16b , v22.16b , #4
    ext       v26.16b, v20.16b , v22.16b , #6
    ext       v4.16b, v20.16b , v22.16b , #10

    add       v25.8h, v24.8h , v26.8h
    ext       v24.16b, v20.16b , v22.16b , #2
    ext       v26.16b, v20.16b , v22.16b , #8
    add       v24.8h, v24.8h , v26.8h

    saddl     v26.4s, v4.4h, v20.4h
    smlal     v26.4s, v25.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v22.4s, v4.8h, v20.8h
    smlal2    v22.4s, v25.8h, v28.8h
    smlsl2    v22.4s, v24.8h, v30.8h

    sqrshrun  v19.4h, v26.4s, #10
    sqrshrun  v25.4h, v22.4s, #10

    uaddl     v24.8h, v11.8b, v1.8b


    uqxtn     v19.8b, v19.8h
    uqxtn     v25.8b, v25.8h
    mov       v19.2s[1], v25.2s[0]



    uaddl     v22.8h, v8.8b, v2.8b
    ld1       {v4.2s, v5.2s}, [x0], #16 // Vector load from src[8_0]


    ld1       {v14.2s}, [x0], x2        // Vector load from src[8_0]
    uaddl     v20.8h, v10.8b, v0.8b
    uaddl     v26.8h, v9.8b, v3.8b
    st1       {v18.2s, v19.2s}, [x1], x3 // store row 2


//ROW_4

    uaddl     v18.8h, v6.8b, v4.8b

    mla       v18.8h, v20.8h , v28.8h

    uaddl     v20.8h, v7.8b, v5.8b

    mla       v20.8h, v24.8h , v28.8h
    uaddl     v24.8h, v17.8b, v12.8b
    mls       v18.8h, v22.8h , v30.8h
    uaddl     v22.8h, v15.8b, v14.8b
    mls       v20.8h, v26.8h , v30.8h
    uaddl     v26.8h, v16.8b, v13.8b
    mla       v22.8h, v24.8h , v28.8h
    mls       v22.8h, v26.8h , v30.8h

    ext       v24.16b, v18.16b , v20.16b , #4
    ext       v26.16b, v18.16b , v20.16b , #6

    ext       v23.16b, v18.16b , v20.16b , #10
    add       v6.8h, v24.8h , v26.8h
    ext       v24.16b, v18.16b , v20.16b , #2
    ext       v26.16b, v18.16b , v20.16b , #8
    add       v24.8h, v24.8h , v26.8h

    saddl     v26.4s, v18.4h, v23.4h
    smlal     v26.4s, v6.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v23.4s, v18.8h, v23.8h
    smlal2    v23.4s, v6.8h, v28.8h
    smlsl2    v23.4s, v24.8h, v30.8h

    sqrshrun  v18.4h, v26.4s, #10
    sqrshrun  v19.4h, v23.4s, #10

    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.2s[1], v19.2s[0]


    ext       v24.16b, v20.16b , v22.16b , #4
    ext       v26.16b, v20.16b , v22.16b , #6
    ext       v6.16b, v20.16b , v22.16b , #10

    add       v25.8h, v24.8h , v26.8h
    ext       v24.16b, v20.16b , v22.16b , #2
    ext       v26.16b, v20.16b , v22.16b , #8
    add       v24.8h, v24.8h , v26.8h

    saddl     v26.4s, v6.4h, v20.4h
    smlal     v26.4s, v25.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v22.4s, v6.8h, v20.8h
    smlal2    v22.4s, v25.8h, v28.8h
    smlsl2    v22.4s, v24.8h, v30.8h

    // Rotate the row registers so that the five most recent source rows
    // (src[4_0]..src[8_0]) end up in v0/v2/v4/v6/v8 (+ v12..v16 for the
    // trailing 8 columns) for the next pass. v24 is a temporary here.
    mov       v6.16b, v2.16b
    mov       v7.16b, v3.16b

    mov       v2.16b, v10.16b
    mov       v3.16b, v11.16b

    subs      x4, x4, #4                // ht -= 4; flags are consumed by the bgt below
    sqrshrun  v19.4h, v26.4s, #10
    sqrshrun  v25.4h, v22.4s, #10
    mov       v10.16b, v0.16b
    mov       v11.16b, v1.16b

    mov       v24.8b, v14.8b

    mov       v14.16b, v12.16b
    mov       v15.16b, v13.16b


    uqxtn     v19.8b, v19.8h
    uqxtn     v25.8b, v25.8h
    mov       v19.2s[1], v25.2s[0]



    mov       v0.16b, v8.16b
    mov       v1.16b, v9.16b

    mov       v8.16b, v4.16b
    mov       v9.16b, v5.16b

    mov       v12.16b, v16.16b
    mov       v13.16b, v17.16b

    mov       v4.16b, v10.16b
    mov       v5.16b, v11.16b

    mov       v16.8b, v24.8b
    st1       {v18.2s, v19.2s}, [x1], x3 // store row 3

    bgt       loop_16                   // loop while rows remain (4 rows per pass)
    b         end_func

loop_8_start:
    ld1       {v0.2s, v1.2s}, [x0], x2  // Vector load from src[0_0]
    ld1       {v2.2s, v3.2s}, [x0], x2  // Vector load from src[1_0]
    ld1       {v4.2s, v5.2s}, [x0], x2  // Vector load from src[2_0]
    ld1       {v6.2s, v7.2s}, [x0], x2  // Vector load from src[3_0]
    ld1       {v8.2s, v9.2s}, [x0], x2  // Vector load from src[4_0]

loop_8:

    // Coefficients: v26 = 20, v24 = 5. The vertical results of each row live
    // in a register pair (columns 0..7 / 8..15) that the horizontal pass then
    // slides over with ext.
    ld1       {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0]
    uaddl     v14.8h, v4.8b, v6.8b
    uaddl     v12.8h, v0.8b, v10.8b
    uaddl     v16.8h, v2.8b, v8.8b
    mla       v12.8h, v14.8h , v26.8h
    uaddl     v18.8h, v5.8b, v7.8b
    uaddl     v14.8h, v1.8b, v11.8b
    uaddl     v22.8h, v3.8b, v9.8b
    mla       v14.8h, v18.8h , v26.8h
    mls       v12.8h, v16.8h , v24.8h
    ld1       {v0.2s, v1.2s}, [x0], x2  // Vector load from src[6_0]
    uaddl     v16.8h, v6.8b, v8.8b
    mls       v14.8h, v22.8h , v24.8h
    uaddl     v28.8h, v2.8b, v0.8b

    ext       v22.16b, v12.16b , v14.16b , #10
    uaddl     v18.8h, v4.8b, v10.8b
    mla       v28.8h, v16.8h , v26.8h
    saddl     v30.4s, v12.4h, v22.4h

    saddl2    v22.4s, v12.8h, v22.8h
    ext       v16.16b, v12.16b , v14.16b , #4
    mls       v28.8h, v18.8h , v24.8h
    ext       v18.16b, v12.16b , v14.16b , #6
    ext       v20.16b, v12.16b , v14.16b , #8
    ext       v14.16b, v12.16b , v14.16b , #2
    add       v16.8h, v16.8h , v18.8h
    add       v18.8h, v14.8h , v20.8h
    uaddl     v20.8h, v7.8b, v9.8b
    smlal     v30.4s, v16.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    smlal2    v22.4s, v16.8h, v26.8h
    smlsl2    v22.4s, v18.8h, v24.8h
    uaddl     v14.8h, v3.8b, v1.8b

    mla       v14.8h, v20.8h , v26.8h
    sqrshrun  v12.4h, v30.4s, #10
    uaddl     v16.8h, v5.8b, v11.8b
    sqrshrun  v13.4h, v22.4s, #10
    mls       v14.8h, v16.8h , v24.8h
    ld1       {v2.2s, v3.2s}, [x0], x2  // Vector load from src[7_0]
    uqxtn     v25.8b, v12.8h            // row 0 result is parked in v25 for now
    uqxtn     v13.8b, v13.8h
    mov       v25.2s[1], v13.2s[0]
    uaddl     v16.8h, v8.8b, v10.8b


    ext       v22.16b, v28.16b , v14.16b , #10
    uaddl     v20.8h, v4.8b, v2.8b
    saddl     v30.4s, v28.4h, v22.4h
    mla       v20.8h, v16.8h , v26.8h

    saddl2    v22.4s, v28.8h, v22.8h
    ext       v16.16b, v28.16b , v14.16b , #4
    ext       v18.16b, v28.16b , v14.16b , #6
    ext       v12.16b, v28.16b , v14.16b , #8
    ext       v14.16b, v28.16b , v14.16b , #2
    add       v16.8h, v16.8h , v18.8h
    add       v18.8h, v12.8h , v14.8h

    smlal     v30.4s, v16.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    smlal2    v22.4s, v16.8h, v26.8h
    smlsl2    v22.4s, v18.8h, v24.8h


    uaddl     v18.8h, v6.8b, v0.8b
    sqrshrun  v16.4h, v30.4s, #10

    sqrshrun  v17.4h, v22.4s, #10

    mov       v12.8b, v25.8b            // row 0 result -> v12 for the store
    mov       v25.8b, v24.8b            // re-seed v25 with the coefficient value from v24

    uaddl     v28.8h, v9.8b, v11.8b
    uqxtn     v13.8b, v16.8h
    uqxtn     v17.8b, v17.8h
    mov       v13.2s[1], v17.2s[0]


    uaddl     v14.8h, v5.8b, v3.8b
    uaddl     v22.8h, v7.8b, v1.8b
    mls       v20.8h, v18.8h , v24.8h
    st1       {v12.2s}, [x1], x3        // store row 0
    mla       v14.8h, v28.8h , v26.8h
    ld1       {v4.2s, v5.2s}, [x0], x2  // Vector load from src[8_0]
    uaddl     v30.8h, v10.8b, v0.8b
    uaddl     v28.8h, v6.8b, v4.8b
    mls       v14.8h, v22.8h , v24.8h
    st1       {v13.2s}, [x1], x3        // store row 1
    mla       v28.8h, v30.8h , v26.8h

    ext       v22.16b, v20.16b , v14.16b , #10
    saddl     v30.4s, v20.4h, v22.4h

    saddl2    v22.4s, v20.8h, v22.8h
    ext       v16.16b, v20.16b , v14.16b , #4
    ext       v18.16b, v20.16b , v14.16b , #6
    ext       v12.16b, v20.16b , v14.16b , #8
    ext       v14.16b, v20.16b , v14.16b , #2
    add       v16.8h, v16.8h , v18.8h
    add       v18.8h, v14.8h , v12.8h
    uaddl     v20.8h, v8.8b, v2.8b
    smlal     v30.4s, v16.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    smlal2    v22.4s, v16.8h, v26.8h
    smlsl2    v22.4s, v18.8h, v24.8h
    uaddl     v18.8h, v11.8b, v1.8b
    uaddl     v16.8h, v7.8b, v5.8b
    sqrshrun  v12.4h, v30.4s, #10
    uaddl     v30.8h, v9.8b, v3.8b
    mla       v16.8h, v18.8h , v26.8h
    sqrshrun  v13.4h, v22.4s, #10
    mls       v28.8h, v20.8h , v24.8h

    mls       v16.8h, v30.8h , v24.8h
    uqxtn     v27.8b, v12.8h            // row 2 result is parked in v27 for now
    uqxtn     v13.8b, v13.8h
    mov       v27.2s[1], v13.2s[0]


    ext       v22.16b, v28.16b , v16.16b , #10

    saddl     v30.4s, v28.4h, v22.4h

    saddl2    v22.4s, v28.8h, v22.8h
    ext       v12.16b, v28.16b , v16.16b , #4
    ext       v18.16b, v28.16b , v16.16b , #6
    ext       v20.16b, v28.16b , v16.16b , #8
    ext       v28.16b, v28.16b , v16.16b , #2
    add       v12.8h, v12.8h , v18.8h
    add       v18.8h, v28.8h , v20.8h

    smlal     v30.4s, v12.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    smlal2    v22.4s, v12.8h, v26.8h
    smlsl2    v22.4s, v18.8h, v24.8h


    mov       v12.8b, v27.8b            // row 2 result -> v12 for the store
    mov       v27.8b, v26.8b            // re-seed v27 with the coefficient value from v26

    sqrshrun  v16.4h, v30.4s, #10

    // Rotate rows: src[4_0]..src[8_0] become the next pass's first five rows.
    mov       v6.16b, v2.16b
    mov       v7.16b, v3.16b

    sqrshrun  v17.4h, v22.4s, #10

    mov       v2.16b, v10.16b
    mov       v3.16b, v11.16b

    mov       v10.16b, v0.16b
    mov       v11.16b, v1.16b

    subs      x4, x4, #4                // ht -= 4; flags are consumed by the bgt below
    uqxtn     v13.8b, v16.8h
    uqxtn     v17.8b, v17.8h
    mov       v13.2s[1], v17.2s[0]


    mov       v0.16b, v8.16b
    mov       v1.16b, v9.16b

    mov       v8.16b, v4.16b
    mov       v9.16b, v5.16b

    mov       v4.16b, v10.16b
    mov       v5.16b, v11.16b

    st1       {v12.2s}, [x1], x3        // store row 2
    st1       {v13.2s}, [x1], x3        // store row 3

    bgt       loop_8                    // loop while rows remain (4 rows per pass)
    b         end_func

loop_4_start:
    ld1       {v0.2s, v1.2s}, [x0], x2  // Vector load from src[0_0]
    ld1       {v2.2s, v3.2s}, [x0], x2  // Vector load from src[1_0]
    ld1       {v4.2s, v5.2s}, [x0], x2  // Vector load from src[2_0]
    ld1       {v6.2s, v7.2s}, [x0], x2  // Vector load from src[3_0]
    ld1       {v8.2s, v9.2s}, [x0], x2  // Vector load from src[4_0]

loop_4:
    // Same scheme as loop_8, but only the low 4 bytes of every output row are
    // stored (st1 {vN.s}[0]).
    // NOTE(review): the accumulations into v22 in this loop read v13/v17/v19/
    // v21/v23/v29, and the values narrowed from v22 (into v13/v17) are
    // overwritten before any store. They look like leftovers of a port and do
    // not appear to influence the stored pixels - confirm before removing.
    ld1       {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0]
    uaddl     v14.8h, v4.8b, v6.8b      // temp1 = src[2_0] + src[3_0]
    uaddl     v12.8h, v0.8b, v10.8b     // temp = src[0_0] + src[5_0]
    uaddl     v16.8h, v2.8b, v8.8b      // temp2 = src[1_0] + src[4_0]
    mla       v12.8h, v14.8h , v26.8h   // temp += temp1 * 20
    uaddl     v18.8h, v5.8b, v7.8b      // temp1 = src[2_0] + src[3_0] (columns 8..15)
    uaddl     v14.8h, v1.8b, v11.8b     // temp = src[0_0] + src[5_0] (columns 8..15)
    uaddl     v22.8h, v3.8b, v9.8b      // temp2 = src[1_0] + src[4_0] (columns 8..15)
    mla       v14.8h, v18.8h , v26.8h   // temp += temp1 * 20
    mls       v12.8h, v16.8h , v24.8h   // temp -= temp2 * 5
    ld1       {v0.2s, v1.2s}, [x0], x2  // Vector load from src[6_0]
    uaddl     v16.8h, v6.8b, v8.8b
    mls       v14.8h, v22.8h , v24.8h   // temp -= temp2 * 5
    // v12 and v14 now hold the vertically filtered values
    uaddl     v28.8h, v2.8b, v0.8b

    ext       v22.16b, v12.16b , v14.16b , #10
    uaddl     v18.8h, v4.8b, v10.8b
    mla       v28.8h, v16.8h , v26.8h
    saddl     v30.4s, v12.4h, v22.4h

    saddl     v22.4s, v13.4h, v23.4h
    ext       v16.16b, v12.16b , v14.16b , #4
    mls       v28.8h, v18.8h , v24.8h
    ext       v18.16b, v12.16b , v14.16b , #6
    ext       v20.16b, v12.16b , v14.16b , #8
    ext       v14.16b, v12.16b , v14.16b , #2
    add       v16.8h, v16.8h , v18.8h
    add       v18.8h, v14.8h , v20.8h
    uaddl     v20.8h, v7.8b, v9.8b
    smlal     v30.4s, v16.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    smlal     v22.4s, v17.4h, v26.4h
    smlsl     v22.4s, v19.4h, v24.4h
    uaddl     v14.8h, v3.8b, v1.8b

    mla       v14.8h, v20.8h , v26.8h
    sqrshrun  v12.4h, v30.4s, #10
    uaddl     v16.8h, v5.8b, v11.8b
    sqrshrun  v13.4h, v22.4s, #10
    mls       v14.8h, v16.8h , v24.8h
    ld1       {v2.2s, v3.2s}, [x0], x2  // Vector load from src[7_0]
    uqxtn     v25.8b, v12.8h            // row 0 result is parked in v25 for now
    uaddl     v16.8h, v8.8b, v10.8b

    ext       v22.16b, v28.16b , v14.16b , #10
    uaddl     v20.8h, v4.8b, v2.8b
    saddl     v30.4s, v28.4h, v22.4h
    mla       v20.8h, v16.8h , v26.8h

    saddl     v22.4s, v29.4h, v23.4h
    ext       v16.16b, v28.16b , v14.16b , #4
    ext       v18.16b, v28.16b , v14.16b , #6
    ext       v12.16b, v28.16b , v14.16b , #8
    ext       v14.16b, v28.16b , v14.16b , #2
    add       v16.8h, v16.8h , v18.8h
    add       v18.8h, v12.8h , v14.8h

    smlal     v30.4s, v16.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    smlal     v22.4s, v17.4h, v26.4h
    smlsl     v22.4s, v19.4h, v24.4h


    uaddl     v18.8h, v6.8b, v0.8b
    sqrshrun  v16.4h, v30.4s, #10

    sqrshrun  v17.4h, v22.4s, #10

    mov       v12.8b, v25.8b            // row 0 result -> v12 for the store
    mov       v25.8b, v24.8b            // re-seed v25 with the coefficient value from v24

    uaddl     v28.8h, v9.8b, v11.8b
    uqxtn     v13.8b, v16.8h



    uaddl     v14.8h, v5.8b, v3.8b
    uaddl     v22.8h, v7.8b, v1.8b
    mls       v20.8h, v18.8h , v24.8h
    st1       {v12.s}[0], [x1], x3      // store row 0
    mla       v14.8h, v28.8h , v26.8h
    ld1       {v4.2s, v5.2s}, [x0], x2  // Vector load from src[8_0]
    uaddl     v30.8h, v10.8b, v0.8b
    uaddl     v28.8h, v6.8b, v4.8b
    mls       v14.8h, v22.8h , v24.8h
    st1       {v13.s}[0], [x1], x3      //store row 1
    mla       v28.8h, v30.8h , v26.8h

    ext       v22.16b, v20.16b , v14.16b , #10
    saddl     v30.4s, v20.4h, v22.4h

    saddl     v22.4s, v21.4h, v23.4h
    ext       v16.16b, v20.16b , v14.16b , #4
    ext       v18.16b, v20.16b , v14.16b , #6
    ext       v12.16b, v20.16b , v14.16b , #8
    ext       v14.16b, v20.16b , v14.16b , #2
    add       v16.8h, v16.8h , v18.8h
    add       v18.8h, v14.8h , v12.8h
    uaddl     v20.8h, v8.8b, v2.8b
    smlal     v30.4s, v16.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    smlal     v22.4s, v17.4h, v26.4h
    smlsl     v22.4s, v19.4h, v24.4h
    uaddl     v18.8h, v11.8b, v1.8b
    uaddl     v16.8h, v7.8b, v5.8b
    sqrshrun  v12.4h, v30.4s, #10
    uaddl     v30.8h, v9.8b, v3.8b
    mla       v16.8h, v18.8h , v26.8h
    sqrshrun  v13.4h, v22.4s, #10
    mls       v28.8h, v20.8h , v24.8h

    mls       v16.8h, v30.8h , v24.8h
    uqxtn     v27.8b, v12.8h            // row 2 result is parked in v27 for now

    ext       v22.16b, v28.16b , v16.16b , #10

    saddl     v30.4s, v28.4h, v22.4h

    saddl     v22.4s, v29.4h, v23.4h
    ext       v12.16b, v28.16b , v16.16b , #4
    ext       v18.16b, v28.16b , v16.16b , #6
    ext       v20.16b, v28.16b , v16.16b , #8
    ext       v28.16b, v28.16b , v16.16b , #2
    add       v12.8h, v12.8h , v18.8h
    add       v18.8h, v28.8h , v20.8h

    smlal     v30.4s, v12.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    smlal     v22.4s, v13.4h, v26.4h
    smlsl     v22.4s, v19.4h, v24.4h


    mov       v12.8b, v27.8b            // row 2 result -> v12 for the store
    mov       v27.8b, v26.8b            // re-seed v27 with the coefficient value from v26

    sqrshrun  v16.4h, v30.4s, #10

    // Rotate rows: src[4_0]..src[8_0] become the next pass's first five rows.
    mov       v6.16b, v2.16b
    mov       v7.16b, v3.16b

    sqrshrun  v17.4h, v22.4s, #10

    mov       v2.16b, v10.16b
    mov       v3.16b, v11.16b

    mov       v10.16b, v0.16b
    mov       v11.16b, v1.16b

    subs      x4, x4, #4                // ht -= 4; flags are consumed by the bgt below
    uqxtn     v13.8b, v16.8h

    mov       v0.16b, v8.16b
    mov       v1.16b, v9.16b

    mov       v8.16b, v4.16b
    mov       v9.16b, v5.16b


    mov       v4.16b, v10.16b
    mov       v5.16b, v11.16b


    st1       {v12.s}[0], [x1], x3      // store row 2
    st1       {v13.s}[0], [x1], x3      // store row 3

    bgt       loop_4                    // loop while rows remain (4 rows per pass)

end_func:
    //Restoring registers from stack
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret


