@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_inter_pred_luma_horz_w16out.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded using neon intrinsics and can be compiled using
@*  rvct
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*
@*  - ihevc_inter_pred_luma_horz_w16out()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  inter prediction luma filter for horizontal 16-bit output
@*
@* @par description:
@*  applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
@*  to the elements pointed to by 'pu1_src' and writes the result to the
@*  location pointed to by 'pi2_dst'. no downshifting or clipping is done,
@*  so the output can be used as an input for vertical filtering or
@*  weighted prediction.
@*
@* @par assumptions:
@*  the function is optimized considering the fact that width is a multiple
@*  of 4 or 8. if width is a multiple of 4, then height should be a multiple
@*  of 2; the width-8 case is optimized further.
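@*
@* @par reference behavior:
@*  a minimal c sketch of what one call computes, for readability only
@*  (assumption: this mirrors the description above and the usual 8-tap
@*  hevc luma filter, with taps applied to src[col - 3] .. src[col + 4];
@*  it is not the code path implemented below, and the loop variables are
@*  hypothetical):
@*
@*      word32 row, col, i, i4_tmp;
@*      for(row = 0; row < ht; row++)
@*      {
@*          for(col = 0; col < wd; col++)
@*          {
@*              i4_tmp = 0;
@*              for(i = 0; i < 8; i++)
@*                  i4_tmp += pi1_coeff[i] * pu1_src[col + i - 3];
@*              pi2_dst[col] = (word16)i4_tmp;  /* no shift, round or clip */
@*          }
@*          pu1_src += src_strd;
@*          pi2_dst += dst_strd;
@*      }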
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_horz_w16out(uword8 *pu1_src,
@                                       word16 *pi2_dst,
@                                       word32 src_strd,
@                                       word32 dst_strd,
@                                       word8 *pi1_coeff,
@                                       word32 ht,
@                                       word32 wd)

@r0 - free
@r1 - dst_ptr
@r2 - src_strd
@r3 - dst_strd
@r4 - src_ptr2
@r5 - inner loop counter
@r6 - dst_ptr2
@r7 - free
@r8 - dst_strd2
@r9 - src_strd1
@r10 - wd
@r11 - #1
@r12 - src_ptr1
@r14 - loop_counter

.text
.align 4
.syntax unified

.globl ihevc_inter_pred_luma_horz_w16out_a9q

.type ihevc_inter_pred_luma_horz_w16out_a9q, %function

ihevc_inter_pred_luma_horz_w16out_a9q:

    bic r14,#1 @clearing bit[0], so that it goes back to arm mode
    stmfd sp!,{r4-r12,r14} @stack stores the values of the arguments
    ldr r4,[sp,#40] @loads pi1_coeff
    ldr r7,[sp,#44] @loads ht

    vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff)
    sub r14,r7,#0 @checks for ht == 0
    vabs.s8 d2,d0 @vabs_s8(coeff)
    mov r11,#1
    @ble end_loops
    ldr r10,[sp,#48] @loads wd
    vdup.8 d24,d2[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub r12,r0,#3 @pu1_src - 3
    vdup.8 d25,d2[1] @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add r4,r12,r2 @pu1_src_tmp2_8 = pu1_src + src_strd
    vdup.8 d26,d2[2] @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
    rsb r9,r10,r2,lsl #1 @2*src_strd - wd
    vdup.8 d27,d2[3] @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
    rsb r8,r10,r3 @dst_strd - wd
    vdup.8 d28,d2[4] @coeffabs_4 = vdup_lane_u8(coeffabs, 4)

    vdup.8 d29,d2[5] @coeffabs_5 = vdup_lane_u8(coeffabs, 5)
    and r7,r14,#1 @calculating ht_residue; ht_residue = (ht & 1)
    vdup.8 d30,d2[6] @coeffabs_6 = vdup_lane_u8(coeffabs, 6)
    sub r14,r14,r7 @decrement height by ht_residue (residue rows are handled separately)
    vdup.8 d31,d2[7] @coeffabs_7 = vdup_lane_u8(coeffabs, 7)

    cmp r7,#1
    beq odd_height_decision

even_height_decision:
    mov r7,r1
    cmp r10,#4
    ble outer_loop_4

    cmp r10,#24
    moveq r10,#16
    addeq r8,#8
    addeq r9,#8

    cmp r10,#16
    bge outer_loop_16_branch

    cmp r10,#12
    addeq r8,#4
    addeq r9,#4
outer_loop_8_branch:
    b outer_loop_8

outer_loop_16_branch:
    b outer_loop_16

odd_height_decision:
    cmp r10,#24
    beq outer_loop_8_branch
    cmp r10,#12
    beq outer_loop_4
    b even_height_decision

outer_loop4_residual:
    sub r12,r0,#3 @pu1_src - 3
    mov r1,r7
    add r1,#16
    mov r10,#4
    add r12,#8
    mov r14,#16
    add r8,#4
    add r9,#4

outer_loop_4:
    add r6,r1,r3,lsl #1 @pu1_dst + dst_strd
    add r4,r12,r2 @pu1_src + src_strd

    subs r5,r10,#0 @checks wd
    ble end_inner_loop_4
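
@ the 4-wide path below filters two rows per pass: eight byte-shifted
@ vectors are loaded from each row, vzip.32 interleaves the 32-bit lanes
@ of row i and row ii, and a single set of 8-lane multiply-accumulates
@ then filters both rows at once (row i results land in d8, row ii in d9)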
inner_loop_4:
    vld1.u32 {d0},[r12],r11 @vector load pu1_src
    vld1.u32 {d1},[r12],r11
    vld1.u32 {d2},[r12],r11
    vld1.u32 {d3},[r12],r11
    vld1.u32 {d4},[r12],r11
    vld1.u32 {d5},[r12],r11
    vld1.u32 {d6},[r12],r11
    vld1.u32 {d7},[r12],r11
    @add r12,r12,#4 @increment the input pointer
    sub r12,r12,#4
    @vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
    @vext.u8 d3,d0,d1,#3 @vector extract of src[0_3]
    @vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
    @vext.u8 d5,d0,d1,#5 @vector extract of src[0_5]
    @vext.u8 d6,d0,d1,#6 @vector extract of src[0_6]
    @vext.u8 d7,d0,d1,#7 @vector extract of src[0_7]
    @vext.u8 d1,d0,d1,#1 @vector extract of src[0_1]
    vld1.u32 {d12},[r4],r11 @vector load pu1_src + src_strd
    vld1.u32 {d13},[r4],r11
    vzip.32 d0,d12 @vector zip the i iteration and ii iteration in a single register
    vld1.u32 {d14},[r4],r11
    vzip.32 d1,d13
    vld1.u32 {d15},[r4],r11
    vzip.32 d2,d14
    vld1.u32 {d16},[r4],r11
    vzip.32 d3,d15
    vld1.u32 {d17},[r4],r11
    vzip.32 d4,d16
    vld1.u32 {d18},[r4],r11
    vzip.32 d5,d17
    vld1.u32 {d19},[r4],r11
    sub r4,r4,#4
    @add r4,r4,#4 @increment the input pointer
    @vext.u8 d14,d12,d13,#2 @vector extract of src[0_2]
    @vext.u8 d15,d12,d13,#3 @vector extract of src[0_3]
    @vext.u8 d16,d12,d13,#4 @vector extract of src[0_4]
    @vext.u8 d17,d12,d13,#5 @vector extract of src[0_5]
    @vext.u8 d18,d12,d13,#6 @vector extract of src[0_6]
    @vext.u8 d19,d12,d13,#7 @vector extract of src[0_7]
    @vext.u8 d13,d12,d13,#1 @vector extract of src[0_1]

    vzip.32 d6,d18
    vzip.32 d7,d19

    vmull.u8 q4,d1,d25 @arithmetic operations for both iterations at the same time
    vmlsl.u8 q4,d0,d24
    vmlsl.u8 q4,d2,d26
    vmlal.u8 q4,d3,d27
    vmlal.u8 q4,d4,d28
    vmlsl.u8 q4,d5,d29
    vmlal.u8 q4,d6,d30
    vmlsl.u8 q4,d7,d31

    @vqrshrun.s16 d8,q4,#6 @narrow right shift and saturate the result
    vst1.64 {d8},[r1]! @store the i iteration result, which is in the lower part of the register
    vst1.64 {d9},[r6]! @store the ii iteration result, which is in the upper part of the register
    subs r5,r5,#4 @decrement the wd by 4
    bgt inner_loop_4

end_inner_loop_4:
    subs r14,r14,#2 @decrement the ht by 2
    add r12,r12,r9 @increment the input pointer by 2*src_strd-wd
    add r1,r6,r8,lsl #1 @increment the output pointer by 2*dst_strd-wd
    bgt outer_loop_4

height_residue_4:

    ldr r7,[sp,#44] @loads ht
    and r7,r7,#1 @calculating ht_residue; ht_residue = (ht & 1)
    cmp r7,#0
    @beq end_loops
    ldmfdeq sp!,{r4-r12,r15} @reload the registers from sp

outer_loop_height_residue_4:

    subs r5,r10,#0 @checks wd
    ble end_inner_loop_height_residue_4

inner_loop_height_residue_4:
    vld1.u32 {d0},[r12],r11 @vector load pu1_src
    vld1.u32 {d1},[r12],r11

    @vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
    @vext.u8 d3,d0,d1,#3 @vector extract of src[0_3]
    @vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]

    @add r12,r12,#4 @increment the input pointer
    @vext.u8 d5,d0,d1,#5 @vector extract of src[0_5]
    @vext.u8 d6,d0,d1,#6 @vector extract of src[0_6]
    @vext.u8 d7,d0,d1,#7 @vector extract of src[0_7]
    @vext.u8 d1,d0,d1,#1 @vector extract of src[0_1]
    vld1.u32 {d2},[r12],r11
    vmull.u8 q4,d1,d25 @arithmetic operations interleaved with the loads
    vld1.u32 {d3},[r12],r11
    vmlsl.u8 q4,d0,d24
    vld1.u32 {d4},[r12],r11
    vmlsl.u8 q4,d2,d26
    vld1.u32 {d5},[r12],r11
    vmlal.u8 q4,d3,d27
    vld1.u32 {d6},[r12],r11
    vmlal.u8 q4,d4,d28
    vld1.u32 {d7},[r12],r11
    vmlsl.u8 q4,d5,d29
    sub r12,r12,#4
    vmlal.u8 q4,d6,d30
    vmlsl.u8 q4,d7,d31
    subs r5,r5,#4 @decrement the wd by 4
    vst1.64 {d8},[r1]! @store the result
    bgt inner_loop_height_residue_4

end_inner_loop_height_residue_4:
    subs r7,r7,#1 @decrement the ht by 1
    rsb r9,r10,r2
    add r12,r12,r9 @increment the input pointer by src_strd-wd
    add r1,r1,r8 @increment the output pointer by dst_strd-wd
    bgt outer_loop_height_residue_4

    ldmfd sp!,{r4-r12,r15} @reload the registers from sp

outer_loop8_residual:
    sub r12,r0,#3 @pu1_src - 3
    mov r1,r7
    mov r14,#32
    add r1,#32
    add r12,#16
    mov r10,#8
    add r8,#8
    add r9,#8
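
@ the 8-wide path below also filters two rows per pass: q4 accumulates
@ the row at pu1_src and q5 the row at pu1_src + src_strd, with the vld1
@ loads of one row interleaved between the multiply-accumulates of the
@ other so that memory and alu operations overlap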
outer_loop_8:

    add r6,r1,r3,lsl #1 @pu1_dst + dst_strd
    add r4,r12,r2 @pu1_src + src_strd
    subs r5,r10,#0 @checks wd

    ble end_inner_loop_8

inner_loop_8:
    vld1.u32 {d0},[r12],r11 @vector load pu1_src
    vld1.u32 {d1},[r12],r11
    vld1.u32 {d2},[r12],r11
    vld1.u32 {d3},[r12],r11

    @vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
    @vext.u8 d3,d0,d1,#3 @vector extract of src[0_3]
    @vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
    @vext.u8 d5,d0,d1,#5 @vector extract of src[0_5]
    @vext.u8 d6,d0,d1,#6 @vector extract of src[0_6]
    @vext.u8 d7,d0,d1,#7 @vector extract of src[0_7]
    @vext.u8 d1,d0,d1,#1 @vector extract of src[0_1]
    @vext.u8 d14,d12,d13,#2
    @vext.u8 d15,d12,d13,#3 @vector extract of src[0_3]
    @vext.u8 d16,d12,d13,#4 @vector extract of src[0_4]
    @vext.u8 d17,d12,d13,#5 @vector extract of src[0_5]
    @vext.u8 d18,d12,d13,#6 @vector extract of src[0_6]
    @vext.u8 d19,d12,d13,#7 @vector extract of src[0_7]
    @vext.u8 d13,d12,d13,#1 @vector extract of src[0_1]
    vld1.u32 {d4},[r12],r11
    vmull.u8 q4,d1,d25 @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vld1.u32 {d5},[r12],r11
    vmlal.u8 q4,d3,d27 @mul_res = vmlal_u8(src[0_3], coeffabs_3)
    vld1.u32 {d6},[r12],r11
    vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    vld1.u32 {d7},[r12],r11
    vmlsl.u8 q4,d2,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)
    vld1.u32 {d12},[r4],r11 @vector load pu1_src + src_strd
    vmlal.u8 q4,d4,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)
    vld1.u32 {d13},[r4],r11
    vmlsl.u8 q4,d5,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)
    vld1.u32 {d14},[r4],r11
    vmlal.u8 q4,d6,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)
    vld1.u32 {d15},[r4],r11
    vmlsl.u8 q4,d7,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)
    vld1.u32 {d16},[r4],r11 @vector load pu1_src + src_strd

    vmull.u8 q5,d15,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)
    vld1.u32 {d17},[r4],r11
    vmlsl.u8 q5,d14,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)
    vld1.u32 {d18},[r4],r11
    vmlal.u8 q5,d16,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)
    vld1.u32 {d19},[r4],r11 @vector load pu1_src + src_strd
    vmlsl.u8 q5,d17,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)
    @vqrshrun.s16 d20,q4,#6 @right shift and saturating narrow result 1
    vmlal.u8 q5,d18,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)
    vmlsl.u8 q5,d19,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)
    vst1.16 {q4},[r1]! @store the result pu1_dst
    vmlsl.u8 q5,d12,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    vmlal.u8 q5,d13,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)

    @vqrshrun.s16 d8,q5,#6 @right shift and saturating narrow result 2
    subs r5,r5,#8 @decrement the wd loop
    vst1.16 {q5},[r6]! @store the result pu1_dst
    cmp r5,#4
    bgt inner_loop_8

end_inner_loop_8:
    subs r14,r14,#2 @decrement the ht loop
    add r12,r12,r9 @increment the src pointer by 2*src_strd-wd
    add r1,r6,r8,lsl #1 @increment the dst pointer by 2*dst_strd-wd
    bgt outer_loop_8

    ldr r10,[sp,#48] @loads wd
    cmp r10,#12

    beq outer_loop4_residual

    ldr r7,[sp,#44] @loads ht
    and r7,r7,#1
    cmp r7,#1
    beq height_residue_4

@end_loops

    ldmfd sp!,{r4-r12,r15} @reload the registers from sp
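
@ the 16-wide path below is software pipelined: the loads and the first
@ accumulator for the next 16 outputs are issued while the current pass
@ is still being accumulated and stored, and pld hints prefetch source
@ rows ahead of use. r0 and r7 are saved to the stack on entry and
@ restored in the epilog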
outer_loop_16:
    str r0,[sp,#-4]!
    str r7,[sp,#-4]!
    add r6,r1,r3,lsl #1 @pu1_dst + dst_strd
    add r4,r12,r2 @pu1_src + src_strd
    and r0,r12,#31
    sub r5,r10,#0 @checks wd
    @ble end_loops1
    pld [r12, r2, lsl #1]
    vld1.u32 {q0},[r12],r11 @vector load pu1_src
    pld [r4, r2, lsl #1]
    vld1.u32 {q1},[r12],r11
    vld1.u32 {q2},[r12],r11
    vld1.u32 {q3},[r12],r11
    vld1.u32 {q6},[r12],r11
    vmull.u8 q4,d2,d25 @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vld1.u32 {q7},[r12],r11
    vmlal.u8 q4,d6,d27 @mul_res = vmlal_u8(src[0_3], coeffabs_3)
    vld1.u32 {q8},[r12],r11
    vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    vld1.u32 {q9},[r12],r11
    vmlsl.u8 q4,d4,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)
    vmlal.u8 q4,d12,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)
    vmlsl.u8 q4,d14,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)
    vmlal.u8 q4,d16,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)
    vmlsl.u8 q4,d18,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)

inner_loop_16:

    subs r5,r5,#16
    vmull.u8 q10,d3,d25

    add r12,#8
    vmlsl.u8 q10,d1,d24

    vld1.u32 {q0},[r4],r11 @vector load pu1_src
    vmlal.u8 q10,d7,d27

    vld1.u32 {q1},[r4],r11
    vmlsl.u8 q10,d5,d26

    vld1.u32 {q2},[r4],r11
    vmlal.u8 q10,d13,d28

    vld1.u32 {q3},[r4],r11
    vmlal.u8 q10,d17,d30

    vld1.u32 {q6},[r4],r11
    vmlsl.u8 q10,d15,d29

    vld1.u32 {q7},[r4],r11
    vmlsl.u8 q10,d19,d31

    vld1.u32 {q8},[r4],r11
    vmull.u8 q5,d2,d25 @mul_res = vmull_u8(src[0_1], coeffabs_1)

    vld1.u32 {q9},[r4],r11
    vmlal.u8 q5,d6,d27 @mul_res = vmlal_u8(src[0_3], coeffabs_3)

    add r4,#8
    vmlsl.u8 q5,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    pld [r12, r2, lsl #2]
    pld [r4, r2, lsl #2]
    vst1.8 {q4},[r1]! @store the result pu1_dst
    vmlsl.u8 q5,d4,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)

    addeq r12,r12,r9 @increment the src pointer by 2*src_strd-wd
    vmlal.u8 q5,d12,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)

    addeq r4,r12,r2 @pu1_src + src_strd
    vmlsl.u8 q5,d14,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)

    @and r7,r12,#31
    vmlal.u8 q5,d16,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)

    subeq r14,r14,#2
    vmlsl.u8 q5,d18,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)

    @cmp r7,r0
    vmull.u8 q11,d3,d25

    @pld [r12, r2, lsl #2]
    vmlsl.u8 q11,d1,d24

    vst1.16 {q10},[r1]!
    vmlal.u8 q11,d7,d27

    @pld [r4, r2, lsl #2]
    vmlsl.u8 q11,d5,d26

    @mov r0,r7
    vmlal.u8 q11,d13,d28

    cmp r14,#0
    vmlal.u8 q11,d17,d30

    vst1.16 {q5},[r6]!
    vmlsl.u8 q11,d15,d29

    vmlsl.u8 q11,d19,d31

    beq epilog_16

    vld1.u32 {q0},[r12],r11 @vector load pu1_src
    vld1.u32 {q1},[r12],r11
    vld1.u32 {q2},[r12],r11
    vld1.u32 {q3},[r12],r11
    vld1.u32 {q6},[r12],r11
    vmull.u8 q4,d2,d25 @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vld1.u32 {q7},[r12],r11
    vmlal.u8 q4,d6,d27 @mul_res = vmlal_u8(src[0_3], coeffabs_3)
    vld1.u32 {q8},[r12],r11
    vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    vld1.u32 {q9},[r12],r11
    vmlsl.u8 q4,d4,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)
    vmlal.u8 q4,d12,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)
    cmp r5,#0
    vmlsl.u8 q4,d14,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)
    moveq r5,r10
    vmlal.u8 q4,d16,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)
    vst1.8 {q11},[r6]! @store the result pu1_dst
    vmlsl.u8 q4,d18,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)
    addeq r1,r6,r8,lsl #1
    addeq r6,r1,r3,lsl #1 @pu1_dst + dst_strd
    b inner_loop_16

epilog_16:
    @vqrshrun.s16 d11,q11,#6
    vst1.8 {q11},[r6]! @store the result pu1_dst

    ldr r7,[sp],#4
    ldr r0,[sp],#4
    ldr r10,[sp,#48]
    cmp r10,#24
    beq outer_loop8_residual
    add r1,r6,r8,lsl #1
    ldr r7,[sp,#44] @loads ht
    and r7,r7,#1
    cmp r7,#1
    beq height_residue_4

end_loops1:

    ldmfd sp!,{r4-r12,r15} @reload the registers from sp