// ih264_deblk_chroma_av8.s revision 1b025fff7c9d8bc5692db1a2359ea1c9e4075cd5
//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///*****************************************************************************/
///*                                                                           */
///*  File Name         : ih264_deblk_chroma_av8.s                             */
///*                                                                           */
///*  Description       : Contains function definitions for deblocking        */
///*                      interleaved (U,V) chroma edges. Functions are coded  */
///*                      in AArch64 NEON assembly, GNU assembler syntax.      */
///*                                                                           */
///*  List of Functions : ih264_deblk_chroma_vert_bs4_av8()                    */
///*                      ih264_deblk_chroma_vert_bslt4_av8()                  */
///*                      ih264_deblk_chroma_horz_bs4_av8()                    */
///*                      ih264_deblk_chroma_horz_bslt4_av8()                  */
///*                                                                           */
///*  Register usage    : Arguments are read from x0-x7 and, for the bslt4     */
///*                      variants, one stack slot. Every routine brackets its */
///*                      body with push_v_regs / pop_v_regs and a save and    */
///*                      restore of x19/x20 (x19/x20 are not otherwise        */
///*                      touched by the code in this file).                   */
///*                      NOTE(review): push_v_regs / pop_v_regs come from     */
///*                      ih264_neon_macros.s; they presumably save d8-d15     */
///*                      (64 bytes) - confirm against that file.              */
///*                                                                           */
///*  Issues / Problems : None                                                 */
///*                                                                           */
///*  Revision History  :                                                      */
///*                                                                           */
///*         DD MM YYYY   Author(s)   Changes (Describe the changes made)      */
///*         28 11 2013   Ittiam      Draft                                    */
///*****************************************************************************/


.text
.p2align 2
.include "ih264_neon_macros.s"

///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a chroma block horizontal edge when the
//*     boundary strength is set to 4 in high profile
//*
//* @par Description:
//*     This operation is described in Sec. 8.7.2.4 under the title
//*     "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
//*     Three rows above/at the edge are read with LD2 (8 interleaved U/V
//*     pairs per row, split into a U vector and a V vector), plus row q1.
//*     p0 and q0 are replaced by
//*         p0' = (2*p1 + p0 + q1 + 2) >> 2
//*         q0' = (2*q1 + q0 + p1 + 2) >> 2
//*     in every lane where |p0-q0| < alpha, |q1-q0| < beta and
//*     |p1-p0| < beta; other lanes are written back unchanged.
//*
//* @param[in] x0 - pu1_src
//*     Pointer to the src sample q0
//*
//* @param[in] x1 - src_strd
//*     Source stride
//*
//* @param[in] x2 - alpha_cb
//*     Alpha Value for the boundary in U
//*
//* @param[in] x3 - beta_cb
//*     Beta Value for the boundary in U
//*
//* @param[in] x4 - alpha_cr
//*     Alpha Value for the boundary in V
//*
//* @param[in] x5 - beta_cr
//*     Beta Value for the boundary in V
//*
//* @returns
//*     None
//*
//* @remarks
//*     Scratch registers written: x0, x4, x5, x6 and
//*     v0-v11, v14, v16-v18, v20-v22, v24, v26, v28, v31.
//*
//*******************************************************************************
//*/

    .global ih264_deblk_chroma_horz_bs4_av8

ih264_deblk_chroma_horz_bs4_av8:

    push_v_regs
    stp       x19, x20, [sp, #-16]!
    mov       x6, x5                        // x6 = beta_cr
    mov       x5, x4                        // x5 = alpha_cr (x4 is reused as store pointer)
    sub       x0, x0, x1, lsl #1            // x0 = pu1_src - 2*stride -> row p1
    ld2       {v6.8b, v7.8b}, [x0], x1      // v6 = p1 U, v7 = p1 V
    mov       x4, x0                        // x4 = address of row p0, kept for write-back
    ld2       {v4.8b, v5.8b}, [x0], x1      // v4 = p0 U, v5 = p0 V
    dup       v20.8b, w2                    // v20 low  = alpha_cb in every byte
    dup       v21.8b, w5                    // v21 low  = alpha_cr in every byte
    mov       v20.d[1], v21.d[0]            // v20 = {alpha_cb x8, alpha_cr x8}
    ld2       {v0.8b, v1.8b}, [x0], x1      // v0 = q0 U, v1 = q0 V
    uaddl     v8.8h, v6.8b, v0.8b           // v8  = p1 + q0 (U), 16 bit
    uaddl     v10.8h, v7.8b, v1.8b          // v10 = p1 + q0 (V), 16 bit
    movi      v31.8b, #2                    // constant multiplier 2
    ld2       {v2.8b, v3.8b}, [x0]          // v2 = q1 U, v3 = q1 V
    mov       v0.d[1], v1.d[0]              // v0 = {q0 U, q0 V}
    mov       v2.d[1], v3.d[0]              // v2 = {q1 U, q1 V}
    mov       v4.d[1], v5.d[0]              // v4 = {p0 U, p0 V}
    mov       v6.d[1], v7.d[0]              // v6 = {p1 U, p1 V}
    uabd      v26.16b, v6.16b, v4.16b       // v26 = |p1 - p0|
    umlal     v8.8h, v2.8b, v31.8b          // v8  = 2*q1 + q0 + p1 (U)
    umlal     v10.8h, v3.8b, v31.8b         // v10 = 2*q1 + q0 + p1 (V)
    uabd      v22.16b, v4.16b, v0.16b       // v22 = |p0 - q0|
    uabd      v24.16b, v2.16b, v0.16b       // v24 = |q1 - q0|
    uaddl     v14.8h, v4.8b, v2.8b          // v14 = p0 + q1 (U), 16 bit
    uaddl     v28.8h, v5.8b, v3.8b          // v28 = p0 + q1 (V), 16 bit
    dup       v16.8b, w3                    // v16 low  = beta_cb in every byte
    dup       v17.8b, w6                    // v17 low  = beta_cr in every byte
    mov       v16.d[1], v17.d[0]            // v16 = {beta_cb x8, beta_cr x8}
    umlal     v14.8h, v6.8b, v31.8b         // v14 = 2*p1 + p0 + q1 (U)
    umlal     v28.8h, v7.8b, v31.8b         // v28 = 2*p1 + p0 + q1 (V)
    cmhs      v18.16b, v22.16b, v20.16b     // v18 = |p0 - q0| >= alpha
    cmhs      v24.16b, v24.16b, v16.16b     // v24 = |q1 - q0| >= beta
    cmhs      v26.16b, v26.16b, v16.16b     // v26 = |p1 - p0| >= beta
    rshrn     v8.8b, v8.8h, #2              // q0' U = (2*q1 + q0 + p1 + 2) >> 2
    rshrn     v9.8b, v10.8h, #2             // q0' V = (2*q1 + q0 + p1 + 2) >> 2
    mov       v8.d[1], v9.d[0]              // v8 = {q0' U, q0' V}
    orr       v18.16b, v18.16b, v24.16b     // v18 |= (|q1 - q0| >= beta)
    rshrn     v10.8b, v14.8h, #2            // p0' U = (2*p1 + p0 + q1 + 2) >> 2
    rshrn     v11.8b, v28.8h, #2            // p0' V = (2*p1 + p0 + q1 + 2) >> 2
    mov       v10.d[1], v11.d[0]            // v10 = {p0' U, p0' V}
    orr       v18.16b, v18.16b, v26.16b     // v18 = "do not filter" mask (any threshold failed)
    bit       v10.16b, v4.16b, v18.16b      // keep original p0 where the mask is set
    bit       v8.16b, v0.16b, v18.16b       // keep original q0 where the mask is set
    mov       v11.d[0], v10.d[1]            // v11 = p0 V half, for the interleaving store
    mov       v9.d[0], v8.d[1]              // v9  = q0 V half, for the interleaving store
    st2       {v10.8b, v11.8b}, [x4], x1    // write row p0
    st2       {v8.8b, v9.8b}, [x4]          // write row q0
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret



///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a chroma block vertical edge when the
//*     boundary strength is set to 4 in high profile
//*
//* @par Description:
//*     This operation is described in Sec. 8.7.2.4 under the title
//*     "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
//*     Eight rows are processed. From each row 8 bytes starting 4 bytes
//*     before pu1_src are read as four 16-bit (U,V) pairs: p1, p0, q0, q1.
//*     alpha and beta are packed as (cr << 8) | cb so that a 16-bit
//*     broadcast lines them up with the interleaved U/V bytes.
//*     p0 and q0 are replaced by
//*         p0' = (2*p1 + p0 + q1 + 2) >> 2
//*         q0' = (2*q1 + q0 + p1 + 2) >> 2
//*     in every lane where |p0-q0| < alpha, |q1-q0| < beta and
//*     |p1-p0| < beta. All four pairs of every row are stored back.
//*
//* @param[in] x0 - pu1_src
//*     Pointer to the src sample q0
//*
//* @param[in] x1 - src_strd
//*     Source stride
//*
//* @param[in] x2 - alpha_cb
//*     Alpha Value for the boundary in U
//*
//* @param[in] x3 - beta_cb
//*     Beta Value for the boundary in U
//*
//* @param[in] x4 - alpha_cr
//*     Alpha Value for the boundary in V
//*
//* @param[in] x5 - beta_cr
//*     Beta Value for the boundary in V
//*
//* @returns
//*     None
//*
//* @remarks
//*     Scratch registers written: x0, x2, x3, x12 and
//*     v0-v8, v10, v12, v14-v16, v18-v20, v22, v24, v31.
//*
//*******************************************************************************
//*/

    .global ih264_deblk_chroma_vert_bs4_av8

ih264_deblk_chroma_vert_bs4_av8:

    push_v_regs
    stp       x19, x20, [sp, #-16]!

    sub       x0, x0, #4                    // x0 -> p1 U of row 0
    mov       x12, x0                       // x12 = same address, kept for write-back

    add       x2, x2, x4, lsl #8            // x2 = (alpha_cr << 8) + alpha_cb
    add       x3, x3, x5, lsl #8            // x3 = (beta_cr << 8) + beta_cb

    // Rows 0-3: lane i of v0/v1/v2/v3 = (U,V) pair p1/p0/q0/q1 of row i
    ld4       {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[2], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[3], [x0], x1

    // Rows 4-7: lane i of v4/v5/v6/v7 = (U,V) pair p1/p0/q0/q1 of row 4+i
    ld4       {v4.h, v5.h, v6.h, v7.h}[0], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[1], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[2], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[3], [x0], x1

    // Regroup so that (v0,v1)=p1, (v2,v3)=p0, (v4,v5)=q0, (v6,v7)=q1,
    // each as (rows 0-3, rows 4-7). v10 is the swap temporary.
    mov       v10.16b, v2.16b
    mov       v2.16b, v1.16b
    mov       v1.16b, v4.16b
    mov       v4.16b, v10.16b
    mov       v10.16b, v6.16b
    mov       v6.16b, v3.16b
    mov       v3.16b, v5.16b
    mov       v5.16b, v10.16b

    dup       v22.8h, w2                    // v22 = alpha, bytes alternate cb, cr
    dup       v24.8h, w3                    // v24 = beta,  bytes alternate cb, cr
    movi      v31.8b, #2                    // constant multiplier 2

    mov       v0.d[1], v1.d[0]              // v0 = p1, rows 0-7
    mov       v2.d[1], v3.d[0]              // v2 = p0, rows 0-7
    mov       v4.d[1], v5.d[0]              // v4 = q0, rows 0-7
    mov       v6.d[1], v7.d[0]              // v6 = q1, rows 0-7

    uabd      v8.16b, v2.16b, v4.16b        // |p0 - q0|
    uabd      v10.16b, v6.16b, v4.16b       // |q1 - q0|
    uabd      v12.16b, v0.16b, v2.16b       // |p1 - p0|
    uaddl     v14.8h, v2.8b, v6.8b          // p0 + q1, rows 0-3
    uaddl     v16.8h, v3.8b, v7.8b          // p0 + q1, rows 4-7
    cmhi      v8.16b, v22.16b, v8.16b       // |p0 - q0| < alpha ?
    cmhi      v10.16b, v24.16b, v10.16b     // |q1 - q0| < beta ?
    cmhi      v12.16b, v24.16b, v12.16b     // |p1 - p0| < beta ?
    umlal     v14.8h, v0.8b, v31.8b         // 2*p1 + p0 + q1, rows 0-3
    umlal     v16.8h, v1.8b, v31.8b         // 2*p1 + p0 + q1, rows 4-7
    uaddl     v18.8h, v0.8b, v4.8b          // p1 + q0, rows 0-3
    uaddl     v20.8h, v1.8b, v5.8b          // p1 + q0, rows 4-7
    and       v8.16b, v8.16b, v10.16b       // |p0-q0| < alpha && |q1-q0| < beta
    umlal     v18.8h, v6.8b, v31.8b         // 2*q1 + p1 + q0, rows 0-3
    umlal     v20.8h, v7.8b, v31.8b         // 2*q1 + p1 + q0, rows 4-7

    rshrn     v14.8b, v14.8h, #2            // p0' rows 0-3 = (2*p1 + p0 + q1 + 2) >> 2
    rshrn     v15.8b, v16.8h, #2            // p0' rows 4-7
    mov       v14.d[1], v15.d[0]            // v14 = p0', rows 0-7
    and       v8.16b, v8.16b, v12.16b       // ... && |p1-p0| < beta  -> filter mask
    rshrn     v18.8b, v18.8h, #2            // q0' rows 0-3 = (2*q1 + p1 + q0 + 2) >> 2
    rshrn     v19.8b, v20.8h, #2            // q0' rows 4-7
    mov       v18.d[1], v19.d[0]            // v18 = q0', rows 0-7
    bit       v2.16b, v14.16b, v8.16b       // p0 = p0' where the mask is set
    bit       v4.16b, v18.16b, v8.16b       // q0 = q0' where the mask is set

    // Split the 128-bit vectors back into (rows 0-3, rows 4-7) halves
    mov       v1.d[0], v0.d[1]
    mov       v3.d[0], v2.d[1]
    mov       v5.d[0], v4.d[1]
    mov       v7.d[0], v6.d[1]

    // Undo the regrouping: v0-v3 = p1,p0,q0,q1 of rows 0-3,
    // v4-v7 = p1,p0,q0,q1 of rows 4-7 (layout expected by ST4)
    mov       v10.16b, v1.16b
    mov       v1.16b, v2.16b
    mov       v2.16b, v4.16b
    mov       v4.16b, v10.16b
    mov       v10.16b, v3.16b
    mov       v3.16b, v6.16b
    mov       v6.16b, v5.16b
    mov       v5.16b, v10.16b

    st4       {v0.h, v1.h, v2.h, v3.h}[0], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[1], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[2], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[3], [x12], x1

    st4       {v4.h, v5.h, v6.h, v7.h}[0], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[1], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[2], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[3], [x12], x1

    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret



///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a chroma block horizontal edge for cases where the
//*     boundary strength is less than 4 in high profile
//*
//* @par Description:
//*     This operation is described in Sec. 8.7.2.3 under the title
//*     "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
//*     Rows p1, p0, q0, q1 are read with LD2 (8 interleaved U/V pairs per
//*     row). For every lane:
//*         i_macro = sat8((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
//*         delta   = min(|i_macro|, C0 + 1)
//*     where C0 is the clip-table byte selected by the boundary strength of
//*     that lane (each bs byte covers two adjacent lanes). delta is forced
//*     to 0 where bs == 0 or where any of |p0-q0| >= alpha,
//*     |q1-q0| >= beta, |p1-p0| >= beta holds. Then, with unsigned
//*     saturation, p0 += sign(i_macro)*delta and q0 -= sign(i_macro)*delta.
//*
//* @param[in] x0 - pu1_src
//*     Pointer to the src sample q0
//*
//* @param[in] x1 - src_strd
//*     Source stride
//*
//* @param[in] x2 - alpha_cb
//*     Alpha Value for the boundary in U
//*
//* @param[in] x3 - beta_cb
//*     Beta Value for the boundary in U
//*
//* @param[in] x4 - alpha_cr
//*     Alpha Value for the boundary in V
//*
//* @param[in] x5 - beta_cr
//*     Beta Value for the boundary in V
//*
//* @param[in] x6 - u4_bs
//*     Packed Boundary strength array (four bytes; byte-reversed with REV
//*     before use, so the most significant byte applies to the first lanes)
//*
//* @param[in] x7 - pu1_cliptab_cb
//*     tc0_table for U (the first 4 bytes are loaded and indexed by bs)
//*
//* @param[in] [sp, #80] after the pushes below - pu1_cliptab_cr
//*     tc0_table for V (the first 4 bytes are loaded and indexed by bs).
//*     NOTE(review): offset 80 = 16 bytes for x19/x20 plus the bytes pushed
//*     by push_v_regs, presumably 64, i.e. the caller's first stack slot -
//*     confirm against ih264_neon_macros.s.
//*
//* @returns
//*     None
//*
//* @remarks
//*     Scratch registers written: x0, x6-x9 and
//*     v0-v10, v12-v18, v20-v22, v24, v26, v28, v30.
//*
//*******************************************************************************
//*/

    .global ih264_deblk_chroma_horz_bslt4_av8

ih264_deblk_chroma_horz_bslt4_av8:

    push_v_regs
    stp       x19, x20, [sp, #-16]!
    mov       x8, x7                        // x8 = pu1_cliptab_cb
    mov       x7, x6                        // x7 = u4_bs
    ldr       x9, [sp, #80]                 // x9 = pu1_cliptab_cr (stack argument)
    sub       x0, x0, x1, lsl #1            // x0 = pu1_src - 2*stride -> row p1
    rev       w7, w7                        // byte-reverse the packed bs word
    mov       v12.s[0], w7                  // v12 bytes 0-3 = the four bs values
    ld1       {v16.s}[0], [x8]              // v16 bytes 0-3 = cliptab_cb[0..3]
    ld1       {v17.s}[0], [x9]              // v17 bytes 0-3 = cliptab_cr[0..3]
    ld2       {v6.8b, v7.8b}, [x0], x1      // v6 = p1 U, v7 = p1 V
    tbl       v14.8b, {v16.16b}, v12.8b     // bytes 0-3 = C0 for U, looked up by bs
    tbl       v28.8b, {v17.16b}, v12.8b     // bytes 0-3 = C0 for V, looked up by bs
    uxtl      v12.8h, v12.8b                // bs widened to 16 bit per value
    mov       x6, x0                        // x6 = address of row p0, kept for write-back
    ld2       {v4.8b, v5.8b}, [x0], x1      // v4 = p0 U, v5 = p0 V
    movi      v30.8b, #1                    // constant 1 (for C0 + 1)
    dup       v20.8b, w2                    // v20 low = alpha_cb in every byte
    dup       v21.8b, w4                    // v21 low = alpha_cr in every byte
    mov       v20.d[1], v21.d[0]            // v20 = {alpha_cb x8, alpha_cr x8}
    ld2       {v0.8b, v1.8b}, [x0], x1      // v0 = q0 U, v1 = q0 V
    uxtl      v14.8h, v14.8b                // C0 U widened to 16 bit
    uxtl      v28.8h, v28.8b                // C0 V widened to 16 bit
    mov       v15.d[0], v28.d[0]            // v15 low = C0 V (4 halfwords)
    mov       v14.d[1], v28.d[0]            // v14 = {C0 U x4, C0 V x4} halfwords
    ld2       {v2.8b, v3.8b}, [x0]          // v2 = q1 U, v3 = q1 V
    usubl     v10.8h, v1.8b, v5.8b          // v10 = q0 - p0 (V)
    usubl     v8.8h, v0.8b, v4.8b           // v8  = q0 - p0 (U)
    mov       v6.d[1], v7.d[0]              // v6 = {p1 U, p1 V}
    mov       v4.d[1], v5.d[0]              // v4 = {p0 U, p0 V}
    uabd      v26.16b, v6.16b, v4.16b       // v26 = |p1 - p0|
    shl       v10.8h, v10.8h, #2            // v10 = (q0 - p0) << 2 (V)
    mov       v0.d[1], v1.d[0]              // v0 = {q0 U, q0 V}
    uabd      v22.16b, v4.16b, v0.16b       // v22 = |p0 - q0|
    shl       v8.8h, v8.8h, #2              // v8 = (q0 - p0) << 2 (U)
    mov       v14.d[1], v15.d[0]            // v14 high = C0 V halfwords
    sli       v14.8h, v14.8h, #8            // copy each C0 into both bytes of its halfword
    mov       v15.d[0], v14.d[1]            // v15 low = per-lane C0 for V (8 bytes)
    mov       v2.d[1], v3.d[0]              // v2 = {q1 U, q1 V}
    uabd      v24.16b, v2.16b, v0.16b       // v24 = |q1 - q0|
    cmhs      v18.16b, v22.16b, v20.16b     // v18 = |p0 - q0| >= alpha
    usubl     v20.8h, v6.8b, v2.8b          // v20 = p1 - q1 (U)
    usubl     v6.8h, v7.8b, v3.8b           // v6  = p1 - q1 (V)
    dup       v16.8b, w3                    // v16 low = beta_cb in every byte
    dup       v17.8b, w5                    // v17 low = beta_cr in every byte
    mov       v16.d[1], v17.d[0]            // v16 = {beta_cb x8, beta_cr x8}
    add       v8.8h, v8.8h, v20.8h          // v8  = ((q0 - p0) << 2) + (p1 - q1) (U)
    add       v10.8h, v10.8h, v6.8h         // v10 = ((q0 - p0) << 2) + (p1 - q1) (V)
    cmhs      v24.16b, v24.16b, v16.16b     // v24 = |q1 - q0| >= beta
    cmgt      v12.4h, v12.4h, #0            // v12 low = (bs > 0), one halfword = two lanes
    sqrshrn   v8.8b, v8.8h, #3              // i_macro U = sat8((... + 4) >> 3)
    sqrshrn   v9.8b, v10.8h, #3             // i_macro V = sat8((... + 4) >> 3)
    mov       v8.d[1], v9.d[0]              // v8 = {i_macro U, i_macro V}
    add       v14.8b, v14.8b, v30.8b        // v14 low = C = C0 + 1 for U
    cmhs      v26.16b, v26.16b, v16.16b     // v26 = |p1 - p0| >= beta
    orr       v18.16b, v18.16b, v24.16b     // v18 |= (|q1 - q0| >= beta)
    abs       v6.16b, v8.16b                // v6 = |i_macro|
    add       v15.8b, v15.8b, v30.8b        // v15 low = C = C0 + 1 for V
    mov       v14.d[1], v15.d[0]            // v14 = {C for U, C for V}
    mov       v13.8b, v12.8b                // duplicate the bs > 0 mask ...
    mov       v12.d[1], v13.d[0]            // ... into the V half: v12 = {mask, mask}
    orr       v18.16b, v18.16b, v26.16b     // v18 = "do not filter" mask (any threshold failed)
    umin      v14.16b, v6.16b, v14.16b      // v14 = delta = min(|i_macro|, C)
    bic       v12.16b, v12.16b, v18.16b     // v12 = (bs > 0) && all thresholds passed
    cmge      v8.16b, v8.16b, #0            // v8 = (i_macro >= 0)
    and       v14.16b, v14.16b, v12.16b     // delta = 0 in lanes that must not be filtered
    uqadd     v16.16b, v4.16b, v14.16b      // v16 = p0 + delta (saturating)
    uqsub     v4.16b, v4.16b, v14.16b       // v4  = p0 - delta (saturating)
    uqadd     v18.16b, v0.16b, v14.16b      // v18 = q0 + delta (saturating)
    uqsub     v0.16b, v0.16b, v14.16b       // v0  = q0 - delta (saturating)
    bif       v16.16b, v4.16b, v8.16b       // v16 = (i_macro >= 0) ? p0 + delta : p0 - delta
    bif       v0.16b, v18.16b, v8.16b       // v0  = (i_macro >= 0) ? q0 - delta : q0 + delta
    mov       v17.d[0], v16.d[1]            // v17 = p0 V half, for the interleaving store
    mov       v1.d[0], v0.d[1]              // v1  = q0 V half, for the interleaving store
    st2       {v16.8b, v17.8b}, [x6], x1    // write row p0
    st2       {v0.8b, v1.8b}, [x6]          // write row q0

    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret




///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a chroma block vertical edge for cases where the
//*     boundary strength is less than 4 in high profile
//*
//* @par Description:
//*     This operation is described in Sec. 8.7.2.3 under the title
//*     "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
//*     Eight rows are processed. From each row 8 bytes starting 4 bytes
//*     before pu1_src are read as four 16-bit (U,V) pairs: p1, p0, q0, q1.
//*     For every lane:
//*         delta   = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3
//*         |delta| = min(|delta|, tC0 + 1)
//*     where tC0 is the clip-table byte selected by the boundary strength of
//*     that row (each bs byte covers two consecutive rows). In lanes where
//*     |p0-q0| < alpha, |q1-q0| < beta, |p1-p0| < beta and bs != 0,
//*     p0 += delta and q0 -= delta with unsigned saturation. All four pairs
//*     of every row are stored back.
//*
//* @param[in] x0 - pu1_src
//*     Pointer to the src sample q0
//*
//* @param[in] x1 - src_strd
//*     Source stride
//*
//* @param[in] x2 - alpha_cb
//*     Alpha Value for the boundary in U
//*
//* @param[in] x3 - beta_cb
//*     Beta Value for the boundary in U
//*
//* @param[in] x4 - alpha_cr
//*     Alpha Value for the boundary in V
//*
//* @param[in] x5 - beta_cr
//*     Beta Value for the boundary in V
//*
//* @param[in] x6 - u4_bs
//*     Packed Boundary strength array (four bytes; byte-reversed with REV
//*     before use, so the most significant byte applies to the first rows)
//*
//* @param[in] x7 - pu1_cliptab_cb
//*     tc0_table for U (the first 4 bytes are loaded and indexed by bs)
//*
//* @param[in] [sp, #80] after the pushes below - pu1_cliptab_cr
//*     tc0_table for V (the first 4 bytes are loaded and indexed by bs).
//*     NOTE(review): offset 80 = 16 bytes for x19/x20 plus the bytes pushed
//*     by push_v_regs, presumably 64, i.e. the caller's first stack slot -
//*     confirm against ih264_neon_macros.s.
//*
//* @returns
//*     None
//*
//* @remarks
//*     Scratch registers written: x0, x2, x3, x6, x10-x12 and
//*     v0-v8, v10, v12-v16, v18-v20, v22, v24-v26, v28, v31.
//*
//*******************************************************************************
//*/

    .global ih264_deblk_chroma_vert_bslt4_av8

ih264_deblk_chroma_vert_bslt4_av8:

    push_v_regs
    stp       x19, x20, [sp, #-16]!
    mov       x10, x7                       // x10 = pu1_cliptab_cb
    ldr       x11, [sp, #80]                // x11 = pu1_cliptab_cr (stack argument)
    sub       x0, x0, #4                    // x0 -> p1 U of row 0
    add       x2, x2, x4, lsl #8            // x2 = (alpha_cr << 8) + alpha_cb
    add       x3, x3, x5, lsl #8            // x3 = (beta_cr << 8) + beta_cb
    mov       x12, x0                       // x12 = same address, kept for write-back

    // Rows 0-3: lane i of v0/v1/v2/v3 = (U,V) pair p1/p0/q0/q1 of row i
    ld4       {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[2], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[3], [x0], x1

    // Rows 4-7: lane i of v4/v5/v6/v7 = (U,V) pair p1/p0/q0/q1 of row 4+i
    ld4       {v4.h, v5.h, v6.h, v7.h}[0], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[1], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[2], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[3], [x0], x1

    // Regroup so that (v0,v1)=p1, (v2,v3)=p0, (v4,v5)=q0, (v6,v7)=q1,
    // each as (rows 0-3, rows 4-7). v10 is the swap temporary.
    mov       v10.16b, v2.16b
    mov       v2.16b, v1.16b
    mov       v1.16b, v4.16b
    mov       v4.16b, v10.16b
    mov       v10.16b, v6.16b
    mov       v6.16b, v3.16b
    mov       v3.16b, v5.16b
    mov       v5.16b, v10.16b
    dup       v22.8h, w2                    // v22 = alpha, bytes alternate cb, cr
    mov       v2.d[1], v3.d[0]              // v2 = p0, rows 0-7
    mov       v4.d[1], v5.d[0]              // v4 = q0, rows 0-7
    uabd      v8.16b, v2.16b, v4.16b        // |p0 - q0|
    dup       v24.8h, w3                    // v24 = beta, bytes alternate cb, cr
    mov       v25.d[0], v24.d[1]            // v25 low = beta (v25 lane 0 is reloaded below)
    mov       v6.d[1], v7.d[0]              // v6 = q1, rows 0-7
    mov       v0.d[1], v1.d[0]              // v0 = p1, rows 0-7
    uabd      v10.16b, v6.16b, v4.16b       // |q1 - q0|
    uabd      v12.16b, v0.16b, v2.16b       // |p1 - p0|
    cmhi      v8.16b, v22.16b, v8.16b       // |p0 - q0| < alpha ?
    usubl     v14.8h, v0.8b, v6.8b          // p1 - q1, rows 0-3
    cmhi      v10.16b, v24.16b, v10.16b     // |q1 - q0| < beta ?
    usubl     v16.8h, v1.8b, v7.8b          // p1 - q1, rows 4-7
    cmhi      v12.16b, v24.16b, v12.16b     // |p1 - p0| < beta ?
    usubl     v18.8h, v4.8b, v2.8b          // q0 - p0, rows 0-3
    and       v8.16b, v8.16b, v10.16b       // |p0-q0| < alpha && |q1-q0| < beta
    usubl     v20.8h, v5.8b, v3.8b          // q0 - p0, rows 4-7
    movi      v28.8h, #4                    // constant multiplier 4
    ld1       {v24.s}[0], [x10]             // v24 bytes 0-3 = cliptab_cb[0..3]
    ld1       {v25.s}[0], [x11]             // v25 bytes 0-3 = cliptab_cr[0..3]
    rev       w6, w6                        // byte-reverse the packed bs word
    and       v8.16b, v8.16b, v12.16b       // ... && |p1-p0| < beta
    mov       v10.s[0], w6                  // v10 bytes 0-3 = the four bs values
    mla       v14.8h, v18.8h, v28.8h        // 4*(q0 - p0) + (p1 - q1), rows 0-3
    mla       v16.8h, v20.8h, v28.8h        // 4*(q0 - p0) + (p1 - q1), rows 4-7
    uxtl      v10.8h, v10.8b                // bs widened to 16 bit per value
    sli       v10.4h, v10.4h, #8            // each bs in two adjacent bytes: one byte per row
    tbl       v12.8b, {v24.16b}, v10.8b     // tC0 for U, one byte per row
    tbl       v13.8b, {v25.16b}, v10.8b     // tC0 for V, one byte per row
    zip1      v31.8b, v12.8b, v13.8b        // interleave (U,V) tC0 for rows 0-3
    zip2      v13.8b, v12.8b, v13.8b        // interleave (U,V) tC0 for rows 4-7
    mov       v12.8b, v31.8b
    mov       v12.d[1], v13.d[0]            // v12 = tC0 in the same lane order as p0/q0
    uxtl      v10.4s, v10.4h                // widen the doubled bs halfwords to 32 bit
    sli       v10.4s, v10.4s, #16           // each bs now fills 4 bytes = (U,V) of two rows
    movi      v24.16b, #1                   // constant 1
    add       v12.16b, v12.16b, v24.16b     // tC = tC0 + 1
    cmhs      v10.16b, v10.16b, v24.16b     // bs >= 1, i.e. bs != 0
    and       v8.16b, v8.16b, v10.16b       // v8 = final filter mask (thresholds && bs != 0)
    // At this point: v0/v2/v4/v6 = p1/p0/q0/q1, v8 = filter mask, v12 = tC
    srshr     v14.8h, v14.8h, #3            // delta rows 0-3 = (4*(q0-p0) + (p1-q1) + 4) >> 3
    srshr     v16.8h, v16.8h, #3            // delta rows 4-7
    cmgt      v18.8h, v14.8h, #0            // delta > 0, rows 0-3
    cmgt      v20.8h, v16.8h, #0            // delta > 0, rows 4-7
    xtn       v18.8b, v18.8h
    xtn       v19.8b, v20.8h
    mov       v18.d[1], v19.d[0]            // v18 = (delta > 0) as a byte mask, rows 0-7
    abs       v14.8h, v14.8h                // |delta|, rows 0-3
    abs       v16.8h, v16.8h                // |delta|, rows 4-7
    xtn       v14.8b, v14.8h
    xtn       v15.8b, v16.8h
    mov       v14.d[1], v15.d[0]            // v14 = |delta| as bytes, rows 0-7
    umin      v14.16b, v14.16b, v12.16b     // |delta| = min(|delta|, tC)
    uqadd     v20.16b, v2.16b, v14.16b      // p0 + |delta| (saturating)
    uqadd     v22.16b, v4.16b, v14.16b      // q0 + |delta| (saturating)
    uqsub     v24.16b, v2.16b, v14.16b      // p0 - |delta| (saturating)
    uqsub     v26.16b, v4.16b, v14.16b      // q0 - |delta| (saturating)
    bit       v24.16b, v20.16b, v18.16b     // v24 = p0 + delta
    bit       v22.16b, v26.16b, v18.16b     // v22 = q0 - delta
    bit       v2.16b, v24.16b, v8.16b       // p0 updated where the filter mask is set
    bit       v4.16b, v22.16b, v8.16b       // q0 updated where the filter mask is set

    // Split the 128-bit vectors back into (rows 0-3, rows 4-7) halves
    mov       v1.d[0], v0.d[1]
    mov       v3.d[0], v2.d[1]
    mov       v5.d[0], v4.d[1]
    mov       v7.d[0], v6.d[1]

    // Undo the regrouping: v0-v3 = p1,p0,q0,q1 of rows 0-3,
    // v4-v7 = p1,p0,q0,q1 of rows 4-7 (layout expected by ST4)
    mov       v10.16b, v1.16b
    mov       v1.16b, v2.16b
    mov       v2.16b, v4.16b
    mov       v4.16b, v10.16b
    mov       v10.16b, v3.16b
    mov       v3.16b, v6.16b
    mov       v6.16b, v5.16b
    mov       v5.16b, v10.16b

    st4       {v0.h, v1.h, v2.h, v3.h}[0], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[1], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[2], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[3], [x12], x1

    st4       {v4.h, v5.h, v6.h, v7.h}[0], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[1], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[2], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[3], [x12], x1

    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret