//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///*****************************************************************************/
///*                                                                           */
///*  File Name         : ih264_deblk_luma_av8.s                               */
///*                                                                           */
///*  Description       : Contains function definitions for deblocking luma    */
///*                      edge. Functions are coded in NEON assembly and can   */
///*                      be compiled using ARM RVDS.                          */
///*                                                                           */
///*  List of Functions : ih264_deblk_luma_vert_bs4_av8()                      */
///*                      ih264_deblk_luma_vert_bslt4_av8()                    */
///*                      ih264_deblk_luma_horz_bs4_av8()                      */
///*                      ih264_deblk_luma_horz_bslt4_av8()                    */
///*                                                                           */
///*  Issues / Problems : None                                                 */
///*                                                                           */
///*  Revision History  :                                                      */
///*                                                                           */
///*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
///*         28 11 2013   Ittiam          Draft                                */
///*                                                                           */
///*****************************************************************************/


.text
.p2align 2
.include "ih264_neon_macros.s"



///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a luma block horizontal edge for cases where the
//*     boundary strength is less than 4
//*
//* @par Description:
//*    This operation is described in Sec. 8.7.2.3 under the title
//*    "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
//*    Six rows of 16 pixels (p2, p1, p0, q0, q1, q2) are read around the edge
//*    and the rows p1, p0, q0, q1 are written back.
//*
//* @param[in] x0 - pu1_src
//*    Pointer to the src sample q0
//*
//* @param[in] x1 - src_strd
//*    Source stride
//*
//* @param[in] x2 - alpha
//*    Alpha Value for the boundary
//*
//* @param[in] x3 - beta
//*    Beta Value for the boundary
//*
//* @param[in] w4 - u4_bs
//*    Packed Boundary strength array (one byte per group of 4 pixels; the
//*    most significant byte applies to the first 4 pixels of the row)
//*
//* @param[in] x5 - pu1_cliptab
//*    tc0_table (only the first 4 bytes are read, indexed by bS)
//*
//* @returns
//*    None
//*
//* @remarks
//*    Writes x0, w4, x6, x7 and NEON registers. v8-v15 are used as scratch,
//*    hence the push_v_regs / pop_v_regs pair (macros from
//*    ih264_neon_macros.s; presumably they save/restore d8-d15 - confirm
//*    against that file).
//*
//*******************************************************************************
//*/

    .global ih264_deblk_luma_horz_bslt4_av8

ih264_deblk_luma_horz_bslt4_av8:

    push_v_regs

    // ---- load p2..q2 and build the per-pixel C0 / bS vectors ----
    sub       x0, x0, x1, lsl #1                // x0 = pu1_src - 2*src_strd
    sub       x0, x0, x1                        // x0 = pu1_src - 3*src_strd -> row p2
    rev       w4, w4                            // byte-reverse u4_bs: its MSB becomes byte lane 0
    ld1       {v10.8b, v11.8b}, [x0], x1        // v10/v11 = p2, x0 -> row p1
    mov       v12.s[0], w4                      // v12.s[0] = reversed u4_bs
    mov       x6, x0                            // x6 = pointer to row p1 (for write-back)
    ld1       {v8.8b, v9.8b}, [x0], x1          // v8/v9 = p1, x0 -> row p0
    mov       x7, x0                            // x7 = pointer to row p0 (for write-back)
    ld1       {v6.8b, v7.8b}, [x0], x1          // v6/v7 = p0
    uxtl      v12.8h, v12.8b                    // each bS byte widened to 16 bits
    ld1       {v0.8b, v1.8b}, [x0], x1          // v0/v1 = q0
    mov       v10.d[1], v11.d[0]                // v10 = p2 as one 16-byte vector
    mov       v8.d[1], v9.d[0]                  // v8  = p1 as one 16-byte vector
    mov       v6.d[1], v7.d[0]                  // v6  = p0 as one 16-byte vector
    uabd      v26.16b, v8.16b, v6.16b           // v26 = ABS(p1 - p0)
    ld1       {v2.8b, v3.8b}, [x0], x1          // v2/v3 = q1
    mov       v0.d[1], v1.d[0]                  // v0  = q0 as one 16-byte vector
    mov       v2.d[1], v3.d[0]                  // v2  = q1 as one 16-byte vector
    uabd      v22.16b, v6.16b, v0.16b           // v22 = ABS(p0 - q0)
    ld1       {v16.s}[0], [x5]                  // v16.s[0] = pu1_cliptab[0..3]
    uabd      v24.16b, v2.16b, v0.16b           // v24 = ABS(q1 - q0)
    ld1       {v4.8b, v5.8b}, [x0], x1          // v4/v5 = q2
    tbl       v14.8b, {v16.16b}, v12.8b         // low byte of each 16-bit lane = pu1_cliptab[bS]
    mov       v4.d[1], v5.d[0]                  // v4  = q2 as one 16-byte vector
    dup       v20.16b, w2                       // v20 = alpha in every byte
    dup       v16.16b, w3                       // v16 = beta in every byte
    uxtl      v12.4s, v12.4h                    // one 32-bit lane per bS value (4 pixels each)
    uxtl      v14.4s, v14.4h                    // same widening for the C0 values
    uabd      v28.16b, v10.16b, v6.16b          // v28 = Ap = ABS(p2 - p0)
    uabd      v30.16b, v4.16b, v0.16b           // v30 = Aq = ABS(q2 - q0)
    cmgt      v12.4s, v12.4s, #0                // v12 = (bS > 0) mask
    sli       v14.4s, v14.4s, #8                // with the #16 step below: copy the low C0 byte ...

    // ---- filter decision ----
    cmhs      v18.16b, v22.16b, v20.16b         // ABS(p0 - q0) >= Alpha
    cmhs      v24.16b, v24.16b, v16.16b         // ABS(q1 - q0) >= Beta
    cmhs      v26.16b, v26.16b, v16.16b         // ABS(p1 - p0) >= Beta
    cmhi      v20.16b, v16.16b, v28.16b         // v20 = (Ap < Beta)
    cmhi      v22.16b, v16.16b, v30.16b         // v22 = (Aq < Beta)
    sli       v14.4s, v14.4s, #16               // ... into all 4 bytes of its lane: v14 = C0 per pixel
    orr       v18.16b, v18.16b, v24.16b         // v18 = (ABS(p0 - q0) >= Alpha) | (ABS(q1 - q0) >= Beta)
    usubl     v30.8h, v1.8b, v7.8b              // (q0 - p0) H
    usubl     v24.8h, v0.8b, v6.8b              // (q0 - p0) L
    orr       v18.16b, v18.16b, v26.16b         // v18 |= (ABS(p1 - p0) >= Beta)  -> "do not filter"
    usubl     v28.8h, v8.8b, v2.8b              // (p1 - q1) L
    shl       v26.8h, v30.8h, #2                // (q0 - p0) << 2 H
    shl       v24.8h, v24.8h, #2                // (q0 - p0) << 2 L
    usubl     v30.8h, v9.8b, v3.8b              // (p1 - q1) H
    bic       v12.16b, v12.16b, v18.16b         // v12 = final filter-enable mask: (bS > 0) & ~v18

    // ---- delta for p0/q0 and clipped deltas for p1/q1 ----
    add       v24.8h, v24.8h, v28.8h            // ((q0 - p0) << 2) + (p1 - q1) L
    add       v26.8h, v26.8h, v30.8h            // ((q0 - p0) << 2) + (p1 - q1) H
    sub       v18.16b, v14.16b, v20.16b         // C0 + (Ap < Beta)   (mask is -1, so subtracting adds 1)
    urhadd    v16.16b, v6.16b, v0.16b           // v16 = (p0 + q0 + 1) >> 1
    mov       v17.d[0], v16.d[1]                // v17 = high 8 pixels of that average
    sqrshrn   v24.8b, v24.8h, #3                // i_macro L
    sqrshrn   v25.8b, v26.8h, #3                // i_macro H = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3
    mov       v24.d[1], v25.d[0]                // v24 = i_macro for all 16 pixels
    sub       v18.16b, v18.16b, v22.16b         // v18 = C = C0 + (Ap < Beta) + (Aq < Beta)
    and       v20.16b, v20.16b, v12.16b         // v20 = p1 update mask: (Ap < Beta) & filter-enable
    and       v22.16b, v22.16b, v12.16b         // v22 = q1 update mask: (Aq < Beta) & filter-enable
    abs       v26.16b, v24.16b                  // v26 = ABS(i_macro)
    uaddl     v28.8h, v17.8b, v11.8b            // p2 + ((p0 + q0 + 1) >> 1) H
    uaddl     v10.8h, v16.8b, v10.8b            // p2 + ((p0 + q0 + 1) >> 1) L
    uaddl     v30.8h, v17.8b, v5.8b             // q2 + ((p0 + q0 + 1) >> 1) H
    umin      v18.16b, v26.16b, v18.16b         // v18 = delta = min(ABS(i_macro), C)
    ushll     v26.8h, v9.8b, #1                 // (p1 << 1) H
    uaddl     v4.8h, v16.8b, v4.8b              // q2 + ((p0 + q0 + 1) >> 1) L
    ushll     v16.8h, v8.8b, #1                 // (p1 << 1) L
    and       v18.16b, v18.16b, v12.16b         // zero delta where the edge must not be filtered
    sub       v28.8h, v28.8h, v26.8h            // [p2 + avg] - (p1 << 1) H
    sub       v10.8h, v10.8h, v16.8h            // [p2 + avg] - (p1 << 1) L
    ushll     v16.8h, v2.8b, #1                 // (q1 << 1) L
    ushll     v26.8h, v3.8b, #1                 // (q1 << 1) H
    sqshrn    v29.8b, v28.8h, #1                // i_macro_p1 H
    sqshrn    v28.8b, v10.8h, #1                // i_macro_p1 L
    mov       v28.d[1], v29.d[0]                // v28 = i_macro_p1 for all 16 pixels
    sub       v4.8h, v4.8h, v16.8h              // [q2 + avg] - (q1 << 1) L
    sub       v30.8h, v30.8h, v26.8h            // [q2 + avg] - (q1 << 1) H
    neg       v26.16b, v14.16b                  // v26 = -C0
    smin      v28.16b, v28.16b, v14.16b         // min(C0, i_macro_p1)
    cmge      v24.16b, v24.16b, #0              // v24 = (i_macro >= 0) mask
    sqshrn    v31.8b, v30.8h, #1                // i_macro_q1 H
    sqshrn    v30.8b, v4.8h, #1                 // i_macro_q1 L
    mov       v30.d[1], v31.d[0]                // v30 = i_macro_q1 for all 16 pixels
    smax      v28.16b, v28.16b, v26.16b         // max(-C0, min(C0, i_macro_p1))
    uqadd     v16.16b, v6.16b, v18.16b          // p0 + delta (saturating)
    uqsub     v6.16b, v6.16b, v18.16b           // p0 - delta (saturating)
    smin      v30.16b, v30.16b, v14.16b         // min(C0, i_macro_q1)
    and       v28.16b, v20.16b, v28.16b         // keep the p1 delta only where v20 is set
    uqadd     v14.16b, v0.16b, v18.16b          // q0 + delta (saturating)
    uqsub     v0.16b, v0.16b, v18.16b           // q0 - delta (saturating)
    smax      v30.16b, v30.16b, v26.16b         // max(-C0, min(C0, i_macro_q1))
    bif       v16.16b, v6.16b, v24.16b          // p0' = (i_macro >= 0) ? (p0 + delta) : (p0 - delta)
    bif       v0.16b, v14.16b, v24.16b          // q0' = (i_macro >= 0) ? (q0 - delta) : (q0 + delta)
    add       v28.16b, v28.16b, v8.16b          // p1' = p1 + clipped p1 delta
    and       v30.16b, v22.16b, v30.16b         // keep the q1 delta only where v22 is set

    // ---- write back p1, p0, q0, q1 ----
    st1       {v16.16b}, [x7], x1               // store p0', x7 -> row q0
    add       v30.16b, v30.16b, v2.16b          // q1' = q1 + clipped q1 delta
    st1       {v0.16b}, [x7], x1                // store q0', x7 -> row q1
    st1       {v28.16b}, [x6]                   // store p1'
    st1       {v30.16b}, [x7], x1               // store q1'

    pop_v_regs
    ret



///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a luma block horizontal edge when the
//*     boundary strength is set to 4
//*
//* @par Description:
//*    This operation is described in  Sec. 8.7.2.4 under the title
//*    "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
//*
//* @param[in] x0 - pu1_src
//*    Pointer to the src sample q0
//*
//* @param[in] x1 - src_strd
//*    Source stride
//*
//* @param[in] x2 - alpha
//*    Alpha Value for the boundary
//*
//* @param[in] x3 - beta
//*    Beta Value for the boundary
//*
//* @returns
//*    None
//*
//* @remarks
//*    Reads eight rows of 16 pixels (p3..q3) and writes back p2, p1, p0, q0,
//*    q1, q2. After alpha/beta have been broadcast, x2 and x3 are reused as
//*    row pointers (p3 and p2). Writes x0, x2, x3, x12, x14 and NEON
//*    registers. v8-v15 are used as scratch, hence the push_v_regs /
//*    pop_v_regs pair (macros from ih264_neon_macros.s; presumably they
//*    save/restore d8-d15 - confirm against that file).
//*
//*******************************************************************************
//*/

    .global ih264_deblk_luma_horz_bs4_av8

ih264_deblk_luma_horz_bs4_av8:

    push_v_regs

    // ---- init: broadcast thresholds, derive row pointers ----
    dup       v0.16b, w2                        // v0 = alpha in every byte
    sub       x12, x0, x1                       // x12 -> row p0 = q0 - src_strd
    dup       v2.16b, w3                        // v2 = beta in every byte
    sub       x14, x0, x1, lsl #1               // x14 -> row p1 = q0 - src_strd*2
    sub       x2, x0, x1, lsl #2                // x2  -> row p3 = q0 - src_strd*4
    sub       x3, x14, x1                       // x3  -> row p2 = p1 - src_strd

    // ---- load q0, p0, q1, p1 ----
    ld1       {v4.8b, v5.8b}, [x0], x1          // v4/v5 = q0, x0 -> row q1
    ld1       {v6.8b, v7.8b}, [x12]             // v6/v7 = p0
    ld1       {v8.8b, v9.8b}, [x0], x1          // v8/v9 = q1, x0 -> row q2
    ld1       {v10.8b, v11.8b}, [x14]           // v10/v11 = p1
    mov       v4.d[1], v5.d[0]                  // v4  = q0 as one 16-byte vector
    mov       v6.d[1], v7.d[0]                  // v6  = p0 as one 16-byte vector
    mov       v8.d[1], v9.d[0]                  // v8  = q1 as one 16-byte vector
    mov       v10.d[1], v11.d[0]                // v10 = p1 as one 16-byte vector

    // ---- filter decision ----
    uabd      v12.16b, v4.16b, v6.16b           // v12 = ABS(p0 - q0)
    uabd      v14.16b, v8.16b, v4.16b           // v14 = ABS(q1 - q0)
    uabd      v16.16b, v10.16b, v6.16b          // v16 = ABS(p1 - p0)
    cmhs      v18.16b, v12.16b, v0.16b          // ABS(p0 - q0) >= Alpha
    cmhs      v14.16b, v14.16b, v2.16b          // ABS(q1 - q0) >= Beta
    cmhs      v16.16b, v16.16b, v2.16b          // ABS(p1 - p0) >= Beta
    movi      v20.16b, #2
    orr       v18.16b, v18.16b, v14.16b         // ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta
    ld1       {v14.8b, v15.8b}, [x0], x1        // v14/v15 = q2, x0 -> row q3
    mov       v14.d[1], v15.d[0]                // v14 = q2 as one 16-byte vector
    orr       v18.16b, v18.16b, v16.16b         // v18 = "do not filter": ... || ABS(p1 - p0) >= Beta
    usra      v20.16b, v0.16b, #2               // v20 = (Alpha >> 2) + 2
    uabd      v22.16b, v14.16b, v4.16b          // v22 = Aq = ABS(q2 - q0)
    uaddl     v24.8h, v4.8b, v6.8b              // p0+q0 L
    uaddl     v26.8h, v5.8b, v7.8b              // p0+q0 H
    cmhi      v22.16b, v2.16b, v22.16b          // Aq < Beta
    cmhi      v20.16b, v20.16b, v12.16b         // v20 = ABS(p0 - q0) < ((Alpha >> 2) + 2)

    // ---- q side: q0', q1', q2' ----
    uaddw     v28.8h, v24.8h, v8.8b             // p0+q0+q1 L
    uaddw     v30.8h, v26.8h, v9.8b             // p0+q0+q1 H
    and       v22.16b, v22.16b, v20.16b         // v22 = (Aq < Beta && ABS(p0 - q0) < ((Alpha >> 2) + 2))
    // q0' : used where v22 is TRUE
    add       v16.8h, v28.8h, v28.8h            // 2*(p0+q0+q1) L
    add       v0.8h, v30.8h, v30.8h             // 2*(p0+q0+q1) H
    uaddw     v16.8h, v16.8h, v14.8b            // 2*(p0+q0+q1)+q2 L
    uaddw     v0.8h, v0.8h, v15.8b              // 2*(p0+q0+q1)+q2 H
    uaddw     v16.8h, v16.8h, v10.8b            // 2*(p0+q0+q1)+q2+p1 L
    uaddw     v0.8h, v0.8h, v11.8b              // 2*(p0+q0+q1)+q2+p1 H
    rshrn     v12.8b, v16.8h, #3                // (2*(p0+q0+q1)+q2+p1+4) >> 3 L [q0']
    rshrn     v13.8b, v0.8h, #3                 // (2*(p0+q0+q1)+q2+p1+4) >> 3 H [q0']
    mov       v12.d[1], v13.d[0]
    // q0" : used where v22 is FALSE
    uaddl     v16.8h, v8.8b, v8.8b              // 2*q1 L
    uaddl     v0.8h, v9.8b, v9.8b               // 2*q1 H
    uaddw     v16.8h, v16.8h, v4.8b             // 2*q1+q0 L
    uaddw     v0.8h, v0.8h, v5.8b               // 2*q1+q0 H
    uaddw     v16.8h, v16.8h, v10.8b            // 2*q1+q0+p1 L
    uaddw     v0.8h, v0.8h, v11.8b              // 2*q1+q0+p1 H
    rshrn     v16.8b, v16.8h, #2                // (2*q1+q0+p1+2) >> 2 L [q0"]
    rshrn     v17.8b, v0.8h, #2                 // (2*q1+q0+p1+2) >> 2 H [q0"]
    mov       v16.d[1], v17.d[0]
    uaddw     v28.8h, v28.8h, v14.8b            // p0+q0+q1+q2 L
    uaddw     v30.8h, v30.8h, v15.8b            // p0+q0+q1+q2 H
    ld1       {v0.8b, v1.8b}, [x0], x1          // v0/v1 = q3, x0 -> row after q3
    mov       v0.d[1], v1.d[0]
    bit       v16.16b, v12.16b, v22.16b         // v16 = v22 ? q0' : q0"
    sub       x0, x0, x1, lsl #2                // x0 back to row q0
    bic       v22.16b, v22.16b, v18.16b         // v22 = q1/q2 update mask:
                                                //   !(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)
                                                //   && (Aq < Beta && ABS(p0 - q0) < ((Alpha >> 2) + 2))
    rshrn     v12.8b, v28.8h, #2                // (p0+q0+q1+q2+2) >> 2 L [q1']
    rshrn     v13.8b, v30.8h, #2                // (p0+q0+q1+q2+2) >> 2 H [q1']
    mov       v12.d[1], v13.d[0]
    bif       v4.16b, v16.16b, v18.16b          // q0 kept where v18 is set, filtered q0 elsewhere
    mov       v5.d[0], v4.d[1]
    uaddl     v16.8h, v14.8b, v0.8b             // q2+q3 L
    uaddl     v0.8h, v15.8b, v1.8b              // q2+q3 H
    add       v28.8h, v28.8h, v16.8h            // p0+q0+q1+2*q2+q3 L
    st1       {v4.8b, v5.8b}, [x0], x1          // store q0, x0 -> row q1
    add       v30.8h, v30.8h, v0.8h             // p0+q0+q1+2*q2+q3 H
    add       v28.8h, v28.8h, v16.8h            // p0+q0+q1+3*q2+2*q3 L
    add       v30.8h, v30.8h, v0.8h             // p0+q0+q1+3*q2+2*q3 H
    rshrn     v0.8b, v28.8h, #3                 // (p0+q0+q1+3*q2+2*q3+4) >> 3 L [q2']
    rshrn     v1.8b, v30.8h, #3                 // (p0+q0+q1+3*q2+2*q3+4) >> 3 H [q2']
    mov       v0.d[1], v1.d[0]
    ld1       {v30.8b, v31.8b}, [x3]            // v30/v31 = p2
    mov       v30.d[1], v31.d[0]
    bif       v12.16b, v8.16b, v22.16b          // q1' where v22 is set, original q1 elsewhere
    mov       v13.d[0], v12.d[1]

    // ---- p side: p0', p1', p2' ----
    uabd      v16.16b, v30.16b, v6.16b          // v16 = Ap = ABS(p2 - p0)
    uaddw     v24.8h, v24.8h, v10.8b            // p0+q0+p1 L
    bif       v0.16b, v14.16b, v22.16b          // q2' where v22 is set, original q2 elsewhere
    mov       v1.d[0], v0.d[1]
    uaddw     v26.8h, v26.8h, v11.8b            // p0+q0+p1 H
    st1       {v12.8b, v13.8b}, [x0], x1        // store q1, x0 -> row q2
    cmhi      v16.16b, v2.16b, v16.16b          // Ap < Beta
    add       v28.8h, v24.8h, v24.8h            // 2*(p0+q0+p1) L
    add       v4.8h, v26.8h, v26.8h             // 2*(p0+q0+p1) H
    st1       {v0.8b, v1.8b}, [x0], x1          // store q2
    and       v20.16b, v20.16b, v16.16b         // v20 = (Ap < Beta && ABS(p0 - q0) < ((Alpha >> 2) + 2))
    uaddw     v28.8h, v28.8h, v30.8b            // 2*(p0+q0+p1)+p2 L
    uaddw     v4.8h, v4.8h, v31.8b              // 2*(p0+q0+p1)+p2 H
    uaddw     v28.8h, v28.8h, v8.8b             // 2*(p0+q0+p1)+p2+q1 L
    uaddw     v4.8h, v4.8h, v9.8b               // 2*(p0+q0+p1)+p2+q1 H
    rshrn     v28.8b, v28.8h, #3                // (2*(p0+q0+p1)+p2+q1+4) >> 3 L [p0']
    rshrn     v29.8b, v4.8h, #3                 // (2*(p0+q0+p1)+p2+q1+4) >> 3 H [p0']
    mov       v28.d[1], v29.d[0]
    movi      v0.8b, #2                         // byte multiplier 2 for umlal
    movi      v1.4h, #2                         // halfword multiplier 2 for mla
    uaddl     v2.8h, v6.8b, v8.8b               // p0+q1 L   (beta in v2 is no longer needed)
    umlal     v2.8h, v10.8b, v0.8b              // 2*p1+p0+q1 L
    uaddl     v16.8h, v7.8b, v9.8b              // p0+q1 H
    umlal     v16.8h, v11.8b, v0.8b             // 2*p1+p0+q1 H
    uaddw     v12.8h, v24.8h, v30.8b            // (p0+q0+p1)+p2 L
    ld1       {v24.8b, v25.8b}, [x2]            // v24/v25 = p3
    mov       v24.d[1], v25.d[0]
    uaddw     v4.8h, v26.8h, v31.8b             // (p0+q0+p1)+p2 H
    uaddl     v8.8h, v30.8b, v24.8b             // p2+p3 L
    rshrn     v26.8b, v12.8h, #2                // ((p0+q0+p1)+p2+2) >> 2 L [p1']
    rshrn     v2.8b, v2.8h, #2                  // (2*p1+p0+q1+2) >> 2 L [p0"]
    rshrn     v27.8b, v4.8h, #2                 // ((p0+q0+p1)+p2+2) >> 2 H [p1']
    rshrn     v3.8b, v16.8h, #2                 // (2*p1+p0+q1+2) >> 2 H [p0"]
    mov       v26.d[1], v27.d[0]
    mov       v2.d[1], v3.d[0]
    uaddl     v16.8h, v31.8b, v25.8b            // p2+p3 H
    mla       v12.8h, v8.8h, v1.h[0]            // (p0+q0+p1)+3*p2+2*p3 L
    mla       v4.8h, v16.8h, v1.h[0]            // (p0+q0+p1)+3*p2+2*p3 H
    bic       v16.16b, v20.16b, v18.16b         // v16 = p1/p2 update mask:
                                                //   !(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)
    mov       v17.d[0], v16.d[1]                //   && (Ap < Beta && ABS(p0 - q0) < ((Alpha >> 2) + 2))
    bit       v2.16b, v28.16b, v20.16b          // v2 = v20 ? p0' : p0"
    mov       v3.d[0], v2.d[1]
    rshrn     v12.8b, v12.8h, #3                // ((p0+q0+p1)+3*p2+2*p3+4) >> 3 L [p2']
    rshrn     v13.8b, v4.8h, #3                 // ((p0+q0+p1)+3*p2+2*p3+4) >> 3 H [p2']
    mov       v12.d[1], v13.d[0]
    bif       v6.16b, v2.16b, v18.16b           // p0 kept where v18 is set, filtered p0 elsewhere
    bit       v10.16b, v26.16b, v16.16b         // p1' where v16 is set, original p1 elsewhere
    bit       v30.16b, v12.16b, v16.16b         // p2' where v16 is set, original p2 elsewhere
    st1       {v6.16b}, [x12]                   // store p0
    st1       {v10.16b}, [x14]                  // store p1
    st1       {v30.16b}, [x3]                   // store p2

    pop_v_regs
    ret



///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a luma block vertical edge for cases where the
//*     boundary strength is less than 4
//*
//* @par Description:
//*    This operation is described in Sec. 8.7.2.3 under the title
//*    "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
//*
//* @param[in] x0 - pu1_src
//*    Pointer to the src sample q0
//*
//* @param[in] x1 - src_strd
//*    Source stride
//*
//* @param[in] x2 - alpha
//*    Alpha Value for the boundary
//*
//* @param[in] x3 - beta
//*    Beta Value for the boundary
//*
//* @param[in] w4 - u4_bs
//*    Packed Boundary strength array (one byte per group of 4 rows; the
//*    most significant byte applies to the first 4 rows)
//*
//* @param[in] x5 - pu1_cliptab
//*    tc0_table (only the first 4 bytes are read, indexed by bS)
//*
//* @returns
//*    None
//*
//* @remarks
//*    Loads 16 rows of 8 pixels (p3 p2 p1 p0 q0 q1 q2 q3) starting at
//*    pu1_src - 4, transposes them so that each pixel position sits in its own
//*    register, filters p1/p0/q0/q1, transposes back and stores all 16 rows.
//*    Writes x0, x12, x14 and NEON registers. v8-v15 are used as scratch,
//*    hence the push_v_regs / pop_v_regs pair (macros from
//*    ih264_neon_macros.s; presumably they save/restore d8-d15 - confirm
//*    against that file).
//*
//*******************************************************************************
//*/

    .global ih264_deblk_luma_vert_bslt4_av8

ih264_deblk_luma_vert_bslt4_av8:

    push_v_regs

    sub       x0, x0, #4                        // x0 = pu1_src - 4 -> p3 of the first row
    mov       x12, x4                           // x12 = u4_bs
    mov       x14, x5                           // x14 = pu1_cliptab

    // ---- load p3:p2:p1:p0:q0:q1:q2:q3 for every row, build C0 alongside ----
    ld1       {v0.8b}, [x0], x1                 // row1
    ld1       {v2.8b}, [x0], x1                 // row2
    ld1       {v4.8b}, [x0], x1                 // row3
    rev       w12, w12                          // byte-reverse u4_bs: its MSB becomes byte lane 0
    ld1       {v6.8b}, [x0], x1                 // row4
    mov       v18.s[0], w12                     // v18.s[0] = reversed u4_bs
    ld1       {v16.s}[0], [x14]                 // v16.s[0] = pu1_cliptab[0..3]
    ld1       {v8.8b}, [x0], x1                 // row5
    uxtl      v18.8h, v18.8b                    // each bS byte widened to 16 bits
    ld1       {v10.8b}, [x0], x1                // row6
    ld1       {v12.8b}, [x0], x1                // row7
    tbl       v16.8b, {v16.16b}, v18.8b         // low byte of each 16-bit lane = pu1_cliptab[bS]
    ld1       {v14.8b}, [x0], x1                // row8
    ld1       {v1.8b}, [x0], x1                 // row9
    uxtl      v16.4s, v16.4h                    // one 32-bit lane per bS value (4 rows each)
    ld1       {v3.8b}, [x0], x1                 // row10
    ld1       {v5.8b}, [x0], x1                 // row11
    ld1       {v7.8b}, [x0], x1                 // row12
    sli       v16.4s, v16.4s, #8                // with the #16 step below: copy the low C0 byte ...
    ld1       {v9.8b}, [x0], x1                 // row13
    ld1       {v11.8b}, [x0], x1                // row14
    ld1       {v13.8b}, [x0], x1                // row15
    sli       v16.4s, v16.4s, #16               // ... into all 4 bytes of its lane: v16 = C0 per row
    ld1       {v15.8b}, [x0], x1                // row16


    // ---- two 8x8 transposes (even registers: rows 1-8, odd registers: rows 9-16) ----
    // 8-bit interleave; v21 is the temporary
    trn1      v21.8b, v0.8b, v2.8b
    trn2      v2.8b, v0.8b, v2.8b               // row1 & 2
    mov       v0.8b, v21.8b
    trn1      v21.8b, v4.8b, v6.8b
    trn2      v6.8b, v4.8b, v6.8b               // row3 & 4
    mov       v4.8b, v21.8b
    trn1      v21.8b, v8.8b, v10.8b
    trn2      v10.8b, v8.8b, v10.8b             // row5 & 6
    mov       v8.8b, v21.8b
    trn1      v21.8b, v12.8b, v14.8b
    trn2      v14.8b, v12.8b, v14.8b            // row7 & 8
    mov       v12.8b, v21.8b
    trn1      v21.8b, v1.8b, v3.8b
    trn2      v3.8b, v1.8b, v3.8b               // row9 & 10
    mov       v1.8b, v21.8b
    trn1      v21.8b, v5.8b, v7.8b
    trn2      v7.8b, v5.8b, v7.8b               // row11 & 12
    mov       v5.8b, v21.8b
    trn1      v21.8b, v9.8b, v11.8b
    trn2      v11.8b, v9.8b, v11.8b             // row13 & 14
    mov       v9.8b, v21.8b
    trn1      v21.8b, v13.8b, v15.8b
    trn2      v15.8b, v13.8b, v15.8b            // row15 & 16
    mov       v13.8b, v21.8b
    // 16-bit and 32-bit interleave
    trn1      v21.4h, v2.4h, v6.4h
    trn2      v6.4h, v2.4h, v6.4h               // row2 & 4
    mov       v2.8b, v21.8b
    trn1      v21.4h, v10.4h, v14.4h
    trn2      v14.4h, v10.4h, v14.4h            // row6 & 8
    mov       v10.8b, v21.8b
    trn1      v21.4h, v3.4h, v7.4h
    trn2      v7.4h, v3.4h, v7.4h               // row10 & 12
    mov       v3.8b, v21.8b
    trn1      v21.4h, v11.4h, v15.4h
    trn2      v15.4h, v11.4h, v15.4h            // row14 & 16
    mov       v11.8b, v21.8b
    trn1      v21.2s, v6.2s, v14.2s
    trn2      v14.2s, v6.2s, v14.2s             // row4 & 8
    mov       v6.8b, v21.8b
    trn1      v21.2s, v7.2s, v15.2s
    trn2      v15.2s, v7.2s, v15.2s             // row12 & 16
    mov       v7.8b, v21.8b
    // now v6/v7 = p0 and v14/v15 = q3
    trn1      v21.4h, v0.4h, v4.4h
    trn2      v4.4h, v0.4h, v4.4h               // row1 & 3
    mov       v0.8b, v21.8b
    trn1      v21.4h, v8.4h, v12.4h
    trn2      v12.4h, v8.4h, v12.4h             // row5 & 7
    mov       v8.8b, v21.8b
    trn1      v21.4h, v1.4h, v5.4h
    trn2      v5.4h, v1.4h, v5.4h               // row9 & 11
    mov       v1.8b, v21.8b
    trn1      v21.4h, v9.4h, v13.4h
    trn2      v13.4h, v9.4h, v13.4h             // row13 & 15
    mov       v9.8b, v21.8b
    trn1      v21.2s, v0.2s, v8.2s
    trn2      v8.2s, v0.2s, v8.2s               // row1 & 5
    mov       v0.8b, v21.8b
    trn1      v21.2s, v1.2s, v9.2s
    trn2      v9.2s, v1.2s, v9.2s               // row9 & 13
    mov       v1.8b, v21.8b
    // now v0/v1 = p3 and v8/v9 = q0
    // p0 and q0 are ready, so filtering starts while the transpose finishes
    trn1      v21.2s, v2.2s, v10.2s
    trn2      v10.2s, v2.2s, v10.2s             // row2 & 6
    mov       v2.8b, v21.8b
    mov       v6.d[1], v7.d[0]                  // v6 = p0 for all 16 rows
    mov       v8.d[1], v9.d[0]                  // v8 = q0 for all 16 rows
    urhadd    v20.16b, v6.16b, v8.16b           // v20 = (p0 + q0 + 1) >> 1
    mov       v21.d[0], v20.d[1]                // v21 = that average for rows 9-16
    trn1      v31.2s, v3.2s, v11.2s             // v31 is the temporary from here (v21 is live)
    trn2      v11.2s, v3.2s, v11.2s             // row10 & 14
    mov       v3.8b, v31.8b
    movi      v19.8b, #2                        // byte multiplier 2 for umlsl
    mov       v18.d[1], v19.d[0]
    // now v2/v3 = p2 and v10/v11 = q1
    trn1      v31.2s, v4.2s, v12.2s
    trn2      v12.2s, v4.2s, v12.2s             // row3 & 7
    mov       v4.8b, v31.8b
    uabd      v22.16b, v6.16b, v8.16b           // v22 = ABS(p0 - q0)
    trn1      v31.2s, v5.2s, v13.2s
    trn2      v13.2s, v5.2s, v13.2s             // row11 & 15
    mov       v5.8b, v31.8b
    mov       v0.d[1], v1.d[0]                  // v0  = p3 for all 16 rows
    mov       v2.d[1], v3.d[0]                  // v2  = p2
    mov       v4.d[1], v5.d[0]                  // v4  = p1
    mov       v10.d[1], v11.d[0]                // v10 = q1
    mov       v12.d[1], v13.d[0]                // v12 = q2
    mov       v14.d[1], v15.d[0]                // v14 = q3
    uaddl     v24.8h, v20.8b, v2.8b             // p2 + ((p0 + q0 + 1) >> 1) L
    // now v4/v5 = p1 and v12/v13 = q2
    uaddl     v26.8h, v21.8b, v3.8b             // p2 + ((p0 + q0 + 1) >> 1) H
    umlsl     v24.8h, v4.8b, v19.8b             // p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1) L
    umlsl     v26.8h, v5.8b, v19.8b             // p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1) H
    dup       v28.16b, w2                       // v28 = alpha in every byte
    cmhs      v22.16b, v22.16b, v28.16b         // ABS(p0 - q0) >= Alpha
    dup       v28.16b, w3                       // v28 = beta in every byte
    uabd      v30.16b, v10.16b, v8.16b          // v30 = ABS(q1 - q0)
    sqshrn    v24.8b, v24.8h, #1                // (p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1 L
    sqshrn    v25.8b, v26.8h, #1                // (p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1 H
    mov       v24.d[1], v25.d[0]                // v24 = deltap1 for all 16 rows
    cmhs      v30.16b, v30.16b, v28.16b         // ABS(q1 - q0) >= Beta
    uabd      v26.16b, v4.16b, v6.16b           // v26 = ABS(p1 - p0)

    smin      v24.16b, v24.16b, v16.16b         // min(deltap1, C0)
    orr       v22.16b, v22.16b, v30.16b         // ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta
    neg       v30.16b, v16.16b                  // v30 = -C0
    cmhs      v26.16b, v26.16b, v28.16b         // ABS(p1 - p0) >= Beta
    smax      v24.16b, v24.16b, v30.16b         // max(min(deltap1, C0), -C0)
    orr       v22.16b, v22.16b, v26.16b         // ... || ABS(p1 - p0) >= Beta
    uxtl      v26.4s, v18.4h                    // one 32-bit lane per bS value
    uaddl     v18.8h, v20.8b, v12.8b            // q2 + ((p0 + q0 + 1) >> 1) L
    cmeq      v26.4s, v26.4s, #0                // v26 = (bS == 0) mask
    usubw     v18.8h, v18.8h, v10.8b            // q2 + ((p0 + q0 + 1) >> 1) - q1 L
    uaddl     v20.8h, v21.8b, v13.8b            // q2 + ((p0 + q0 + 1) >> 1) H
    usubw     v18.8h, v18.8h, v10.8b            // q2 + ((p0 + q0 + 1) >> 1) - 2*q1 L
    usubw     v20.8h, v20.8h, v11.8b            // q2 + ((p0 + q0 + 1) >> 1) - q1 H
    orr       v26.16b, v26.16b, v22.16b         // v26 = "do not filter": (bS == 0) || any threshold failed
    usubw     v20.8h, v20.8h, v11.8b            // q2 + ((p0 + q0 + 1) >> 1) - 2*q1 H
    sqshrn    v18.8b, v18.8h, #1                // (q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1 L
    uabd      v22.16b, v2.16b, v6.16b           // v22 = Ap = ABS(p2 - p0)
    sqshrn    v19.8b, v20.8h, #1                // (q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1 H
    mov       v18.d[1], v19.d[0]                // v18 = deltaq1 for all 16 rows
    uabd      v20.16b, v12.16b, v8.16b          // v20 = Aq = ABS(q2 - q0)
    cmhi      v22.16b, v28.16b, v22.16b         // Ap < Beta
    smin      v18.16b, v18.16b, v16.16b         // min(deltaq1, C0)
    cmhi      v20.16b, v28.16b, v20.16b         // Aq < Beta
    usubl     v28.8h, v8.8b, v6.8b              // (q0 - p0) L
    smax      v18.16b, v18.16b, v30.16b         // max(min(deltaq1, C0), -C0)
    usubl     v30.8h, v9.8b, v7.8b              // (q0 - p0) H
    shl       v28.8h, v28.8h, #2                // (q0 - p0) << 2 L
    sub       v16.16b, v16.16b, v22.16b         // C0 + (Ap < Beta)   (mask is -1, so subtracting adds 1)
    shl       v30.8h, v30.8h, #2                // (q0 - p0) << 2 H
    uaddw     v28.8h, v28.8h, v4.8b             // ((q0 - p0) << 2) + p1 L
    uaddw     v30.8h, v30.8h, v5.8b             // ((q0 - p0) << 2) + p1 H
    usubw     v28.8h, v28.8h, v10.8b            // ((q0 - p0) << 2) + (p1 - q1) L
    usubw     v30.8h, v30.8h, v11.8b            // ((q0 - p0) << 2) + (p1 - q1) H
    bic       v22.16b, v22.16b, v26.16b         // final condition for p1: (Ap < Beta) & ~v26
    rshrn     v28.8b, v28.8h, #3                // delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 L
    rshrn     v29.8b, v30.8h, #3                // delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 H
    mov       v28.d[1], v29.d[0]                // v28 = delta for all 16 rows
    sub       v16.16b, v16.16b, v20.16b         // v16 = C = C0 + (Ap < Beta) + (Aq < Beta)
    bic       v20.16b, v20.16b, v26.16b         // final condition for q1: (Aq < Beta) & ~v26
    abs       v30.16b, v28.16b                  // abs(delta)
    and       v24.16b, v24.16b, v22.16b         // deltap1 kept only where p1 is to be changed
    and       v18.16b, v18.16b, v20.16b         // deltaq1 kept only where q1 is to be changed
    umin      v30.16b, v30.16b, v16.16b         // min(abs(delta), C)
    add       v4.16b, v4.16b, v24.16b           // p1 + deltap1
    add       v10.16b, v10.16b, v18.16b         // q1 + deltaq1
    mov       v5.d[0], v4.d[1]                  // split p1 back into rows 1-8 / 9-16
    mov       v11.d[0], v10.d[1]                // split q1 back into rows 1-8 / 9-16
    bic       v30.16b, v30.16b, v26.16b         // abs(delta) zeroed where v26 ("do not filter") is set
    cmge      v28.16b, v28.16b, #0              // v28 = (delta >= 0) mask
    uqsub     v22.16b, v6.16b, v30.16b          // clip(p0 - abs(delta))

    // ---- transpose back, interleaved with the p0/q0 update ----
    trn1      v21.8b, v0.8b, v2.8b
    trn2      v2.8b, v0.8b, v2.8b               // row1 & 2
    mov       v0.8b, v21.8b
    uqadd     v6.16b, v6.16b, v30.16b           // clip(p0 + abs(delta))

    trn1      v21.8b, v1.8b, v3.8b
    trn2      v3.8b, v1.8b, v3.8b               // row9 & 10
    mov       v1.8b, v21.8b
    uqadd     v24.16b, v8.16b, v30.16b          // clip(q0 + abs(delta))
    trn1      v21.8b, v12.8b, v14.8b
    trn2      v14.8b, v12.8b, v14.8b            // row7 & 8
    mov       v12.8b, v21.8b
    uqsub     v8.16b, v8.16b, v30.16b           // clip(q0 - abs(delta))
    trn1      v21.8b, v13.8b, v15.8b
    trn2      v15.8b, v13.8b, v15.8b            // row15 & 16
    mov       v13.8b, v21.8b
    bif       v6.16b, v22.16b, v28.16b          // p0' = (delta >= 0) ? p0 + abs(delta) : p0 - abs(delta)
    bif       v8.16b, v24.16b, v28.16b          // q0' = (delta >= 0) ? q0 - abs(delta) : q0 + abs(delta)
    mov       v7.d[0], v6.d[1]                  // split p0' back into rows 1-8 / 9-16
    mov       v9.d[0], v8.d[1]                  // split q0' back into rows 1-8 / 9-16
    trn1      v21.8b, v4.8b, v6.8b
    trn2      v6.8b, v4.8b, v6.8b               // row3 & 4
    mov       v4.8b, v21.8b
    trn1      v21.8b, v8.8b, v10.8b
    trn2      v10.8b, v8.8b, v10.8b             // row5 & 6
    mov       v8.8b, v21.8b
    trn1      v21.8b, v5.8b, v7.8b
    trn2      v7.8b, v5.8b, v7.8b               // row11 & 12
    mov       v5.8b, v21.8b
    trn1      v21.8b, v9.8b, v11.8b
    trn2      v11.8b, v9.8b, v11.8b             // row13 & 14
    mov       v9.8b, v21.8b
    trn1      v21.4h, v2.4h, v6.4h
    trn2      v6.4h, v2.4h, v6.4h               // row2 & 4
    mov       v2.8b, v21.8b
    trn1      v21.4h, v10.4h, v14.4h
    trn2      v14.4h, v10.4h, v14.4h            // row6 & 8
    mov       v10.8b, v21.8b
    trn1      v21.4h, v3.4h, v7.4h
    trn2      v7.4h, v3.4h, v7.4h               // row10 & 12
    mov       v3.8b, v21.8b
    trn1      v21.4h, v11.4h, v15.4h
    trn2      v15.4h, v11.4h, v15.4h            // row14 & 16
    mov       v11.8b, v21.8b
    trn1      v21.2s, v6.2s, v14.2s
    trn2      v14.2s, v6.2s, v14.2s             // row4 & 8
    mov       v6.8b, v21.8b
    trn1      v21.2s, v7.2s, v15.2s
    trn2      v15.2s, v7.2s, v15.2s             // row12 & 16
    mov       v7.8b, v21.8b
    trn1      v21.4h, v0.4h, v4.4h
    trn2      v4.4h, v0.4h, v4.4h               // row1 & 3
    mov       v0.8b, v21.8b
    trn1      v21.4h, v8.4h, v12.4h
    trn2      v12.4h, v8.4h, v12.4h             // row5 & 7
    mov       v8.8b, v21.8b
    trn1      v21.4h, v1.4h, v5.4h
    trn2      v5.4h, v1.4h, v5.4h               // row9 & 11
    mov       v1.8b, v21.8b
    trn1      v21.4h, v9.4h, v13.4h
    trn2      v13.4h, v9.4h, v13.4h             // row13 & 15
    mov       v9.8b, v21.8b
    sub       x0, x0, x1, lsl #4                // x0 back to row1 (16 rows were loaded)
    trn1      v21.2s, v0.2s, v8.2s
    trn2      v8.2s, v0.2s, v8.2s               // row1 & 5
    mov       v0.8b, v21.8b
    trn1      v21.2s, v1.2s, v9.2s
    trn2      v9.2s, v1.2s, v9.2s               // row9 & 13
    mov       v1.8b, v21.8b
    trn1      v21.2s, v2.2s, v10.2s
    trn2      v10.2s, v2.2s, v10.2s             // row2 & 6
    mov       v2.8b, v21.8b
    trn1      v21.2s, v3.2s, v11.2s
    trn2      v11.2s, v3.2s, v11.2s             // row10 & 14
    mov       v3.8b, v21.8b
    trn1      v21.2s, v4.2s, v12.2s
    trn2      v12.2s, v4.2s, v12.2s             // row3 & 7
    mov       v4.8b, v21.8b
    trn1      v21.2s, v5.2s, v13.2s
    trn2      v13.2s, v5.2s, v13.2s             // row11 & 15
    mov       v5.8b, v21.8b

    // ---- store all 16 rows ----
    st1       {v0.8b}, [x0], x1                 // row1
    st1       {v2.8b}, [x0], x1                 // row2
    st1       {v4.8b}, [x0], x1                 // row3
    st1       {v6.8b}, [x0], x1                 // row4
    st1       {v8.8b}, [x0], x1                 // row5
    st1       {v10.8b}, [x0], x1                // row6
    st1       {v12.8b}, [x0], x1                // row7
    st1       {v14.8b}, [x0], x1                // row8
    st1       {v1.8b}, [x0], x1                 // row9
    st1       {v3.8b}, [x0], x1                 // row10
    st1       {v5.8b}, [x0], x1                 // row11
    st1       {v7.8b}, [x0], x1                 // row12
    st1       {v9.8b}, [x0], x1                 // row13
    st1       {v11.8b}, [x0], x1                // row14
    st1       {v13.8b}, [x0], x1                // row15
    st1       {v15.8b}, [x0], x1                // row16

    pop_v_regs
    ret



///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a luma block vertical edge when the
//*     boundary strength is set to 4
//*
//* @par
Description: 740//* This operation is described in Sec. 8.7.2.4 under the title 741//* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. 742//* 743//* @param[in] x0 - pu1_src 744//* Pointer to the src sample q0 745//* 746//* @param[in] x1 - src_strd 747//* Source stride 748//* 749//* @param[in] x2 - alpha 750//* Alpha Value for the boundary 751//* 752//* @param[in] x3 - beta 753//* Beta Value for the boundary 754//* 755//* @returns 756//* None 757//* 758//* @remarks 759//* None 760//* 761//******************************************************************************* 762//*/ 763 764 .global ih264_deblk_luma_vert_bs4_av8 765 766ih264_deblk_luma_vert_bs4_av8: 767 768 // STMFD sp!,{x12,x14} 769 push_v_regs 770 stp x19, x20, [sp, #-16]! 771 772 sub x0, x0, #4 //pointer uc_edgePixel-4 773 mov x17, x0 774 //loading p3:p2:p1:p0:q0:q1:q2:q3 for every row 775 ld1 {v0.8b}, [x0], x1 //row1 776 ld1 {v2.8b}, [x0], x1 //row2 777 ld1 {v4.8b}, [x0], x1 //row3 778 ld1 {v6.8b}, [x0], x1 //row4 779 ld1 {v8.8b}, [x0], x1 //row5 780 ld1 {v10.8b}, [x0], x1 //row6 781 ld1 {v12.8b}, [x0], x1 //row7 782 ld1 {v14.8b}, [x0], x1 //row8 783 ld1 {v1.8b}, [x0], x1 //row9 784 ld1 {v3.8b}, [x0], x1 //row10 785 ld1 {v5.8b}, [x0], x1 //row11 786 ld1 {v7.8b}, [x0], x1 //row12 787 ld1 {v9.8b}, [x0], x1 //row13 788 ld1 {v11.8b}, [x0], x1 //row14 789 ld1 {v13.8b}, [x0], x1 //row15 790 ld1 {v15.8b}, [x0], x1 //row16 791 792 //taking two 8x8 transposes 793 //2X2 transposes 794 trn1 v21.8b, v0.8b, v2.8b 795 trn2 v2.8b, v0.8b, v2.8b //row1 &2 796 mov v0.8b, v21.8b 797 trn1 v21.8b, v4.8b, v6.8b 798 trn2 v6.8b, v4.8b, v6.8b //row3&row4 799 mov v4.8b, v21.8b 800 trn1 v21.8b, v8.8b, v10.8b 801 trn2 v10.8b, v8.8b, v10.8b //row5&6 802 mov v8.8b, v21.8b 803 trn1 v21.8b, v12.8b, v14.8b 804 trn2 v14.8b, v12.8b, v14.8b //row7 & 8 805 mov v12.8b, v21.8b 806 trn1 v21.8b, v1.8b, v3.8b 807 trn2 v3.8b, v1.8b, v3.8b //row9 &10 808 mov v1.8b , v21.8b 809 trn1 v21.8b, v5.8b, v7.8b 810 trn2 v7.8b, v5.8b, 
v7.8b //row11 & 12 811 mov v5.8b , v21.8b 812 trn1 v21.8b, v9.8b, v11.8b 813 trn2 v11.8b, v9.8b, v11.8b //row13 &14 814 mov v9.8b , v21.8b 815 trn1 v21.8b, v13.8b, v15.8b 816 trn2 v15.8b, v13.8b, v15.8b //row15 & 16 817 mov v13.8b , v21.8b 818 //4x4 transposes 819 trn1 v21.4h, v2.4h, v6.4h 820 trn2 v6.4h, v2.4h, v6.4h //row2 & row4 821 mov v2.8b, v21.8b 822 trn1 v21.4h, v10.4h, v14.4h 823 trn2 v14.4h, v10.4h, v14.4h //row6 & row8 824 mov v10.8b , v21.8b 825 trn1 v21.4h, v3.4h, v7.4h 826 trn2 v7.4h, v3.4h, v7.4h //row10 & 12 827 mov v3.8b, v21.8b 828 trn1 v21.4h, v11.4h, v15.4h 829 trn2 v15.4h, v11.4h, v15.4h //row14 & row16 830 mov v11.8b, v21.8b 831 trn1 v21.2s, v6.2s, v14.2s 832 trn2 v14.2s, v6.2s, v14.2s //row4 & 8 833 mov v6.8b, v21.8b 834 trn1 v21.2s, v7.2s, v15.2s 835 trn2 v15.2s, v7.2s, v15.2s //row 12 & 16 836 mov v7.8b, v21.8b 837 //now Q3 ->p0 and Q7->q3 838 trn1 v21.4h, v0.4h, v4.4h 839 trn2 v4.4h, v0.4h, v4.4h //row1 & 3 840 mov v0.8b , v21.8b 841 trn1 v21.4h, v8.4h, v12.4h 842 trn2 v12.4h, v8.4h, v12.4h //row 5 & 7 843 mov v8.8b, v21.8b 844 trn1 v21.4h, v1.4h, v5.4h 845 trn2 v5.4h, v1.4h, v5.4h //row9 & row11 846 mov v1.8b, v21.8b 847 trn1 v21.4h, v9.4h, v13.4h 848 trn2 v13.4h, v9.4h, v13.4h //row13 & row15 849 mov v9.8b , v21.8b 850 trn1 v21.2s, v0.2s, v8.2s 851 trn2 v8.2s, v0.2s, v8.2s //row1 & row5 852 mov v0.8b, v21.8b 853 trn1 v21.2s, v1.2s, v9.2s 854 trn2 v9.2s, v1.2s, v9.2s //row9 & 13 855 mov v1.8b, v21.8b 856 //now Q0->p3 & Q4->q0 857 //starting processing as p0 and q0 are now ready 858 //now Q1->p2 & Q5->q1 859 mov v31.d[0], v14.d[0] 860 mov v31.d[1], v15.d[0] 861 trn1 v21.2s, v4.2s, v12.2s 862 trn2 v12.2s, v4.2s, v12.2s //row3 & 7 863 mov v4.8b, v21.8b 864 movi v28.8h, #2 865 trn1 v21.2s, v5.2s, v13.2s 866 trn2 v13.2s, v5.2s, v13.2s //row11 & row15 867 mov v5.8b, v21.8b 868 uaddl v16.8h, v6.8b, v8.8b //p0+q0 L 869 trn1 v21.2s, v2.2s, v10.2s 870 trn2 v10.2s, v2.2s, v10.2s //row2 &6 871 mov v2.8b, v21.8b 872 uaddl v18.8h, v7.8b, v9.8b //p0+q0 
H 873 trn1 v21.2s, v3.2s, v11.2s 874 trn2 v11.2s, v3.2s, v11.2s //row10&row14 875 mov v3.8b, v21.8b 876 uaddw v20.8h, v16.8h , v4.8b //p0+q0+p1 L 877 uaddw v22.8h, v18.8h , v5.8b //p0+q0+p1 H 878 uaddl v24.8h, v2.8b, v10.8b //p2+q1 L 879 uaddl v26.8h, v3.8b, v11.8b //p2+q1 H 880 mla v24.8h, v20.8h , v28.8h //p2 + X2(p1) + X2(p0) + X2(q0) + q1 L 881 mla v26.8h, v22.8h , v28.8h //p2 + X2(p1) + X2(p0) + X2(q0) + q1 H 882 movi v28.16b, #2 883 uaddw v16.8h, v20.8h , v2.8b //p0+q0+p1+p2 L 884 uaddw v18.8h, v22.8h , v3.8b //p0+q0+p1+p2 H 885 dup v30.16b, w2 //duplicate alpha 886 rshrn v20.8b, v16.8h, #2 //(p2 + p1 + p0 + q0 + 2) >> 2)L p1' 887 rshrn v21.8b, v18.8h, #2 //(p2 + p1 + p0 + q0 + 2) >> 2)H p1' 888 mov v20.d[1] , v21.d[0] 889 mov v0.d[1] , v1.d[0] 890 mov v2.d[1] , v3.d[0] 891 mov v4.d[1] , v5.d[0] 892 mov v6.d[1] , v7.d[0] 893 mov v8.d[1] , v9.d[0] 894 mov v10.d[1] , v11.d[0] 895 mov v12.d[1] , v13.d[0] 896 mov v14.d[1] , v15.d[0] 897 uabd v22.16b , v6.16b, v8.16b 898 usra v28.16b, v30.16b, #2 //alpha >>2 +2 899 uabd v30.16b , v2.16b, v6.16b 900 rshrn v24.8b, v24.8h, #3 //((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) L p0' 901 rshrn v25.8b, v26.8h, #3 //((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) H p0' 902 mov v24.d[1] , v25.d[0] 903 dup v26.16b, w3 //beta 904 cmhi v28.16b, v28.16b , v22.16b //ABS(p0 - q0) <((Alpha >>2) + 2) 905 uaddl v22.8h, v6.8b, v10.8b //p0+q1 L 906 cmhi v14.16b, v26.16b , v30.16b //beta>Ap 907 uaddl v30.8h, v7.8b, v11.8b //p0+q1 H 908 uaddw v22.8h, v22.8h , v4.8b //p0+q1+p1 L 909 uaddw v30.8h, v30.8h , v5.8b //p0+q1+p1 H 910 uaddw v22.8h, v22.8h , v4.8b //p0+q1+2*p1 L 911 uaddw v30.8h, v30.8h , v5.8b //p0+q1+2*p1 H 912 and v14.16b, v14.16b , v28.16b //(Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2) 913 rshrn v22.8b, v22.8h, #2 //((X2(p1) + p0 + q1 + 2) >> 2) L p0" 914 rshrn v23.8b, v30.8h, #2 //((X2(p1) + p0 + q1 + 2) >> 2) H p0" 915 mov v22.d[1] , v23.d[0] 916 uaddl v30.8h, v2.8b, v0.8b //p2+p3 L 917 bif v24.16b, v22.16b , v14.16b 
//p0' or p0 " 918 uaddl v22.8h, v3.8b, v1.8b //p2+p3 H 919 add v30.8h, v30.8h , v30.8h //2*(p2+p3) L 920 add v22.8h, v22.8h , v22.8h //2*(p2+p3)H 921 add v16.8h, v16.8h , v30.8h //(X2(p3) + X3(p2) + p1 + p0 + q0) L 922 add v18.8h, v18.8h , v22.8h //(X2(p3) + X3(p2) + p1 + p0 + q0) H 923 uabd v30.16b , v12.16b, v8.16b 924 uabd v22.16b , v10.16b, v8.16b 925 rshrn v16.8b, v16.8h, #3 //((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); L p2' 926 rshrn v17.8b, v18.8h, #3 //((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); H p2' 927 mov v16.d[1] , v17.d[0] 928 uabd v18.16b , v4.16b, v6.16b 929 cmhi v30.16b, v26.16b , v30.16b //Aq < Beta 930 cmhs v22.16b, v22.16b, v26.16b 931 cmhs v18.16b, v18.16b, v26.16b 932 dup v26.16b, w2 //duplicate alpha 933 and v30.16b, v30.16b , v28.16b //(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) 934 uabd v28.16b , v6.16b, v8.16b 935 orr v22.16b, v22.16b , v18.16b //ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta 936 uaddl v18.8h, v6.8b, v8.8b //p0+q0 L 937 cmhs v28.16b, v28.16b, v26.16b 938 uaddl v26.8h, v7.8b, v9.8b //p0+q0 H 939 uaddw v18.8h, v18.8h , v10.8b //p0+q0+q1 L 940 orr v22.16b, v22.16b , v28.16b //ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta||ABS(p0 - q0) >= Alpha 941 uaddw v26.8h, v26.8h , v11.8b //p0+q0+q1 H 942 bic v14.16b, v14.16b , v22.16b //final condn for p's 943 movi v28.16b, #2 944 bif v6.16b, v24.16b , v22.16b //final p0 945 bit v2.16b, v16.16b , v14.16b //final p2 946 bif v20.16b, v4.16b , v14.16b //final p1 947 mov v7.d[0] , v6.d[1] 948 mov v3.d[0] , v2.d[1] 949 mov v21.d[0] , v20.d[1] 950 uaddl v24.8h, v8.8b, v4.8b //q0+p1 L 951 umlal v24.8h, v10.8b, v28.8b //X2(q1) + q0 + p1 L 952 uaddl v16.8h, v9.8b, v5.8b //q0+p1 H 953 umlal v16.8h, v11.8b, v28.8b //X2(q1) + q0 + p1 H 954 movi v28.8h, #2 955 uaddl v14.8h, v4.8b, v12.8b //p1+q2 L 956 mla v14.8h, v18.8h , v28.8h //p1 + X2(p0) + X2(q0) + X2(q1) + q2L 957 uaddl v4.8h, v5.8b, v13.8b //p1+q2H 958 mla v4.8h, v26.8h , v28.8h //p1 + X2(p0) + X2(q0) + X2(q1) + q2H 959 rshrn v24.8b, 
v24.8h, #2 //(X2(q1) + q0 + p1 + 2) >> 2; L q0' 960 rshrn v25.8b, v16.8h, #2 //(X2(q1) + q0 + p1 + 2) >> 2; H q0' 961 mov v24.d[1] , v25.d[0] 962 uaddw v18.8h, v18.8h , v12.8b //p0 + q0 + q1 + q2 L 963 uaddw v26.8h, v26.8h , v13.8b //p0 + q0 + q1 + q2 H 964 rshrn v16.8b, v14.8h, #3 //(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 L qo" 965 mov v14.16b, v31.16b 966 rshrn v17.8b, v4.8h, #3 //(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 H qo" 967 mov v16.d[1] , v17.d[0] 968 rshrn v4.8b, v18.8h, #2 //p0 + q0 + q1 + q2 + 2)>>2 L q1' 969 rshrn v5.8b, v26.8h, #2 //p0 + q0 + q1 + q2 + 2)>>2 H q1' 970 mov v4.d[1] , v5.d[0] 971 bit v24.16b, v16.16b , v30.16b //q0' or q0" 972 bic v30.16b, v30.16b , v22.16b //final condn for q's 973 trn1 v31.8b, v0.8b, v2.8b 974 trn2 v2.8b, v0.8b, v2.8b //row1 &2 975 mov v0.8b, v31.8b 976 bit v10.16b, v4.16b , v30.16b 977 mov v11.d[0] , v10.d[1] 978 mov v25.d[0] , v24.d[1] 979 mov v31.d[0] , v30.d[1] 980 trn1 v31.8b, v1.8b, v3.8b 981 trn2 v3.8b, v1.8b, v3.8b //row9 &10 982 mov v1.8b, v31.8b 983 uaddl v16.8h, v12.8b, v14.8b //q2+q3 L 984 trn1 v31.8b, v20.8b, v6.8b 985 trn2 v6.8b, v20.8b, v6.8b //row3&row4 986 mov v20.8b , v31.8b 987 uaddl v4.8h, v13.8b, v15.8b //q2+q3 H 988 trn1 v31.8b, v21.8b, v7.8b 989 trn2 v7.8b, v21.8b, v7.8b //row11 & 12 990 mov v21.8b , v31.8b 991 mla v18.8h, v16.8h , v28.8h //X2(q3) + X3(q2) + q1 + q0 + p0 L 992 trn1 v31.4h, v2.4h, v6.4h 993 trn2 v6.4h, v2.4h, v6.4h //row2 & row4 994 mov v2.8b, v31.8b 995 mla v26.8h, v4.8h , v28.8h //X2(q3) + X3(q2) + q1 + q0 + p0 H 996 trn1 v31.4h, v3.4h, v7.4h 997 trn2 v7.4h, v3.4h, v7.4h //row10 & 12 998 mov v3.8b , v31.8b 999 bif v8.16b, v24.16b , v22.16b //final q0 1000 mov v9.d[0] , v8.d[1] 1001 trn1 v31.4h, v0.4h, v20.4h 1002 trn2 v20.4h, v0.4h, v20.4h //row1 & 3 1003 mov v0.8b , v31.8b 1004 rshrn v18.8b, v18.8h, #3 //(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; L 1005 trn1 v31.4h, v1.4h, v21.4h 1006 trn2 v21.4h, v1.4h, v21.4h //row9 & row11 1007 mov v1.8b, v31.8b 1008 rshrn v19.8b, 
v26.8h, #3 //(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; H 1009 mov v18.d[1] , v19.d[0] 1010 trn1 v31.8b, v8.8b, v10.8b 1011 trn2 v10.8b, v8.8b, v10.8b //row5&6 1012 mov v8.8b, v31.8b 1013 bit v12.16b, v18.16b , v30.16b //final q2 1014 mov v13.d[0] , v12.d[1] 1015 trn1 v31.8b, v9.8b, v11.8b 1016 trn2 v11.8b, v9.8b, v11.8b //row13 &14 1017 mov v9.8b, v31.8b 1018 trn1 v31.8b, v12.8b, v14.8b 1019 trn2 v14.8b, v12.8b, v14.8b //row7 & 8 1020 mov v12.8b, v31.8b 1021 trn1 v31.8b, v13.8b, v15.8b 1022 trn2 v15.8b, v13.8b, v15.8b //row15 & 16 1023 mov v13.8b , v31.8b 1024 trn1 v31.4h, v10.4h, v14.4h 1025 trn2 v14.4h, v10.4h, v14.4h //row6 & row8 1026 mov v10.8b, v31.8b 1027 trn1 v31.4h, v11.4h, v15.4h 1028 trn2 v15.4h, v11.4h, v15.4h //row14 & row16 1029 mov v11.8b, v31.8b 1030 //now Q3 ->p0 and Q7->q3 1031 trn1 v31.4h, v8.4h, v12.4h 1032 trn2 v12.4h, v8.4h, v12.4h //row 5 & 7 1033 mov v8.8b, v31.8b 1034 trn1 v31.4h, v9.4h, v13.4h 1035 trn2 v13.4h, v9.4h, v13.4h //row13 & row15 1036 mov v9.8b, v31.8b 1037 sub x0, x0, x1, lsl#4 //restore pointer 1038 trn1 v31.2s, v6.2s, v14.2s 1039 trn2 v14.2s, v6.2s, v14.2s //row4 & 8 1040 mov v6.8b , v31.8b 1041 trn1 v31.2s, v7.2s, v15.2s 1042 trn2 v15.2s, v7.2s, v15.2s //row 12 & 16 1043 mov v7.8b, v31.8b 1044 trn1 v31.2s, v0.2s, v8.2s 1045 trn2 v8.2s, v0.2s, v8.2s //row1 & row5 1046 mov v0.8b , v31.8b 1047 trn1 v31.2s, v1.2s, v9.2s 1048 trn2 v9.2s, v1.2s, v9.2s //row9 & 13 1049 mov v1.8b , v31.8b 1050 trn1 v31.2s, v2.2s, v10.2s 1051 trn2 v10.2s, v2.2s, v10.2s //row2 &6 1052 mov v2.8b , v31.8b 1053 trn1 v31.2s, v3.2s, v11.2s 1054 trn2 v11.2s, v3.2s, v11.2s //row10&row14 1055 mov v3.8b , v31.8b 1056 trn1 v31.2s, v20.2s, v12.2s 1057 trn2 v12.2s, v20.2s, v12.2s //row3 & 7 1058 mov v20.8b , v31.8b 1059 trn1 v31.2s, v21.2s, v13.2s 1060 trn2 v13.2s, v21.2s, v13.2s //row11 & row15 1061 mov v21.8b, v31.8b 1062 st1 {v0.8b}, [x0], x1 //row1 1063 st1 {v2.8b}, [x0], x1 //row2 1064 st1 {v20.8b}, [x0], x1 //row3 1065 st1 {v6.8b}, [x0], x1 //row4 1066 st1 
{v8.8b}, [x0], x1 //row5 1067 st1 {v10.8b}, [x0], x1 //row6 1068 st1 {v12.8b}, [x0], x1 //row7 1069 st1 {v14.8b}, [x0], x1 //row8 1070 st1 {v1.8b}, [x0], x1 //row9 1071 st1 {v3.8b}, [x0], x1 //row10 1072 st1 {v21.8b}, [x0], x1 //row11 1073 st1 {v7.8b}, [x0], x1 //row12 1074 st1 {v9.8b}, [x0], x1 //row13 1075 st1 {v11.8b}, [x0], x1 //row14 1076 st1 {v13.8b}, [x0], x1 //row15 1077 st1 {v15.8b}, [x0], x1 //row16 1078 1079 // LDMFD sp!,{x12,pc} 1080 ldp x19, x20, [sp], #16 1081 pop_v_regs 1082 ret 1083 1084 1085