///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_luma_horz_neon.s
//*
//* @brief
//*  contains function definition for intra prediction interpolation filters
//*
//*
//* @author
//*  parthiban v
//*
//* @par list of functions:
//*  - ihevc_intra_pred_luma_horz()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
//
///**
//*******************************************************************************
//*
//* @brief
//*     intra prediction interpolation filter for horizontal luma variable.
//*
//* @par description:
//*      horizontal intraprediction(mode 10) with reference samples location
//*      pointed by 'pu1_ref' to the tu block location pointed by 'pu1_dst' refer
//*      to section 8.4.4.2.6 in the standard (special case)
//*
//*      with two_nt = 2 * nt, row r of the block is filled with
//*      pu1_ref[two_nt - 1 - r].
//*      for nt == 4, 8 and 16 the first row is filtered instead:
//*        pu1_dst[col] = clip_u8(pu1_ref[two_nt - 1] +
//*                       ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1))
//*      every other nt falls through to the 32-wide path, which applies no
//*      filtering and writes 16 rows of 32 bytes per loop iteration.
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the reference samples
//*
//* @param[in] src_strd
//*  integer source stride (never read by this routine)
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] nt
//*  integer transform block size
//*
//* @param[in] mode
//*  integer intraprediction mode (never read by this routine)
//*
//* @returns
//*
//* @remarks
//*  leaf routine. besides the argument registers it writes only x6, x9, x12,
//*  x14 and v0-v7 / v16-v31, so no stack frame is set up.
//*
//*******************************************************************************
//*/
//void ihevc_intra_pred_luma_horz(uword8 *pu1_ref,
//                                word32 src_strd,
//                                uword8 *pu1_dst,
//                                word32 dst_strd,
//                                word32 nt,
//                                word32 mode)
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//w1 => src_strd   (unused)
//x2 => *pu1_dst
//w3 => dst_strd   (sign-extended into x3 on entry)
//w4 => nt         (sign-extended into x4 on entry)
//w5 => mode       (unused)

.text
.align 4
.include "ihevc_neon_macros.s"



.globl ihevc_intra_pred_luma_horz_av8

.type ihevc_intra_pred_luma_horz_av8, %function

ihevc_intra_pred_luma_horz_av8:

    // dst_strd and nt are 32-bit arguments; the upper halves of x3 / x4 are
    // not guaranteed by the caller, so widen them before any 64-bit use.
    sxtw        x3,w3                       //dst_strd, used as 64-bit post-index below
    sxtw        x4,w4                       //nt, used for two_nt and the loop count

    lsl         x6,x4,#1                    //two_nt

    add         x12,x0,x6                   //&pu1_ref[two_nt]
    cmp         x4,#4                       //if nt == 4
    beq         core_loop_4

    cmp         x4,#8                       //if nt == 8
    beq         core_loop_8

    cmp         x4,#16                      //if nt == 16
    beq         core_loop_16
    sub         x12,x12,#16                 //&pu1_ref[two_nt - 16]
    add         x9,x2,#16                   //x9 = destination for columns 16-31

    // each iteration emits 16 rows; row r is pu1_ref[two_nt - 1 - r]
    // replicated across all 32 columns (no first-row filtering here).
core_loop_32:
    ld1         { v0.16b},[x12]             //load 16 values. v0.b[15] holds the value for the 1st row.

    dup         v2.16b, v0.b[15]            //duplicate the i value.

    dup         v4.16b, v0.b[14]            //duplicate the ii value.
    dup         v6.16b, v0.b[13]            //duplicate the iii value.
    st1         { v2.16b},[x2],x3           //store in 1st row 0-16 columns
    st1         { v2.16b},[x9],x3           //store in 1st row 16-32 columns

    dup         v1.16b, v0.b[12]
    st1         { v4.16b},[x2],x3
    st1         { v4.16b},[x9],x3

    dup         v2.16b, v0.b[11]
    st1         { v6.16b},[x2],x3
    st1         { v6.16b},[x9],x3

    dup         v4.16b, v0.b[10]
    st1         { v1.16b},[x2],x3
    st1         { v1.16b},[x9],x3

    dup         v6.16b, v0.b[9]
    st1         { v2.16b},[x2],x3
    st1         { v2.16b},[x9],x3

    dup         v1.16b, v0.b[8]
    st1         { v4.16b},[x2],x3
    st1         { v4.16b},[x9],x3

    dup         v2.16b, v0.b[7]
    st1         { v6.16b},[x2],x3
    st1         { v6.16b},[x9],x3

    dup         v4.16b, v0.b[6]
    st1         { v1.16b},[x2],x3
    st1         { v1.16b},[x9],x3

    dup         v6.16b, v0.b[5]
    st1         { v2.16b},[x2],x3
    st1         { v2.16b},[x9],x3

    dup         v1.16b, v0.b[4]
    st1         { v4.16b},[x2],x3
    st1         { v4.16b},[x9],x3

    dup         v2.16b, v0.b[3]
    st1         { v6.16b},[x2],x3
    st1         { v6.16b},[x9],x3

    dup         v4.16b, v0.b[2]
    st1         { v1.16b},[x2],x3
    st1         { v1.16b},[x9],x3

    dup         v6.16b, v0.b[1]
    st1         { v2.16b},[x2],x3
    st1         { v2.16b},[x9],x3
    sub         x12,x12,#16                 //step back to the previous 16 reference values

    dup         v1.16b, v0.b[0]
    st1         { v4.16b},[x2],x3
    st1         { v4.16b},[x9],x3

    subs        x4,x4,#16                   //decrement the loop count by 16
    st1         { v6.16b},[x2],x3
    st1         { v6.16b},[x9],x3

    st1         { v1.16b},[x2],x3
    st1         { v1.16b},[x9],x3
    bgt         core_loop_32

    ret


    // nt == 16: first row is filtered in two 8-column halves, rows 1-15 are
    // plain replication of pu1_ref[two_nt - 1 - row].
core_loop_16:
    ldrb        w14,[x12],#1                //pu1_ref[two_nt] (ldrb zero-extends into w14)
    ld1         { v30.8b},[x12],#8          //pu1_ref[two_nt + 1 + col], col = 0..7
    ld1         { v31.8b},[x12]             //pu1_ref[two_nt + 1 + col], col = 8..15

    dup         v28.8b,w14
    sub         x12,x12,#25                 //x12 was &pu1_ref[two_nt + 9]; back to &pu1_ref[two_nt - 16]
    ld1         { v0.16b},[x12]
    dup         v26.8b, v0.b[15]            //pu1_ref[two_nt - 1]
    uxtl        v26.8h, v26.8b

    dup         v2.16b, v0.b[14]
    usubl       v24.8h, v30.8b, v28.8b      //pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]

    dup         v4.16b, v0.b[13]
    sshr        v24.8h, v24.8h,#1

    dup         v6.16b, v0.b[12]
    sqadd       v22.8h, v26.8h , v24.8h

    dup         v1.16b, v0.b[11]
    sqxtun      v22.8b, v22.8h              //saturate to unsigned 8 bit

    st1         {v22.8b},[x2],#8            //1st row, columns 0-7

    dup         v18.16b, v0.b[10]
    usubl       v24.8h, v31.8b, v28.8b

    dup         v19.16b, v0.b[9]
    sshr        v24.8h, v24.8h,#1

    dup         v20.16b, v0.b[8]
    sqadd       v22.8h, v26.8h , v24.8h

    dup         v16.16b, v0.b[7]
    sqxtun      v22.8b, v22.8h

    st1         {v22.8b},[x2],x3            //1st row, columns 8-15; advance one row
    sub         x2,x2,#8                    //back to column 0 of the 2nd row

    st1         { v2.16b},[x2],x3

    st1         { v4.16b},[x2],x3
    st1         { v6.16b},[x2],x3
    st1         { v1.16b},[x2],x3

    dup         v2.16b, v0.b[6]
    st1         { v18.16b},[x2],x3

    dup         v4.16b, v0.b[5]
    st1         { v19.16b},[x2],x3

    dup         v6.16b, v0.b[4]
    st1         { v20.16b},[x2],x3

    dup         v1.16b, v0.b[3]
    st1         { v16.16b},[x2],x3

    dup         v18.16b, v0.b[2]
    st1         { v2.16b},[x2],x3

    dup         v19.16b, v0.b[1]
    st1         { v4.16b},[x2],x3

    dup         v20.16b, v0.b[0]
    st1         { v6.16b},[x2],x3

    st1         { v1.16b},[x2],x3
    st1         { v18.16b},[x2],x3
    st1         { v19.16b},[x2],x3
    st1         { v20.16b},[x2],x3

    ret


    // nt == 8: filtered first row, then rows 1-7 replicated.
core_loop_8:
    ldrb        w14,[x12]                   //pu1_ref[two_nt] (ldrb zero-extends into w14)
    add         x12,x12,#1                  //pu1_ref[two_nt + 1]
    ld1         {v30.8b},[x12]              //pu1_ref[two_nt + 1 + col]

    sub         x12,x12,#9                  //&pu1_ref[two_nt - 8]
    ld1         {v0.8b},[x12]
    dup         v26.8b, v0.b[7]             //pu1_ref[two_nt - 1]
    dup         v28.8b,w14

    dup         v3.8b, v0.b[6]
    uxtl        v26.8h, v26.8b

    dup         v4.8b, v0.b[5]
    usubl       v24.8h, v30.8b, v28.8b      //pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]

    dup         v5.8b, v0.b[4]
    sshr        v24.8h, v24.8h,#1

    dup         v6.8b, v0.b[3]
    sqadd       v22.8h, v26.8h , v24.8h

    dup         v7.8b, v0.b[2]
    sqxtun      v22.8b, v22.8h              //saturate to unsigned 8 bit

    st1         {v22.8b},[x2],x3
    st1         {v3.8b},[x2],x3

    dup         v1.8b, v0.b[1]
    st1         {v4.8b},[x2],x3
    st1         {v5.8b},[x2],x3

    dup         v17.8b, v0.b[0]
    st1         {v6.8b},[x2],x3
    st1         {v7.8b},[x2],x3

    st1         {v1.8b},[x2],x3
    st1         {v17.8b},[x2],x3

    ret


    // nt == 4: same scheme as nt == 8; 8 bytes are loaded and computed but
    // only the low 4 bytes (one 32-bit lane) of each row are stored.
core_loop_4:
    ldrb        w14,[x12]                   //pu1_ref[two_nt] (ldrb zero-extends into w14)
    add         x12,x12,#1                  //pu1_ref[two_nt + 1]
    ld1         {v30.8b},[x12]              //pu1_ref[two_nt + 1 + col]

    sub         x12,x12,#5                  //&pu1_ref[two_nt - 4]
    ld1         {v0.8b},[x12]
    dup         v28.8b,w14
    dup         v26.8b, v0.b[3]             //pu1_ref[two_nt - 1]
    uxtl        v26.8h, v26.8b

    dup         v3.8b, v0.b[2]
    usubl       v24.8h, v30.8b, v28.8b      //pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]

    dup         v4.8b, v0.b[1]
    sshr        v24.8h, v24.8h,#1

    dup         v5.8b, v0.b[0]
    sqadd       v22.8h, v26.8h , v24.8h

    sqxtun      v22.8b, v22.8h              //saturate to unsigned 8 bit

    st1         {v22.s}[0],[x2],x3
    st1         {v3.s}[0],[x2],x3
    st1         {v4.s}[0],[x2],x3
    st1         {v5.s}[0],[x2],x3

    ret
end_func:


