///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//******************************************************************************
//* //file
//*  ihevc_inter_pred_filters_luma_vert_w16inp.s
//*
//* //brief
//*  contains function definitions for inter prediction  interpolation.
//*  functions are coded using neon instructions (aarch64, gnu assembler
//*  syntax).
//*
//* //author
//*  yogeswaran rs
//*
//* //par list of functions:
//*
//*  - ihevc_inter_pred_luma_vert_w16inp_w16out_av8()
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/

///* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
///* include reconstruction */
//

///**
//*******************************************************************************
//*
//* //brief
//*     luma vertical filter for 16bit input and 16bit output.
//*
//* //par description:
//*     applies an 8 tap vertical filter with the coefficients pointed to by
//*     'pi1_coeff' to the 16 bit elements pointed to by 'pi2_src' and writes
//*     16 bit results to the location pointed to by 'pi2_dst'.
//*     the taps are applied to the source rows -3 .. +4 around every output
//*     row. for every output sample the 32 bit filter sum has 0x80000
//*     subtracted, is shifted right by 6 (no rounding) and is narrowed to
//*     16 bits (no saturation).
//*     assumptions : every loop iteration produces a block of 4 columns x
//*     4 rows and the number of iterations is derived from (ht * wd) / 16,
//*     so width and height are expected to be multiples of 4.
//*     the function returns without writing anything when ht <= 0 or wd <= 0.
//*     the software pipeline loads the source rows of the following block
//*     before it is known whether that block exists.
//*
//* //param[in] pi2_src
//*  word16 pointer to the source
//*
//* //param[out] pi2_dst
//*  word16 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride, in 16 bit elements
//*
//* //param[in] dst_strd
//*  integer destination stride, in 16 bit elements
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the 8 filter coefficients
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*  none
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_inter_pred_luma_vert_w16inp_w16out_av8(word16 *pi2_src,
//                                                  word16 *pi2_dst,
//                                                  word32 src_strd,
//                                                  word32 dst_strd,
//                                                  word8 *pi1_coeff,
//                                                  word32 ht,
//                                                  word32 wd   )
//**************variables vs registers*****************************************
//  on entry (aapcs64):
//    x0 => *pi2_src
//    x1 => *pi2_dst
//    w2 =>  src_strd
//    w3 =>  dst_strd
//    x4 => *pi1_coeff
//    w5 =>  ht
//    w6 =>  wd
//
//  inside the loops:
//    x0      => source pointer of the column block being loaded (row -3)
//    x1      => destination pointer of the column block being stored (row 0)
//    x2      => source stride in bytes
//    x3      => running source row pointer
//    x4      => columns left in the current band of 4 rows
//    x5      => wd
//    x6      => destination stride in bytes
//    x7      => loop counter, decremented by 4 for every 4x4 block
//    x8      => source step from the end of one band of 4 rows to the next
//    x9      => destination step from the end of one band of 4 rows to the next
//    x14     => destination pointer for rows 1..3 of a block
//    x20     => scratch for the conditional pointer updates
//    v0-v7, v16-v18 => source rows
//    v19, v20, v21, v31 => filter sums / results of block rows 0, 1, 2, 3
//    v22-v29 => filter coefficients 0..7, one per register
//    v30     => offset 0x80000 subtracted from every filter sum

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_luma_vert_w16inp_w16out_av8

.type ihevc_inter_pred_luma_vert_w16inp_w16out_av8, %function

ihevc_inter_pred_luma_vert_w16inp_w16out_av8:

    // x20 is callee-saved and used as scratch below; it is stored as a pair
    // with x19 so that sp stays 16 byte aligned.
    stp         x19,x20,[sp, #-16]!

    // src_strd, dst_strd, ht and wd are 32 bit ints. the upper halves of
    // their argument registers are not guaranteed by the procedure call
    // standard, so sign extend before any 64 bit arithmetic.
    sxtw        x2,w2                       // x2 = src_strd
    sxtw        x3,w3                       // x3 = dst_strd
    sxtw        x7,w5                       // x7 = ht
    sxtw        x5,w6                       // x5 = wd

    cmp         x7,#0
    ble         end_loops                   // nothing to do for ht <= 0
    cmp         x5,#0
    ble         end_loops                   // nothing to do for wd <= 0

    ld1         {v0.8b},[x4]                // load the 8 signed 8 bit coefficients
    lsl         x6,x3,#1                    // x6 = dst_strd in bytes
    lsl         x2,x2,#1                    // x2 = src_strd in bytes
    sub         x12,x2,x2,lsl #2            // x12 = -3 * src_strd in bytes
    add         x0,x0,x12                   // start 3 rows above the first output row

    sxtl        v0.8h,v0.8b                 // widen the coefficients to 16 bit
    dup         v22.4h,v0.h[0]              // coeff_0
    dup         v23.4h,v0.h[1]              // coeff_1
    dup         v24.4h,v0.h[2]              // coeff_2
    dup         v25.4h,v0.h[3]              // coeff_3
    dup         v26.4h,v0.h[4]              // coeff_4
    dup         v27.4h,v0.h[5]              // coeff_5
    dup         v28.4h,v0.h[6]              // coeff_6
    dup         v29.4h,v0.h[7]              // coeff_7
    movi        v30.4s,#8, lsl #16          // offset 0x80000 in every 32 bit lane

    // steps that move a pointer, which has already walked 2 * wd bytes along
    // a band, to the start of the band 4 rows further down.
    lsl         x9,x6,#2
    sub         x9,x9,x5,lsl #1             // x9 = 4 * dst_strd_bytes - 2 * wd
    lsl         x8,x2,#2
    sub         x8,x8,x5,lsl #1             // x8 = 4 * src_strd_bytes - 2 * wd

    lsr         x3,x5,#2                    // x3 = wd / 4
    mul         x7,x7,x3                    // x7 = ht * (wd / 4)
    sub         x7,x7,#4                    // one block is handled by prolog + epilog_end
    mov         x4,x5                       // x4 = columns left in the current band

// prolog: compute the four rows of the first block, store its row 0 and
// load the source rows of the second block.
prolog:

    add         x3,x0,x2                    // x3 -> source row 1
    ld1         {v1.4h},[x3],x2             // source row 1
    ld1         {v0.4h},[x0], #8            // source row 0, x0 moves 4 columns right
    subs        x4,x4,#4                    // flags: le when the band is finished
    ld1         {v2.4h},[x3],x2             // source row 2
    smull       v19.4s,v1.4h,v23.4h         // row 0 sum  = row1 * coeff_1
    ld1         {v3.4h},[x3],x2             // source row 3
    smlal       v19.4s,v0.4h,v22.4h         // row 0 sum += row0 * coeff_0
    ld1         {v4.4h},[x3],x2             // source row 4
    smlal       v19.4s,v2.4h,v24.4h         // row 0 sum += row2 * coeff_2
    ld1         {v5.4h},[x3],x2             // source row 5
    smlal       v19.4s,v3.4h,v25.4h         // row 0 sum += row3 * coeff_3
    ld1         {v6.4h},[x3],x2             // source row 6
    smlal       v19.4s,v4.4h,v26.4h         // row 0 sum += row4 * coeff_4
    ld1         {v7.4h},[x3],x2             // source row 7
    smlal       v19.4s,v5.4h,v27.4h         // row 0 sum += row5 * coeff_5
    smlal       v19.4s,v6.4h,v28.4h         // row 0 sum += row6 * coeff_6
    smlal       v19.4s,v7.4h,v29.4h         // row 0 sum += row7 * coeff_7

    ld1         {v16.4h},[x3],x2            // source row 8

    smull       v20.4s,v2.4h,v23.4h         // row 1 sum  = row2 * coeff_1
    add         x20,x0,x8,lsl #0
    csel        x0,x20,x0,le                // band finished: source moves to the next band
    smlal       v20.4s,v1.4h,v22.4h         // row 1 sum += row1 * coeff_0
    csel        x4,x5,x4,le                 // band finished: reload the column counter
    smlal       v20.4s,v3.4h,v24.4h         // row 1 sum += row3 * coeff_2
    ld1         {v17.4h},[x3],x2            // source row 9
    smlal       v20.4s,v4.4h,v25.4h         // row 1 sum += row4 * coeff_3
    ld1         {v18.4h},[x3],x2            // source row 10
    smlal       v20.4s,v5.4h,v26.4h         // row 1 sum += row5 * coeff_4
    add         x3,x0,x2                    // x3 -> source row 1 of the next block
    smlal       v20.4s,v6.4h,v27.4h         // row 1 sum += row6 * coeff_5
    smlal       v20.4s,v7.4h,v28.4h         // row 1 sum += row7 * coeff_6
    smlal       v20.4s,v16.4h,v29.4h        // row 1 sum += row8 * coeff_7
    sub         v19.4s, v19.4s, v30.4s      // row 0 sum -= offset

    ld1         {v1.4h},[x3],x2             // next block: source row 1
    smull       v21.4s,v3.4h,v23.4h         // row 2 sum  = row3 * coeff_1
    ld1         {v0.4h},[x0],#8             // next block: source row 0
    smlal       v21.4s,v2.4h,v22.4h         // row 2 sum += row2 * coeff_0
    ld1         {v2.4h},[x3],x2             // next block: source row 2
    smlal       v21.4s,v4.4h,v24.4h         // row 2 sum += row4 * coeff_2
    smlal       v21.4s,v5.4h,v25.4h         // row 2 sum += row5 * coeff_3
    smlal       v21.4s,v6.4h,v26.4h         // row 2 sum += row6 * coeff_4
    smlal       v21.4s,v7.4h,v27.4h         // row 2 sum += row7 * coeff_5
    smlal       v21.4s,v16.4h,v28.4h        // row 2 sum += row8 * coeff_6
    smlal       v21.4s,v17.4h,v29.4h        // row 2 sum += row9 * coeff_7
    add         x14,x1,x6                   // x14 -> destination row 1 of this block
    sub         v20.4s, v20.4s, v30.4s      // row 1 sum -= offset
    shrn        v19.4h, v19.4s, #6          // row 0 result = sum >> 6, 16 bit

    smull       v31.4s,v4.4h,v23.4h         // row 3 sum  = row4 * coeff_1
    smlal       v31.4s,v3.4h,v22.4h         // row 3 sum += row3 * coeff_0
    smlal       v31.4s,v5.4h,v24.4h         // row 3 sum += row5 * coeff_2
    smlal       v31.4s,v6.4h,v25.4h         // row 3 sum += row6 * coeff_3
    ld1         {v3.4h},[x3],x2             // next block: source row 3
    smlal       v31.4s,v7.4h,v26.4h         // row 3 sum += row7 * coeff_4
    ld1         {v4.4h},[x3],x2             // next block: source row 4
    smlal       v31.4s,v16.4h,v27.4h        // row 3 sum += row8 * coeff_5
    ld1         {v5.4h},[x3],x2             // next block: source row 5
    smlal       v31.4s,v17.4h,v28.4h        // row 3 sum += row9 * coeff_6
    ld1         {v6.4h},[x3],x2             // next block: source row 6
    smlal       v31.4s,v18.4h,v29.4h        // row 3 sum += row10 * coeff_7
    ld1         {v7.4h},[x3],x2             // next block: source row 7

    st1         {v19.2s},[x1],#8            // store row 0, x1 moves 4 columns right
    sub         v21.4s, v21.4s, v30.4s      // row 2 sum -= offset
    shrn        v20.4h, v20.4s, #6          // row 1 result
    add         x20, x1, x9
    csel        x1, x20, x1, le             // band finished: destination moves to the next band

    subs        x7,x7,#4                    // one block accounted for

    blt         epilog_end                  // only one block in total: drain it
    beq         epilog                      // exactly two blocks: no steady state needed

// kernel_8: steady state. stores rows 1..3 of the previous block, computes
// the four rows of the current block, stores its row 0 and loads the source
// rows of the following block.
kernel_8:

    smull       v19.4s,v1.4h,v23.4h         // row 0 sum  = row1 * coeff_1
    subs        x4,x4,#4                    // flags: le when the band is finished
    smlal       v19.4s,v0.4h,v22.4h         // row 0 sum += row0 * coeff_0
    add         x20,x0,x8,lsl #0
    csel        x0,x20,x0,le                // band finished: source moves to the next band
    smlal       v19.4s,v2.4h,v24.4h         // row 0 sum += row2 * coeff_2
    smlal       v19.4s,v3.4h,v25.4h         // row 0 sum += row3 * coeff_3
    smlal       v19.4s,v4.4h,v26.4h         // row 0 sum += row4 * coeff_4
    smlal       v19.4s,v5.4h,v27.4h         // row 0 sum += row5 * coeff_5
    smlal       v19.4s,v6.4h,v28.4h         // row 0 sum += row6 * coeff_6
    smlal       v19.4s,v7.4h,v29.4h         // row 0 sum += row7 * coeff_7
    st1         {v20.2s},[x14],x6           // previous block: store row 1

    sub         v31.4s, v31.4s, v30.4s      // previous block: row 3 sum -= offset
    shrn        v21.4h, v21.4s, #6          // previous block: row 2 result
    ld1         {v16.4h},[x3],x2            // source row 8

    smull       v20.4s,v2.4h,v23.4h         // row 1 sum  = row2 * coeff_1
    smlal       v20.4s,v1.4h,v22.4h         // row 1 sum += row1 * coeff_0
    smlal       v20.4s,v3.4h,v24.4h         // row 1 sum += row3 * coeff_2
    smlal       v20.4s,v4.4h,v25.4h         // row 1 sum += row4 * coeff_3
    smlal       v20.4s,v5.4h,v26.4h         // row 1 sum += row5 * coeff_4
    smlal       v20.4s,v6.4h,v27.4h         // row 1 sum += row6 * coeff_5
    st1         {v21.2s},[x14],x6           // previous block: store row 2

    smlal       v20.4s,v7.4h,v28.4h         // row 1 sum += row7 * coeff_6
    ld1         {v17.4h},[x3],x2            // source row 9

    smlal       v20.4s,v16.4h,v29.4h        // row 1 sum += row8 * coeff_7

    sub         v19.4s, v19.4s, v30.4s      // row 0 sum -= offset
    shrn        v31.4h, v31.4s, #6          // previous block: row 3 result

    smull       v21.4s,v3.4h,v23.4h         // row 2 sum  = row3 * coeff_1
    csel        x4,x5,x4,le                 // band finished: reload the column counter

    smlal       v21.4s,v2.4h,v22.4h         // row 2 sum += row2 * coeff_0
    ld1         {v18.4h},[x3],x2            // source row 10

    smlal       v21.4s,v4.4h,v24.4h         // row 2 sum += row4 * coeff_2
    add         x3,x0,x2                    // x3 -> source row 1 of the next block

    smlal       v21.4s,v5.4h,v25.4h         // row 2 sum += row5 * coeff_3

    smlal       v21.4s,v6.4h,v26.4h         // row 2 sum += row6 * coeff_4
    st1         {v31.2s},[x14],x6           // previous block: store row 3

    smlal       v21.4s,v7.4h,v27.4h         // row 2 sum += row7 * coeff_5
    ld1         {v1.4h},[x3],x2             // next block: source row 1

    smlal       v21.4s,v16.4h,v28.4h        // row 2 sum += row8 * coeff_6
    add         x14,x1,x6                   // x14 -> destination row 1 of this block

    smlal       v21.4s,v17.4h,v29.4h        // row 2 sum += row9 * coeff_7
    ld1         {v0.4h},[x0],#8             // next block: source row 0

    sub         v20.4s, v20.4s, v30.4s      // row 1 sum -= offset
    shrn        v19.4h, v19.4s, #6          // row 0 result
    ld1         {v2.4h},[x3],x2             // next block: source row 2

    smull       v31.4s,v4.4h,v23.4h         // row 3 sum  = row4 * coeff_1
    smlal       v31.4s,v3.4h,v22.4h         // row 3 sum += row3 * coeff_0
    smlal       v31.4s,v5.4h,v24.4h         // row 3 sum += row5 * coeff_2
    ld1         {v3.4h},[x3],x2             // next block: source row 3

    smlal       v31.4s,v6.4h,v25.4h         // row 3 sum += row6 * coeff_3
    ld1         {v4.4h},[x3],x2             // next block: source row 4
    smlal       v31.4s,v7.4h,v26.4h         // row 3 sum += row7 * coeff_4
    ld1         {v5.4h},[x3],x2             // next block: source row 5
    smlal       v31.4s,v16.4h,v27.4h        // row 3 sum += row8 * coeff_5
    ld1         {v6.4h},[x3],x2             // next block: source row 6
    smlal       v31.4s,v17.4h,v28.4h        // row 3 sum += row9 * coeff_6
    ld1         {v7.4h},[x3],x2             // next block: source row 7
    smlal       v31.4s,v18.4h,v29.4h        // row 3 sum += row10 * coeff_7
    st1         {v19.2s},[x1],#8            // store row 0, x1 moves 4 columns right

    sub         v21.4s, v21.4s, v30.4s      // row 2 sum -= offset
    shrn        v20.4h, v20.4s, #6          // row 1 result
    add         x20, x1, x9
    csel        x1, x20, x1, le             // band finished: destination moves to the next band

    subs        x7,x7,#4                    // one block accounted for

    bgt         kernel_8                    // more blocks remain after the one already loaded

// epilog: stores rows 1..3 of the previous block and computes the last
// block, whose source rows 0..7 are already loaded. no further block is
// loaded here.
epilog:

    smull       v19.4s,v1.4h,v23.4h         // row 0 sum  = row1 * coeff_1
    smlal       v19.4s,v0.4h,v22.4h         // row 0 sum += row0 * coeff_0
    smlal       v19.4s,v2.4h,v24.4h         // row 0 sum += row2 * coeff_2
    smlal       v19.4s,v3.4h,v25.4h         // row 0 sum += row3 * coeff_3
    smlal       v19.4s,v4.4h,v26.4h         // row 0 sum += row4 * coeff_4
    smlal       v19.4s,v5.4h,v27.4h         // row 0 sum += row5 * coeff_5
    smlal       v19.4s,v6.4h,v28.4h         // row 0 sum += row6 * coeff_6
    smlal       v19.4s,v7.4h,v29.4h         // row 0 sum += row7 * coeff_7
    st1         {v20.2s},[x14],x6           // previous block: store row 1

    sub         v31.4s, v31.4s, v30.4s      // previous block: row 3 sum -= offset
    shrn        v21.4h, v21.4s, #6          // previous block: row 2 result

    ld1         {v16.4h},[x3],x2            // source row 8
    smull       v20.4s,v2.4h,v23.4h         // row 1 sum  = row2 * coeff_1
    smlal       v20.4s,v1.4h,v22.4h         // row 1 sum += row1 * coeff_0
    smlal       v20.4s,v3.4h,v24.4h         // row 1 sum += row3 * coeff_2
    smlal       v20.4s,v4.4h,v25.4h         // row 1 sum += row4 * coeff_3
    smlal       v20.4s,v5.4h,v26.4h         // row 1 sum += row5 * coeff_4
    smlal       v20.4s,v6.4h,v27.4h         // row 1 sum += row6 * coeff_5
    smlal       v20.4s,v7.4h,v28.4h         // row 1 sum += row7 * coeff_6
    smlal       v20.4s,v16.4h,v29.4h        // row 1 sum += row8 * coeff_7
    st1         {v21.2s},[x14],x6           // previous block: store row 2

    sub         v19.4s, v19.4s, v30.4s      // row 0 sum -= offset
    shrn        v31.4h, v31.4s, #6          // previous block: row 3 result

    ld1         {v17.4h},[x3],x2            // source row 9
    smull       v21.4s,v3.4h,v23.4h         // row 2 sum  = row3 * coeff_1
    smlal       v21.4s,v2.4h,v22.4h         // row 2 sum += row2 * coeff_0
    smlal       v21.4s,v4.4h,v24.4h         // row 2 sum += row4 * coeff_2
    smlal       v21.4s,v5.4h,v25.4h         // row 2 sum += row5 * coeff_3
    smlal       v21.4s,v6.4h,v26.4h         // row 2 sum += row6 * coeff_4
    smlal       v21.4s,v7.4h,v27.4h         // row 2 sum += row7 * coeff_5
    smlal       v21.4s,v16.4h,v28.4h        // row 2 sum += row8 * coeff_6
    smlal       v21.4s,v17.4h,v29.4h        // row 2 sum += row9 * coeff_7
    st1         {v31.2s},[x14],x6           // previous block: store row 3
    sub         v20.4s, v20.4s, v30.4s      // row 1 sum -= offset
    shrn        v19.4h, v19.4s, #6          // row 0 result

    ld1         {v18.4h},[x3],x2            // source row 10
    smull       v31.4s,v4.4h,v23.4h         // row 3 sum  = row4 * coeff_1
    smlal       v31.4s,v3.4h,v22.4h         // row 3 sum += row3 * coeff_0
    smlal       v31.4s,v5.4h,v24.4h         // row 3 sum += row5 * coeff_2
    smlal       v31.4s,v6.4h,v25.4h         // row 3 sum += row6 * coeff_3
    smlal       v31.4s,v7.4h,v26.4h         // row 3 sum += row7 * coeff_4
    smlal       v31.4s,v16.4h,v27.4h        // row 3 sum += row8 * coeff_5
    smlal       v31.4s,v17.4h,v28.4h        // row 3 sum += row9 * coeff_6
    smlal       v31.4s,v18.4h,v29.4h        // row 3 sum += row10 * coeff_7
    sub         v21.4s, v21.4s, v30.4s      // row 2 sum -= offset
    shrn        v20.4h, v20.4s, #6          // row 1 result

    add         x14,x1,x6                   // x14 -> destination row 1 of the last block
    st1         {v19.2s},[x1],#8            // store row 0 of the last block

// epilog_end: drain rows 1..3 of the last block. on entry v20 holds the
// finished row 1 result, v21 the row 2 sum with the offset already
// subtracted and v31 the raw row 3 sum.
epilog_end:
    st1         {v20.2s},[x14],x6           // store row 1
    shrn        v21.4h, v21.4s, #6          // row 2 result

    st1         {v21.2s},[x14],x6           // store row 2
    sub         v31.4s, v31.4s, v30.4s      // row 3 sum -= offset
    shrn        v31.4h, v31.4s, #6          // row 3 result

    st1         {v31.2s},[x14],x6           // store row 3


end_loops:

    ldp         x19, x20,[sp], #16          // restore the callee-saved pair

    ret