///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_luma_mode_18_34_neon.s
//*
//* @brief
//*  contains function definitions for luma intra prediction modes 18 and 34.
//*  functions are coded in armv8 (aarch64) neon assembly.
//*
//* @author
//*  yogeswaran rs
//*
//* @par list of functions:
//*  - ihevc_intra_pred_luma_mode_18_34_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*  luma intra prediction for modes 18 and 34
//*
//* @par description:
//*  every output row is an unfiltered copy of a contiguous run of the
//*  reference array; only the start of the run changes from row to row:
//*    mode == 34 (0x22) : row r is read from pu1_ref + 2*nt + 2 + r
//*    any other mode    : row r is read from pu1_ref + 2*nt - r
//*  (the code only tests for 0x22; every other value takes the second path,
//*  which the function name associates with mode 18)
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the reference samples
//*
//* @param[in] src_strd
//*  integer source stride (never read; x1 is reused as a loop counter)
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] nt
//*  size of transform block; nt == 4 has a dedicated path, all other values
//*  go through the 8x8 tile loop
//*  NOTE(review): the tile loop presumably expects nt to be a multiple of 8
//*  (8, 16 or 32) - confirm against callers
//*
//* @param[in] mode
//*  intra prediction mode (0x22 selects mode 34)
//*
//* @returns
//*  none
//*
//* @remarks
//*  dst_strd, nt and mode are word32 in the c prototype. aapcs64 leaves the
//*  upper 32 bits of such argument registers unspecified, so they are
//*  sign-extended on entry before being used as 64-bit values.
//*
//*******************************************************************************
//*/

//void ihevc_intra_pred_luma_mode_18_34(uword8 *pu1_ref,
//                                      word32 src_strd,
//                                      uword8 *pu1_dst,
//                                      word32 dst_strd,
//                                      word32 nt,
//                                      word32 mode)
//
//**************variables vs registers*****************************************
//x0  => *pu1_ref   (becomes the base read pointer of the current tile)
//x1  => src_strd   (unused as an argument; reused as the tile-loop counter)
//x2  => *pu1_dst   (becomes the start address of the current output tile)
//x3  => dst_strd
//x4  => nt
//x5  => mode
//
//x6  => per-row step of the read pointer, tile loop   (+1 or -1)
//x8  => running read pointer (tile loop) / per-row step (nt == 4 path)
//x10 => running write pointer
//x11 => column bookkeeping for the read side
//x12 => column bookkeeping for the write side
//x14 => nt - 8
//x20 => scratch (callee-saved, spilled in the prologue)

.text
.align 4
.include "ihevc_neon_macros.s"



.globl ihevc_intra_pred_luma_mode_18_34_av8

.type ihevc_intra_pred_luma_mode_18_34_av8, %function

ihevc_intra_pred_luma_mode_18_34_av8:

    // push_v_regs/pop_v_regs come from ihevc_neon_macros.s; presumably they
    // spill and restore the callee-saved neon registers - verify in that file
    push_v_regs
    stp         x19, x20,[sp,#-16]!         // x20 is used as scratch below

    // the word32 arguments are consumed as 64-bit registers below; their
    // upper halves are unspecified on entry, so extend them first
    sxtw        x3, w3                      // dst_strd
    sxtw        x4, w4                      // nt
    sxtw        x5, w5                      // mode

    cmp         x4,#4
    beq         mode2_4                     // 4x4 block has its own path

    mov         x11,x4                      // x11 = nt
    mov         x12,x4                      // x12 = nt
    sub         x14,x4,#8                   // x14 = nt - 8

    add         x0,x0,x4,lsl #1             // x0 = pu1_ref + 2*nt

    cmp         x5,#0x22                    // flags: eq <=> mode 34
    mov         x10,x2                      // x10 = write pointer

    // the mov/add/sub instructions below do not touch the flags, so every
    // csel up to "mov x8,x0" still tests the mode comparison above
    add         x0,x0,#2
    sub         x20,x0,#2
    csel        x0, x20, x0,ne              // mode 34: ref+2nt+2, else ref+2nt
    mov         x20,#1
    csel        x6, x20, x6,eq              // mode 34: step = +1
    mov         x20,#-1
    csel        x6, x20, x6,ne              // otherwise: step = -1
    mov         x8,x0                       // x8 = running read pointer

prologue_cpy_32:
    // load the 8 rows of the first 8x8 tile into v0-v7 and, interleaved with
    // the loads, set up the loop counter x1 = nt * (nt / 8) - 8.
    // the counter drops by 8 per tile and reaches zero when all tiles are done.

    ld1         {v0.8b},[x8],x6
    lsr         x1, x4, #3                  // x1 = nt / 8
    ld1         {v1.8b},[x8],x6
    mul         x1, x4, x1                  // x1 = nt * (nt / 8)
    ld1         {v2.8b},[x8],x6
    ld1         {v3.8b},[x8],x6
    subs        x1,x1,#8                    // account for the tile just loaded
    ld1         {v4.8b},[x8],x6
    ld1         {v5.8b},[x8],x6
    ld1         {v6.8b},[x8],x6

    ld1         {v7.8b},[x8],x6


    beq         epilogue_mode2              // nt == 8: a single tile, store it
    sub         x11,x11,#8

    cmp         x5,#0x22
    add         x20,x0,#8
    csel        x0, x20, x0,ne              // not mode 34: base moves 8 right
    csel        x8, x0, x8,ne               // not mode 34: restart from base
    bne         kernel_mode18               // (mode 34 keeps x8 running on)

kernel_mode2:
    // mode 34 loop: store the tile held in v0-v7 while loading the next one.
    // ld1/st1 leave the flags alone, so each csel tests the nearest
    // preceding subs.
    st1         {v0.8b},[x10],x3
    st1         {v1.8b},[x10],x3
    subs        x12,x12,#8                  // flags: eq <=> tile row finished
    st1         {v2.8b},[x10],x3
    add         x20,x2,#8
    csel        x2, x20, x2,ne              // more tiles in this row: dst += 8
    st1         {v3.8b},[x10],x3

    ld1         {v0.8b},[x8],x6
    st1         {v4.8b},[x10],x3

    st1         {v5.8b},[x10],x3
    ld1         {v1.8b},[x8],x6
    st1         {v6.8b},[x10],x3
    ld1         {v2.8b},[x8],x6
    st1         {v7.8b},[x10],x3

    ld1         {v3.8b},[x8],x6
    sub         x20,x10,x14                 // x10 is now 8 rows further down
    csel        x2, x20, x2,eq              // row finished: column 0, 8 rows down
    ld1         {v4.8b},[x8],x6
    mov         x10,x2                      // write pointer for the next tile
    ld1         {v5.8b},[x8],x6
    csel        x12, x4, x12,eq             // row finished: reload x12 = nt
    ld1         {v6.8b},[x8],x6
    subs        x11,x11,#8                  // flags: eq <=> read side wrapped

    ld1         {v7.8b},[x8],x6

    add         x20,x0,#8
    csel        x0, x20, x0,eq              // wrapped: base advances by 8
    csel        x11, x4, x11,eq             // wrapped: reload x11 = nt
    csel        x8, x0, x8,eq               // wrapped: restart from the new base

    subs        x1, x1, #8                  // one more tile consumed

    bne         kernel_mode2

    b           epilogue_mode2

kernel_mode18:
    // same software pipeline as kernel_mode2, but the read pointer steps
    // backwards (x6 = -1) and is re-derived from x0 on every iteration
    st1         {v0.8b},[x10],x3
    st1         {v1.8b},[x10],x3
    subs        x12,x12,#8                  // flags: eq <=> tile row finished
    st1         {v2.8b},[x10],x3
    add         x20,x2,#8
    csel        x2, x20, x2,ne              // more tiles in this row: dst += 8
    st1         {v3.8b},[x10],x3

    ld1         {v0.8b},[x8],x6
    st1         {v4.8b},[x10],x3

    st1         {v5.8b},[x10],x3
    ld1         {v1.8b},[x8],x6

    st1         {v6.8b},[x10],x3
    ld1         {v2.8b},[x8],x6
    st1         {v7.8b},[x10],x3

    ld1         {v3.8b},[x8],x6
    sub         x20,x10,x14                 // x10 is now 8 rows further down
    csel        x2, x20, x2,eq              // row finished: column 0, 8 rows down
    ld1         {v4.8b},[x8],x6
    mov         x10,x2                      // write pointer for the next tile
    ld1         {v5.8b},[x8],x6
    csel        x12, x4, x12,eq             // row finished: reload x12 = nt
    ld1         {v6.8b},[x8],x6
    subs        x11,x11,#8                  // flags: eq <=> read side wrapped
    ld1         {v7.8b},[x8],x6

    add         x20,x0,#8
    csel        x0, x20, x0,ne              // same tile row: base moves 8 right
    csel        x11, x4, x11,eq             // wrapped: reload x11 = nt
    sub         x20,x8,x14                  // x8 has stepped back 8 by now
    csel        x0, x20, x0,eq              // wrapped: base = x8 - (nt - 8)
    subs        x1, x1, #8                  // one more tile consumed
    mov         x8,x0                       // mov keeps the flags for the bne

    bne         kernel_mode18


epilogue_mode2:
    // shared by both modes: flush the last tile still held in v0-v7

    st1         {v0.8b},[x10],x3
    st1         {v1.8b},[x10],x3
    st1         {v2.8b},[x10],x3
    st1         {v3.8b},[x10],x3
    st1         {v4.8b},[x10],x3
    st1         {v5.8b},[x10],x3
    st1         {v6.8b},[x10],x3
    st1         {v7.8b},[x10],x3

    b           end_func

mode2_4:
    // nt == 4: four rows, 4 bytes stored per row

    add         x0,x0,#10                   // pu1_ref + 2*4 + 2
    cmp         x5,#0x22
    sub         x20,x0,#2
    csel        x0, x20, x0,ne              // not mode 34: pu1_ref + 2*4

    mov         x20,#1
    csel        x8, x20, x8,eq              // mode 34: step = +1
    mov         x20,#-1
    csel        x8, x20, x8,ne              // otherwise: step = -1

    // NOTE(review): each load fetches 8 bytes although only the low 4 are
    // stored - confirm the reference buffer is padded enough for the over-read
    ld1         {v0.8b},[x0],x8
    st1         {v0.s}[0],[x2],x3

    ld1         {v0.8b},[x0],x8
    st1         {v0.s}[0],[x2],x3

    ld1         {v0.8b},[x0],x8
    st1         {v0.s}[0],[x2],x3

    ld1         {v0.8b},[x0],x8
    st1         {v0.s}[0],[x2],x3

end_func:
    ldp         x19, x20,[sp],#16           // undo the prologue in reverse order
    pop_v_regs
    ret