///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_chroma_ver_neon.s
//*
//* @brief
//*  contains the function definition for chroma intra prediction, vertical
//*  mode. the function is hand-written AArch64 (A64) NEON assembly in GNU
//*  assembler syntax.
//*
//* @author
//*  yogeswaran rs
//*
//* @par list of functions:
//*  - ihevc_intra_pred_chroma_ver_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*  chroma intra prediction, vertical mode
//*
//* @par description:
//*  reads one row of bytes starting at pu1_ref + 4*nt + 2 and writes that same
//*  row to nt consecutive destination rows, dst_strd bytes apart.
//*  bytes copied per row: 8 when nt < 8, 16 when nt == 8, 32 when nt > 8.
//*  (presumably the source row is the top neighbour row of interleaved chroma
//*  samples - confirm against the reference-array layout used by the caller.)
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the source (reference sample array)
//*
//* @param[in] src_strd
//*  integer source stride (not read by this function)
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] dst_strd
//*  integer destination stride, in bytes
//*
//* @param[in] nt
//*  size of transform block; selects the 4-, 8- or 16-wide copy path.
//*  the 16-wide path writes rows in groups of four.
//*
//* @param[in] mode
//*  type of filtering (not read by this function)
//*
//* @returns
//*  none
//*
//* @remarks
//*  only v0 and v20-v23 and the scratch registers x5, x6, x8, x10, x11 are
//*  written by the instructions in this file.
//*
//*******************************************************************************
//*/

//void ihevc_intra_pred_chroma_ver(uword8 *pu1_ref,
//                                 word32 src_strd,
//                                 uword8 *pu1_dst,
//                                 word32 dst_strd,
//                                 word32 nt,
//                                 word32 mode)
//**************variables vs registers*****************************************
//x0  => *pu1_ref
//x1  => src_strd      (unused)
//x2  => *pu1_dst      (advanced while storing; also row 0 pointer)
//x3  => dst_strd      (sign-extended from w3 on entry)
//x4  => nt            (sign-extended from w4 on entry; row counter, 16-wide path)
//x5  => mode on entry (unused); reused as scratch / row 1 pointer
//x6  => source row pointer (pu1_ref + 4*nt + 2)
//x8  => row 2 pointer
//x10 => row 3 pointer
//x11 => pointer step between groups of four rows

.text
.align 4
.include "ihevc_neon_macros.s"


.globl ihevc_intra_pred_chroma_ver_av8

.type ihevc_intra_pred_chroma_ver_av8, %function

ihevc_intra_pred_chroma_ver_av8:

    // NOTE(review): push_v_regs/pop_v_regs come from ihevc_neon_macros.s and
    // presumably spill the callee-saved NEON registers. Nothing below writes
    // v8-v15, so the pair may be removable - confirm against the macro body.
    push_v_regs

    // dst_strd and nt are 32-bit arguments. AAPCS64 leaves bits [63:32] of
    // the argument registers unspecified, so widen them before 64-bit use.
    sxtw        x3, w3                      // x3 = (int64) dst_strd
    sxtw        x4, w4                      // x4 = (int64) nt

    lsl         x5, x4, #2                  // 4*nt
    add         x5, x5, #2                  // 4*nt + 2
    add         x6, x0, x5                  // x6 = pu1_ref + 4*nt + 2 (source row)

    cmp         x4, #8
    beq         blk_8                       // nt == 8 : 16 bytes per row
    blt         blk_4                       // nt <  8 : 8 bytes per row
                                            // nt >  8 : 32 bytes per row (fall through)

copy_16:
    add         x5, x2, x3                  // row 1 = pu1_dst + dst_strd
    ld2         {v20.8b, v21.8b}, [x6], #16 // source bytes 0..15
    add         x8, x5, x3                  // row 2
    add         x10, x8, x3                 // row 3
    ld2         {v22.8b, v23.8b}, [x6]      // source bytes 16..31
    lsl         x11, x3, #2                 // 4*dst_strd
    sub         x11, x11, #16               // minus the 16 already stepped per row

    // Each pass writes four full rows. ld2/st2 use the same two-way
    // (de)interleave, so every row is a byte-exact copy of the source row.
kernel_copy_16:
    st2         {v20.8b, v21.8b}, [x2], #16 // first 16 bytes of rows 0..3
    st2         {v20.8b, v21.8b}, [x5], #16
    st2         {v20.8b, v21.8b}, [x8], #16
    st2         {v20.8b, v21.8b}, [x10], #16

    st2         {v22.8b, v23.8b}, [x2], x11 // last 16 bytes; step to next 4 rows
    st2         {v22.8b, v23.8b}, [x5], x11
    st2         {v22.8b, v23.8b}, [x8], x11
    st2         {v22.8b, v23.8b}, [x10], x11

    subs        x4, x4, #4                  // four rows done
    bgt         kernel_copy_16

    b           end_func

blk_8:
    add         x5, x2, x3                  // row 1 = pu1_dst + dst_strd
    ld2         {v20.8b, v21.8b}, [x6]      // the whole 16-byte source row
    add         x8, x5, x3                  // row 2
    add         x10, x8, x3                 // row 3
    lsl         x11, x3, #2                 // 4*dst_strd

    st2         {v20.8b, v21.8b}, [x2], x11 // rows 0..3, then step to rows 4..7
    st2         {v20.8b, v21.8b}, [x5], x11
    st2         {v20.8b, v21.8b}, [x8], x11
    st2         {v20.8b, v21.8b}, [x10], x11

    st2         {v20.8b, v21.8b}, [x2]      // rows 4..7
    st2         {v20.8b, v21.8b}, [x5]
    st2         {v20.8b, v21.8b}, [x8]
    st2         {v20.8b, v21.8b}, [x10]

    b           end_func                    // this path is only entered with nt == 8

blk_4:
    ld1         {v0.8b}, [x6]               // the whole 8-byte source row
    add         x5, x2, x3                  // row 1 = pu1_dst + dst_strd

    st1         {v0.8b}, [x2]               // row 0
    add         x8, x5, x3                  // row 2
    st1         {v0.8b}, [x5]               // row 1
    add         x10, x8, x3                 // row 3
    st1         {v0.8b}, [x8]               // row 2
    st1         {v0.8b}, [x10]              // row 3

end_func:
    pop_v_regs
    ret