1//****************************************************************************** 2//* 3//* Copyright (C) 2015 The Android Open Source Project 4//* 5//* Licensed under the Apache License, Version 2.0 (the "License"); 6//* you may not use this file except in compliance with the License. 7//* You may obtain a copy of the License at: 8//* 9//* http://www.apache.org/licenses/LICENSE-2.0 10//* 11//* Unless required by applicable law or agreed to in writing, software 12//* distributed under the License is distributed on an "AS IS" BASIS, 13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14//* See the License for the specific language governing permissions and 15//* limitations under the License. 16//* 17//***************************************************************************** 18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19//*/ 20///** 21///** 22//******************************************************************************* 23//* 24//* @brief 25//* Interprediction luma function for copy 26//* 27//* @par Description: 28//* Copies the array of width 'wd' and height 'ht' from the location pointed 29//* by 'src' to the location pointed by 'dst' 30//* 31//* @param[in] pu1_src 32//* UWORD8 pointer to the source 33//* 34//* @param[out] pu1_dst 35//* UWORD8 pointer to the destination 36//* 37//* @param[in] src_strd 38//* integer source stride 39//* 40//* @param[in] dst_strd 41//* integer destination stride 42//* 43//* 44//* @param[in] ht 45//* integer height of the array 46//* 47//* @param[in] wd 48//* integer width of the array 49//* 50//* @returns 51//* 52//* @remarks 53//* None 54//* 55//******************************************************************************* 56//*/ 57//void ih264_inter_pred_luma_copy ( 58// UWORD8 *pu1_src, 59// UWORD8 *pu1_dst, 60// WORD32 src_strd, 61// WORD32 dst_strd, 62// WORD32 ht, 63// WORD32 wd ) 64 65//**************Variables Vs 
//Registers*****************************************
//    x0  => *pu1_src
//    x1  => *pu1_dst
//    x2  => src_strd   (sign-extended from w2 on entry)
//    x3  => dst_strd   (sign-extended from w3 on entry)
//    x7  => ht         (sign-extended from w4 on entry)
//    x12 => wd         (sign-extended from w5 on entry)
//
// Scratch registers:
//    x4  => columns still to copy in the current band of four rows
//    x5  => pu1_src_tmp (walks rows 1..3 of the band, ends on the next band)
//    x6  => pu1_dst_tmp (same, for the destination)
//    x11 => wd - column step, used to rewind to the start of the next band
//    v0-v6 hold the pixel data in flight
//
// Behaviour visible from the code below:
//    * Every outer iteration copies four rows, so ht is effectively rounded
//      up to the next multiple of 4.
//    * wd selects the column step: 16 bytes when wd % 16 == 0, else 8 bytes
//      when wd % 8 == 0, else 4 bytes. A wd that is not a multiple of 4 is
//      not handled correctly by the 4-byte path.
//    * Nothing is read or written when ht <= 0 or wd <= 0.

.text
.p2align 2
.include "ih264_neon_macros.s"



    .global ih264_inter_pred_luma_copy_av8

ih264_inter_pred_luma_copy_av8:

    push_v_regs
    stp       x19, x20, [sp, #-16]!

    // All four integer arguments are WORD32. AAPCS64 leaves bits [63:32] of
    // such an argument unspecified, and the code below uses the full X
    // registers as address offsets and loop counters, so widen them first.
    sxtw      x2, w2                    // src_strd
    sxtw      x3, w3                    // dst_strd
    sxtw      x7, w4                    // ht
    sxtw      x12, w5                   // wd

    cmp       x7, #0                    // ht <= 0: nothing to do
    ble       end_loops
    cmp       x12, #0                   // wd <= 0: nothing to do
    ble       end_loops

    tst       x12, #15                  // wd multiple of 16?
    beq       core_loop_wd_16
    tst       x12, #7                   // wd multiple of 8?
    beq       core_loop_wd_8

    // ---------------------------------------------------------------------
    // 4-byte column step
    // ---------------------------------------------------------------------
    sub       x11, x12, #4              // rewind amount = wd - 4

outer_loop_wd_4:
    mov       x4, x12                   // columns left in this band (wd > 0 here)

inner_loop_wd_4:
    ld1       {v0.s}[0], [x0]           // row 0: load 4 bytes
    add       x5, x0, x2                // pu1_src_tmp = pu1_src + src_strd
    add       x6, x1, x3                // pu1_dst_tmp = pu1_dst + dst_strd
    st1       {v0.s}[0], [x1]           // row 0: store 4 bytes
    ld1       {v0.s}[0], [x5], x2       // row 1: load, pu1_src_tmp += src_strd
    add       x0, x0, #4                // pu1_src += 4
    st1       {v0.s}[0], [x6], x3       // row 1: store, pu1_dst_tmp += dst_strd
    ld1       {v0.s}[0], [x5], x2       // row 2
    subs      x4, x4, #4                // columns left -= 4 (sets loop flags)
    st1       {v0.s}[0], [x6], x3
    ld1       {v0.s}[0], [x5], x2       // row 3
    add       x1, x1, #4                // pu1_dst += 4
    st1       {v0.s}[0], [x6], x3

    bgt       inner_loop_wd_4           // flags still come from the subs above

end_inner_loop_wd_4:
    subs      x7, x7, #4                // ht -= 4
    sub       x0, x5, x11               // pu1_src = start of the next 4-row band
    sub       x1, x6, x11               // pu1_dst = start of the next 4-row band
    bgt       outer_loop_wd_4

end_loops:
    // Common exit: undo the prologue in reverse order.
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret


    // ---------------------------------------------------------------------
    // 8-byte column step
    // ---------------------------------------------------------------------
core_loop_wd_8:
    sub       x11, x12, #8              // rewind amount = wd - 8

outer_loop_wd_8:
    mov       x4, x12                   // columns left in this band (wd > 0 here)

inner_loop_wd_8:
    add       x5, x0, x2                // pu1_src_tmp = pu1_src + src_strd
    ld1       {v0.8b}, [x0], #8         // row 0: load 8 bytes, pu1_src += 8
    add       x6, x1, x3                // pu1_dst_tmp = pu1_dst + dst_strd
    st1       {v0.8b}, [x1], #8         // row 0: store 8 bytes, pu1_dst += 8
    ld1       {v1.8b}, [x5], x2         // row 1
    st1       {v1.8b}, [x6], x3
    subs      x4, x4, #8                // columns left -= 8 (sets loop flags)
    ld1       {v2.8b}, [x5], x2         // row 2
    st1       {v2.8b}, [x6], x3
    ld1       {v3.8b}, [x5], x2         // row 3
    st1       {v3.8b}, [x6], x3
    bgt       inner_loop_wd_8

end_inner_loop_wd_8:
    subs      x7, x7, #4                // ht -= 4
    sub       x0, x5, x11               // pu1_src = start of the next 4-row band
    sub       x1, x6, x11               // pu1_dst = start of the next 4-row band
    bgt       outer_loop_wd_8
    b         end_loops


    // ---------------------------------------------------------------------
    // 16-byte column step
    // ---------------------------------------------------------------------
core_loop_wd_16:
    sub       x11, x12, #16             // rewind amount = wd - 16

outer_loop_wd_16:
    mov       x4, x12                   // columns left in this band (wd > 0 here)

inner_loop_wd_16:
    add       x5, x0, x2                // pu1_src_tmp = pu1_src + src_strd
    ld1       {v0.16b}, [x0], #16       // row 0: load 16 bytes, pu1_src += 16
    add       x6, x1, x3                // pu1_dst_tmp = pu1_dst + dst_strd
    st1       {v0.16b}, [x1], #16       // row 0: store 16 bytes, pu1_dst += 16
    ld1       {v2.16b}, [x5], x2        // row 1
    st1       {v2.16b}, [x6], x3
    subs      x4, x4, #16               // columns left -= 16 (sets loop flags)
    ld1       {v4.16b}, [x5], x2        // row 2
    st1       {v4.16b}, [x6], x3
    ld1       {v6.16b}, [x5], x2        // row 3
    st1       {v6.16b}, [x6], x3
    bgt       inner_loop_wd_16

end_inner_loop_wd_16:
    subs      x7, x7, #4                // ht -= 4
    sub       x0, x5, x11               // pu1_src = start of the next 4-row band
    sub       x1, x6, x11               // pu1_dst = start of the next 4-row band
    bgt       outer_loop_wd_16
    b         end_loops


// /*
// ********************************************************************************
// *
// * @brief This function copies a 4x4 block to destination
// *
// * @par Description:
// *  Copies a 4x4 block to destination, where both src and dst are interleaved.
// *  Each of the four rows is 8 bytes wide; only the even-numbered bytes
// *  (byte 0, 2, 4, 6 of the row) are taken from the source, the odd-numbered
// *  bytes of the destination are read back and written unchanged.
// *
// * @param[in] pi2_src
// *  Source
// *
// * @param[in] pu1_out
// *  Output pointer
// *
// * @param[in] pred_strd,
// *  Prediction buffer stride
// *
// * @param[in] out_strd
// *  Output buffer stride
// *
// * @returns none
// *
// * @remarks none
// *  Currently wd and ht are not used, i.e. a 4x4 block is always copied
// *
// *******************************************************************************
// */
// void ih264_interleave_copy(WORD16 *pi2_src,
//                            UWORD8 *pu1_out,
//                            WORD32 pred_strd,
//                            WORD32 out_strd,
//                            WORD32 wd,
//                            WORD32 ht)
// Register Usage
// x0         : pi2_src on entry; reused as the store pointer after the loads
// x1         : pu1_out (load pointer for the existing destination rows)
// x2         : pred_strd (sign-extended from w2 on entry)
// x3         : out_strd  (sign-extended from w3 on entry)
// v2  - v5   : four source rows (packed into v2 and v4)
// v18 - v21  : four destination rows (packed into v18 and v20)
// v30        : byte-select mask, 0x00ff in every 16-bit lane
// x4/x5 (wd/ht) are never read.

    .global ih264_interleave_copy_av8
ih264_interleave_copy_av8:
    push_v_regs

    // pred_strd and out_strd are WORD32. AAPCS64 leaves bits [63:32] of such
    // an argument unspecified, and both are used as 64-bit post-index
    // offsets below, so widen them first.
    sxtw      x2, w2
    sxtw      x3, w3

    // Load the four 8-byte source rows; rows 0/1 end up in v2, rows 2/3 in v4.
    ld1       {v2.8b}, [x0], x2
    ld1       {v3.8b}, [x0], x2
    mov       v2.d[1], v3.d[0]
    ld1       {v4.8b}, [x0], x2
    ld1       {v5.8b}, [x0], x2
    mov       v4.d[1], v5.d[0]

    mov       x0, x1                    // keep the row-0 output address for the stores

    // Load the four existing 8-byte output rows; rows 0/1 in v18, rows 2/3 in v20.
    ld1       {v18.8b}, [x1], x3
    ld1       {v19.8b}, [x1], x3
    mov       v18.d[1], v19.d[0]
    movi      v30.8h, #0x00ff           // mask: low byte of every 16-bit lane
    ld1       {v20.8b}, [x1], x3
    ld1       {v21.8b}, [x1], x3
    mov       v20.d[1], v21.d[0]

    // Where the mask is set take the source bits, elsewhere keep the output.
    bit       v18.16b, v2.16b, v30.16b
    bit       v20.16b, v4.16b, v30.16b

    // Write the four merged rows back.
    st1       {v18.8b}, [x0], x3        // row 0
    st1       {v18.d}[1], [x0], x3      // row 1
    st1       {v20.8b}, [x0], x3        // row 2
    st1       {v20.d}[1], [x0], x3      // row 3

    pop_v_regs
    ret

