@ ih264_inter_pred_chroma_a9q.s revision a2b49e5f0574dee76f81507f288143d83a4b7c1a
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@/**
@******************************************************************************
@* @file
@*  ih264_inter_pred_chroma_a9q.s
@*
@* @brief
@*  Contains function definitions for inter prediction interpolation.
@*
@* @author
@*  Ittiam
@*
@* @par List of Functions:
@*
@*  - ih264_inter_pred_chroma_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/

@/* All the functions here are replicated from ih264_inter_pred_filters.c
@

@/**
@*******************************************************************************
@*
@* @brief
@*    Interprediction chroma filter
@*
@* @par Description:
@*    Applies filtering to chroma samples as mentioned in
@*    sec 8.4.2.2.2 titled "chroma sample interpolation process"
@*
@*    Every output byte is the rounded, unsigned-saturated weighted sum
@*
@*        dst = (A*p00 + B*p01 + C*p10 + D*p11 + 32) >> 6
@*
@*    with  A = (8 - dx) * (8 - dy)      B = dx * (8 - dy)
@*          C = (8 - dx) * dy            D = dx * dy
@*
@*    p00 is the source byte at the output position, p01 the byte two
@*    positions to its right (the source holds alternate U and V samples, so
@*    this is the next sample of the same plane), and p10/p11 the same two
@*    positions in the following source row.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source containing alternate U and V samples
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] u1_dx
@*  dx value where the sample is to be produced (refer sec 8.4.2.2.2)
@*
@* @param[in] u1_dy
@*  dy value where the sample is to be produced (refer sec 8.4.2.2.2)
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  - wd == 2 and wd == 4 are dispatched to dedicated paths; every other
@*    width falls through to the 8-wide path.
@*  - Bytes written per destination row: 4 (wd == 2), 8 (wd == 4),
@*    16 (8-wide path).
@*  - Bytes read per source row: 8 (wd == 2), 16 (wd == 4), 24 (8-wide path);
@*    ht + 1 source rows are read in total.
@*  - The wd == 4 and 8-wide paths decrement the row counter once before
@*    their loop and test it only at the loop bottom, so they need ht >= 2.
@*    The wd == 2 path consumes two rows per pass and needs an even ht.
@*    NOTE(review): callers are presumably restricted to such block sizes -
@*    confirm before calling with other heights.
@*
@*******************************************************************************
@*/

@void ih264_inter_pred_chroma(UWORD8 *pu1_src,
@                             UWORD8 *pu1_dst,
@                             WORD32 src_strd,
@                             WORD32 dst_strd,
@                             UWORD8 u1_dx,
@                             UWORD8 u1_dy,
@                             WORD32 ht,
@                             WORD32 wd)
@**************Variables Vs Registers*****************************************
@   r0  => *pu1_src
@   r1  => *pu1_dst
@   r2  => src_strd
@   r3  => dst_strd
@   r4  => u1_dx
@   r5  => u1_dy
@   r6  => height (row counter)
@   r7  => width
@   r8  => 8 - u1_dx
@   r9  => 8 - u1_dy
@   r10, r11 => scratch for the weight products
@   d28 => A = (8 - dx) * (8 - dy), replicated to all 8 byte lanes
@   d29 => B = dx * (8 - dy)
@   d30 => C = (8 - dx) * dy
@   d31 => D = dx * dy
@
.text
.p2align 2

    .global ih264_inter_pred_chroma_a9q

ih264_inter_pred_chroma_a9q:

    stmfd         sp!, {r4-r12, r14}    @ save core registers (10 words = 40 bytes)
    vstmdb        sp!, {d8-d15}         @ save NEON registers d8-d15 (64 bytes)

    @ The first four arguments arrive in r0-r3; the remaining four sit on the
    @ stack just above the 40 + 64 = 104 bytes pushed above.
    ldr           r4, [sp, #104]        @ u1_dx
    ldr           r5, [sp, #108]        @ u1_dy
    ldr           r6, [sp, #112]        @ ht
    ldr           r7, [sp, #116]        @ wd

    rsb           r8, r4, #8            @ 8 - u1_dx
    rsb           r9, r5, #8            @ 8 - u1_dy
    mul           r10, r8, r9           @ A = (8 - dx) * (8 - dy)
    mul           r11, r4, r9           @ B = dx * (8 - dy)

    vdup.u8       d28, r10
    vdup.u8       d29, r11

    mul           r10, r8, r5           @ C = (8 - dx) * dy
    mul           r11, r4, r5           @ D = dx * dy

    vdup.u8       d30, r10
    vdup.u8       d31, r11

    cmp           r7, #2                @ if wd == 2 branch to loop_2
    beq           loop_2
    cmp           r7, #4                @ if wd == 4 branch to loop_4
    beq           loop_4
                                        @ otherwise fall through to loop_8

@ ---------------------------------------------------------------------------
@ 8-wide path: 16 output bytes per row.
@ q5 accumulates output bytes 0-7 and q6 bytes 8-15 of the row in flight.
@   d0/d1 = upper source row, bytes 0-7 / 8-15    d3/d4 = same, shifted by 2
@   d5/d6 = lower source row, bytes 0-7 / 8-15    d8/d9 = same, shifted by 2
@ The loop is software pipelined: on entry to inner_loop_8, q5 already holds
@ the sums for bytes 0-7 of the row about to be stored.
@ ---------------------------------------------------------------------------
loop_8:
    sub           r6, #1                @ the last row is finished after the loop
    vld1.8        {d0, d1, d2}, [r0], r2 @ Load row0
    vld1.8        {d5, d6, d7}, [r0], r2 @ Load row1
    vext.8        d3, d0, d1, #2        @ row0 bytes 2-9
    vext.8        d8, d5, d6, #2        @ row1 bytes 2-9

    vmull.u8      q5, d0, d28
    vmlal.u8      q5, d5, d30
    vmlal.u8      q5, d3, d29
    vmlal.u8      q5, d8, d31
    vext.8        d9, d6, d7, #2        @ row1 bytes 10-17
    vext.8        d4, d1, d2, #2        @ row0 bytes 10-17

inner_loop_8:
    vmull.u8      q6, d6, d30
    vmlal.u8      q6, d1, d28
    vmlal.u8      q6, d9, d31
    vmlal.u8      q6, d4, d29
    vmov          d0, d5                @ lower row becomes the upper row
    vmov          d3, d8

    vqrshrun.s16  d14, q5, #6           @ (sum + 32) >> 6, saturated to u8
    vmov          d1, d6
    vmov          d4, d9

    vld1.8        {d5, d6, d7}, [r0], r2 @ Load next lower row
    vqrshrun.s16  d15, q6, #6

    vext.8        d8, d5, d6, #2
    subs          r6, #1                @ flags are consumed by the bne below
    vext.8        d9, d6, d7, #2
    vst1.8        {q7}, [r1], r3        @ Store dest row (q7 = d14:d15)

    vmull.u8      q5, d0, d28           @ start bytes 0-7 of the next row
    vmlal.u8      q5, d5, d30
    vmlal.u8      q5, d3, d29
    vmlal.u8      q5, d8, d31
    bne           inner_loop_8

    @ Drain the pipeline: finish and store the final row.
    vmull.u8      q6, d6, d30
    vmlal.u8      q6, d1, d28
    vmlal.u8      q6, d9, d31
    vmlal.u8      q6, d4, d29

    vqrshrun.s16  d14, q5, #6
    vqrshrun.s16  d15, q6, #6

    vst1.8        {q7}, [r1], r3        @ Store dest row

    b             end_func

@ ---------------------------------------------------------------------------
@ wd == 4 path: 8 output bytes per row, accumulated in q2.
@   d0 = upper source row, d1 = upper row shifted by 2
@   d2 = lower source row, d3 = lower row shifted by 2
@ Pipelined like the 8-wide path: q2 is ready on entry to inner_loop_4.
@ ---------------------------------------------------------------------------
loop_4:
    sub           r6, #1                @ the last row is finished after the loop
    vld1.8        {d0, d1}, [r0], r2    @ Load row0
    vld1.8        {d2, d3}, [r0], r2    @ Load row1
    vext.8        d1, d0, d1, #2        @ row0 bytes 2-9
    vext.8        d3, d2, d3, #2        @ row1 bytes 2-9

    vmull.u8      q2, d2, d30
    vmlal.u8      q2, d0, d28
    vmlal.u8      q2, d3, d31
    vmlal.u8      q2, d1, d29

inner_loop_4:
    subs          r6, #1                @ flags are consumed by the bne below
    vmov          d0, d2                @ lower row becomes the upper row
    vmov          d1, d3

    vld1.8        {d2, d3}, [r0], r2    @ Load next lower row
    vqrshrun.s16  d6, q2, #6            @ (sum + 32) >> 6, saturated to u8

    vext.8        d3, d2, d3, #2
    vst1.8        {d6}, [r1], r3        @ Store dest row

    vmull.u8      q2, d0, d28           @ start the next row
    vmlal.u8      q2, d2, d30
    vmlal.u8      q2, d1, d29
    vmlal.u8      q2, d3, d31
    bne           inner_loop_4

    vqrshrun.s16  d6, q2, #6            @ drain: finish and store the final row
    vst1.8        {d6}, [r1], r3        @ Store dest row

    b             end_func

@ ---------------------------------------------------------------------------
@ wd == 2 path: 4 output bytes per row, two rows per pass.
@ Row2 is loaded WITHOUT post-increment so that r0 still points at it when
@ the loop repeats and it becomes row0 of the next pass.
@ ---------------------------------------------------------------------------
loop_2:
    vld1.8        {d0}, [r0], r2        @ Load row0
    vext.8        d1, d0, d0, #2        @ row0 rotated by 2 (lanes 0-5 = bytes 2-7)
    vld1.8        {d2}, [r0], r2        @ Load row1
    vext.8        d3, d2, d2, #2
    vmull.u8      q2, d0, d28
    vmlal.u8      q2, d1, d29
    vmlal.u8      q2, d2, d30
    vmlal.u8      q2, d3, d31
    vld1.8        {d6}, [r0]            @ Load row2
    vqrshrun.s16  d4, q2, #6
    vext.8        d7, d6, d6, #2
    vst1.32       d4[0], [r1], r3       @ Store dest row0 (low 4 bytes only)
    vmull.u8      q4, d2, d28
    vmlal.u8      q4, d3, d29
    vmlal.u8      q4, d6, d30
    vmlal.u8      q4, d7, d31
    subs          r6, #2                @ two rows done
    vqrshrun.s16  d8, q4, #6
    vst1.32       d8[0], [r1], r3       @ Store dest row1 (low 4 bytes only)
    bne           loop_2                @ repeat while rows remain (e.g. ht = 4)

end_func:
    vldmia        sp!, {d8-d15}         @ Restore neon registers that were saved
    ldmfd         sp!, {r4-r12, pc}     @ Restore core registers and return