//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_inter_pred_luma_horz_av8.s
//*
//* @brief
//*  Contains function definitions for inter prediction interpolation.
//*
//* @author
//*  Ittiam
//*
//* @par List of Functions:
//*
//*  - ih264_inter_pred_luma_horz_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

// All the functions here are replicated from ih264_inter_pred_filters.c

///**
//*******************************************************************************
//*
//* @brief
//*  Inter prediction luma filter for horizontal input
//*
//* @par Description:
//*  Applies a 6 tap horizontal filter to every output pixel:
//*      a0 - 5*a1 + 20*a2 + 20*a3 - 5*a4 + a5
//*  where a0..a5 are six consecutive source bytes starting at pu1_src - 2.
//*  The sum is rounded ((sum + 16) >> 5) and clipped to 8 bits.
//*  Sec 8.4.2.2.1 titled "Luma sample interpolation process"
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @returns
//*  None
//*
//* @remarks
//*  Width dispatch: wd == 8 -> loop_8, wd == 4 -> loop_4, anything else is
//*  treated as wd == 16 (loop_16).
//*  Height handling, as implied by the loop-exit compares:
//*    loop_16 : 8 rows per pass, a second pass only when ht == 16
//*    loop_8  : exits after 4 rows when ht == 4, otherwise 8 rows per pass
//*              with a second pass only when ht == 16
//*    loop_4  : 4 rows per pass, a second pass only when ht == 8
//*  Each row load reads more bytes than the filter strictly needs
//*  (24 bytes for wd 16, 16 bytes for wd 8 and wd 4), and loop_8 has already
//*  loaded rows 4 and 5 when it exits for ht == 4.
//*
//*******************************************************************************
//*/

//void ih264_inter_pred_luma_horz (
//                            UWORD8 *pu1_src,
//                            UWORD8 *pu1_dst,
//                            WORD32 src_strd,
//                            WORD32 dst_strd,
//                            WORD32 ht,
//                            WORD32 wd   )

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src   (rewound by 2 bytes on entry, post-incremented per row)
//    x1 => *pu1_dst   (post-incremented per row)
//    w2 => src_strd   (sign-extended into x2)
//    w3 => dst_strd   (sign-extended into x3)
//    w4 => ht         (sign-extended into x4)
//    w5 => wd         (sign-extended into x5)
//    x14 => ht - 16, bumped once per pass of loop_16 / loop_8
//    x9, x12 => scratch destinations of the flag-setting SUBS compares
//    v0 => 8 x 5  (filter coefficient), v1 => 8 x 20 (filter coefficient)
//
// Comment shorthand used below:
//    rN     : N-th row of the current pass (row 0 is the first row loaded)
//    lo/hi  : output columns 0-7 / 8-15 of a 16-wide row
//    a0..a5 : the six filter taps of an output pixel

.text
.p2align 2

.include "ih264_neon_macros.s"



    .global ih264_inter_pred_luma_horz_av8

ih264_inter_pred_luma_horz_av8:

    // push_v_regs / pop_v_regs come from ih264_neon_macros.s, which is not
    // shown here; presumably they save/restore the callee-saved SIMD
    // registers (v8, v10 and v14 are written below) - confirm in that file.
    push_v_regs
    // x19/x20 are saved and restored although nothing below modifies them.
    stp       x19, x20, [sp, #-16]!
    sxtw      x2, w2                            // src_strd as 64-bit
    sxtw      x3, w3                            // dst_strd as 64-bit
    sxtw      x4, w4                            // ht as 64-bit
    sxtw      x5, w5                            // wd as 64-bit
    sub       x0, x0, #2                        // pu1_src - 2 : tap a0 of column 0
    sub       x14, x4, #16                      // ht - 16, used by the loop checks
    movi      v0.8b, #5                         // filter coeff 5
    subs      x12, x5, #8                       // wd == 8 ?
    movi      v1.8b, #20                        // filter coeff 20
    beq       loop_8

    subs      x12, x5, #4                       // wd == 4 ?
    beq       loop_4

loop_16:                                        // wd == 16 : 8 rows per pass
    // Processing row0 and row1
    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2   // load r0
    add       x14, x14, #1                      // pass counter for the loop check
    ext       v31.8b, v2.8b, v3.8b, #5          // r0.lo a5
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2   // load r1
    ext       v30.8b, v3.8b, v4.8b, #5          // r0.hi a5
    uaddl     v8.8h, v31.8b, v2.8b              // r0.lo  = a0 + a5
    ext       v28.8b, v5.8b, v6.8b, #5          // r1.lo a5
    uaddl     v10.8h, v30.8b, v3.8b             // r0.hi  = a0 + a5
    ext       v27.8b, v6.8b, v7.8b, #5          // r1.hi a5
    uaddl     v14.8h, v28.8b, v5.8b             // r1.lo  = a0 + a5
    ext       v31.8b, v2.8b, v3.8b, #2          // r0.lo a2
    uaddl     v16.8h, v27.8b, v6.8b             // r1.hi  = a0 + a5
    ext       v30.8b, v3.8b, v4.8b, #2          // r0.hi a2
    umlal     v8.8h, v31.8b, v1.8b              // r0.lo += 20*a2
    ext       v28.8b, v5.8b, v6.8b, #2          // r1.lo a2
    umlal     v10.8h, v30.8b, v1.8b             // r0.hi += 20*a2
    ext       v27.8b, v6.8b, v7.8b, #2          // r1.hi a2
    umlal     v14.8h, v28.8b, v1.8b             // r1.lo += 20*a2
    ext       v31.8b, v2.8b, v3.8b, #3          // r0.lo a3
    umlal     v16.8h, v27.8b, v1.8b             // r1.hi += 20*a2
    ext       v30.8b, v3.8b, v4.8b, #3          // r0.hi a3
    umlal     v8.8h, v31.8b, v1.8b              // r0.lo += 20*a3
    ext       v28.8b, v5.8b, v6.8b, #3          // r1.lo a3
    umlal     v10.8h, v30.8b, v1.8b             // r0.hi += 20*a3
    ext       v27.8b, v6.8b, v7.8b, #3          // r1.hi a3
    umlal     v14.8h, v28.8b, v1.8b             // r1.lo += 20*a3
    ext       v31.8b, v2.8b, v3.8b, #1          // r0.lo a1
    umlal     v16.8h, v27.8b, v1.8b             // r1.hi += 20*a3
    ext       v30.8b, v3.8b, v4.8b, #1          // r0.hi a1
    umlsl     v8.8h, v31.8b, v0.8b              // r0.lo -= 5*a1
    ext       v28.8b, v5.8b, v6.8b, #1          // r1.lo a1
    umlsl     v10.8h, v30.8b, v0.8b             // r0.hi -= 5*a1
    ext       v27.8b, v6.8b, v7.8b, #1          // r1.hi a1
    umlsl     v14.8h, v28.8b, v0.8b             // r1.lo -= 5*a1
    ext       v31.8b, v2.8b, v3.8b, #4          // r0.lo a4
    umlsl     v16.8h, v27.8b, v0.8b             // r1.hi -= 5*a1
    ext       v30.8b, v3.8b, v4.8b, #4          // r0.hi a4
    umlsl     v8.8h, v31.8b, v0.8b              // r0.lo -= 5*a4
    ext       v28.8b, v5.8b, v6.8b, #4          // r1.lo a4
    umlsl     v10.8h, v30.8b, v0.8b             // r0.hi -= 5*a4
    ext       v27.8b, v6.8b, v7.8b, #4          // r1.hi a4
    umlsl     v14.8h, v28.8b, v0.8b             // r1.lo -= 5*a4
    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2   // load r2
    umlsl     v16.8h, v27.8b, v0.8b             // r1.hi -= 5*a4

    sqrshrun  v20.8b, v8.8h, #5                 // r0.lo : (sum + 16) >> 5, clipped
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2   // load r3
    sqrshrun  v21.8b, v10.8h, #5                // r0.hi : (sum + 16) >> 5, clipped
    ext       v31.8b, v2.8b, v3.8b, #5          // r2.lo a5
    st1       {v20.8b, v21.8b}, [x1], x3        // store dest r0
    sqrshrun  v23.8b, v14.8h, #5                // r1.lo : (sum + 16) >> 5, clipped
    ext       v30.8b, v3.8b, v4.8b, #5          // r2.hi a5
    sqrshrun  v24.8b, v16.8h, #5                // r1.hi : (sum + 16) >> 5, clipped

    // Processing row2 and row3
    ext       v28.8b, v5.8b, v6.8b, #5          // r3.lo a5
    uaddl     v8.8h, v31.8b, v2.8b              // r2.lo  = a0 + a5
    st1       {v23.8b, v24.8b}, [x1], x3        // store dest r1
    uaddl     v10.8h, v30.8b, v3.8b             // r2.hi  = a0 + a5
    ext       v27.8b, v6.8b, v7.8b, #5          // r3.hi a5
    uaddl     v14.8h, v28.8b, v5.8b             // r3.lo  = a0 + a5
    ext       v31.8b, v2.8b, v3.8b, #2          // r2.lo a2
    uaddl     v16.8h, v27.8b, v6.8b             // r3.hi  = a0 + a5
    ext       v30.8b, v3.8b, v4.8b, #2          // r2.hi a2
    umlal     v8.8h, v31.8b, v1.8b              // r2.lo += 20*a2
    ext       v27.8b, v6.8b, v7.8b, #2          // r3.hi a2
    umlal     v10.8h, v30.8b, v1.8b             // r2.hi += 20*a2
    ext       v28.8b, v5.8b, v6.8b, #2          // r3.lo a2
    umlal     v14.8h, v28.8b, v1.8b             // r3.lo += 20*a2
    ext       v31.8b, v2.8b, v3.8b, #3          // r2.lo a3
    umlal     v16.8h, v27.8b, v1.8b             // r3.hi += 20*a2
    ext       v30.8b, v3.8b, v4.8b, #3          // r2.hi a3
    umlal     v8.8h, v31.8b, v1.8b              // r2.lo += 20*a3
    ext       v28.8b, v5.8b, v6.8b, #3          // r3.lo a3
    umlal     v10.8h, v30.8b, v1.8b             // r2.hi += 20*a3
    ext       v27.8b, v6.8b, v7.8b, #3          // r3.hi a3
    umlal     v14.8h, v28.8b, v1.8b             // r3.lo += 20*a3
    ext       v31.8b, v2.8b, v3.8b, #1          // r2.lo a1
    umlal     v16.8h, v27.8b, v1.8b             // r3.hi += 20*a3
    ext       v30.8b, v3.8b, v4.8b, #1          // r2.hi a1
    umlsl     v8.8h, v31.8b, v0.8b              // r2.lo -= 5*a1
    ext       v28.8b, v5.8b, v6.8b, #1          // r3.lo a1
    umlsl     v10.8h, v30.8b, v0.8b             // r2.hi -= 5*a1
    ext       v27.8b, v6.8b, v7.8b, #1          // r3.hi a1
    umlsl     v14.8h, v28.8b, v0.8b             // r3.lo -= 5*a1
    ext       v31.8b, v2.8b, v3.8b, #4          // r2.lo a4
    umlsl     v16.8h, v27.8b, v0.8b             // r3.hi -= 5*a1
    ext       v30.8b, v3.8b, v4.8b, #4          // r2.hi a4
    umlsl     v8.8h, v31.8b, v0.8b              // r2.lo -= 5*a4
    ext       v28.8b, v5.8b, v6.8b, #4          // r3.lo a4
    umlsl     v10.8h, v30.8b, v0.8b             // r2.hi -= 5*a4
    ext       v27.8b, v6.8b, v7.8b, #4          // r3.hi a4
    umlsl     v14.8h, v28.8b, v0.8b             // r3.lo -= 5*a4
    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2   // load r4
    umlsl     v16.8h, v27.8b, v0.8b             // r3.hi -= 5*a4

    sqrshrun  v20.8b, v8.8h, #5                 // r2.lo : (sum + 16) >> 5, clipped
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2   // load r5
    sqrshrun  v21.8b, v10.8h, #5                // r2.hi : (sum + 16) >> 5, clipped
    ext       v31.8b, v2.8b, v3.8b, #5          // r4.lo a5
    st1       {v20.8b, v21.8b}, [x1], x3        // store dest r2
    sqrshrun  v23.8b, v14.8h, #5                // r3.lo : (sum + 16) >> 5, clipped
    ext       v30.8b, v3.8b, v4.8b, #5          // r4.hi a5
    sqrshrun  v24.8b, v16.8h, #5                // r3.hi : (sum + 16) >> 5, clipped

    // Processing row4 and row5
    ext       v28.8b, v5.8b, v6.8b, #5          // r5.lo a5
    uaddl     v8.8h, v31.8b, v2.8b              // r4.lo  = a0 + a5
    st1       {v23.8b, v24.8b}, [x1], x3        // store dest r3
    uaddl     v10.8h, v30.8b, v3.8b             // r4.hi  = a0 + a5
    ext       v27.8b, v6.8b, v7.8b, #5          // r5.hi a5
    uaddl     v14.8h, v28.8b, v5.8b             // r5.lo  = a0 + a5
    ext       v31.8b, v2.8b, v3.8b, #2          // r4.lo a2
    uaddl     v16.8h, v27.8b, v6.8b             // r5.hi  = a0 + a5
    ext       v30.8b, v3.8b, v4.8b, #2          // r4.hi a2
    umlal     v8.8h, v31.8b, v1.8b              // r4.lo += 20*a2
    ext       v27.8b, v6.8b, v7.8b, #2          // r5.hi a2
    umlal     v10.8h, v30.8b, v1.8b             // r4.hi += 20*a2
    ext       v28.8b, v5.8b, v6.8b, #2          // r5.lo a2
    umlal     v14.8h, v28.8b, v1.8b             // r5.lo += 20*a2
    ext       v31.8b, v2.8b, v3.8b, #3          // r4.lo a3
    umlal     v16.8h, v27.8b, v1.8b             // r5.hi += 20*a2
    ext       v30.8b, v3.8b, v4.8b, #3          // r4.hi a3
    umlal     v8.8h, v31.8b, v1.8b              // r4.lo += 20*a3
    ext       v28.8b, v5.8b, v6.8b, #3          // r5.lo a3
    umlal     v10.8h, v30.8b, v1.8b             // r4.hi += 20*a3
    ext       v27.8b, v6.8b, v7.8b, #3          // r5.hi a3
    umlal     v14.8h, v28.8b, v1.8b             // r5.lo += 20*a3
    ext       v31.8b, v2.8b, v3.8b, #1          // r4.lo a1
    umlal     v16.8h, v27.8b, v1.8b             // r5.hi += 20*a3
    ext       v30.8b, v3.8b, v4.8b, #1          // r4.hi a1
    umlsl     v8.8h, v31.8b, v0.8b              // r4.lo -= 5*a1
    ext       v28.8b, v5.8b, v6.8b, #1          // r5.lo a1
    umlsl     v10.8h, v30.8b, v0.8b             // r4.hi -= 5*a1
    ext       v27.8b, v6.8b, v7.8b, #1          // r5.hi a1
    umlsl     v14.8h, v28.8b, v0.8b             // r5.lo -= 5*a1
    ext       v31.8b, v2.8b, v3.8b, #4          // r4.lo a4
    umlsl     v16.8h, v27.8b, v0.8b             // r5.hi -= 5*a1
    ext       v30.8b, v3.8b, v4.8b, #4          // r4.hi a4
    umlsl     v8.8h, v31.8b, v0.8b              // r4.lo -= 5*a4
    ext       v28.8b, v5.8b, v6.8b, #4          // r5.lo a4
    umlsl     v10.8h, v30.8b, v0.8b             // r4.hi -= 5*a4
    ext       v27.8b, v6.8b, v7.8b, #4          // r5.hi a4
    umlsl     v14.8h, v28.8b, v0.8b             // r5.lo -= 5*a4
    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2   // load r6
    umlsl     v16.8h, v27.8b, v0.8b             // r5.hi -= 5*a4

    sqrshrun  v20.8b, v8.8h, #5                 // r4.lo : (sum + 16) >> 5, clipped
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2   // load r7
    sqrshrun  v21.8b, v10.8h, #5                // r4.hi : (sum + 16) >> 5, clipped
    ext       v31.8b, v2.8b, v3.8b, #5          // r6.lo a5
    st1       {v20.8b, v21.8b}, [x1], x3        // store dest r4
    sqrshrun  v23.8b, v14.8h, #5                // r5.lo : (sum + 16) >> 5, clipped
    ext       v30.8b, v3.8b, v4.8b, #5          // r6.hi a5
    sqrshrun  v24.8b, v16.8h, #5                // r5.hi : (sum + 16) >> 5, clipped

    // Processing row6 and row7
    ext       v28.8b, v5.8b, v6.8b, #5          // r7.lo a5
    uaddl     v8.8h, v31.8b, v2.8b              // r6.lo  = a0 + a5
    st1       {v23.8b, v24.8b}, [x1], x3        // store dest r5
    uaddl     v10.8h, v30.8b, v3.8b             // r6.hi  = a0 + a5
    ext       v27.8b, v6.8b, v7.8b, #5          // r7.hi a5
    uaddl     v14.8h, v28.8b, v5.8b             // r7.lo  = a0 + a5
    ext       v31.8b, v2.8b, v3.8b, #2          // r6.lo a2
    uaddl     v16.8h, v27.8b, v6.8b             // r7.hi  = a0 + a5
    ext       v30.8b, v3.8b, v4.8b, #2          // r6.hi a2
    umlal     v8.8h, v31.8b, v1.8b              // r6.lo += 20*a2
    ext       v27.8b, v6.8b, v7.8b, #2          // r7.hi a2
    umlal     v10.8h, v30.8b, v1.8b             // r6.hi += 20*a2
    ext       v28.8b, v5.8b, v6.8b, #2          // r7.lo a2
    umlal     v14.8h, v28.8b, v1.8b             // r7.lo += 20*a2
    ext       v31.8b, v2.8b, v3.8b, #3          // r6.lo a3
    umlal     v16.8h, v27.8b, v1.8b             // r7.hi += 20*a2
    ext       v30.8b, v3.8b, v4.8b, #3          // r6.hi a3
    umlal     v8.8h, v31.8b, v1.8b              // r6.lo += 20*a3
    ext       v28.8b, v5.8b, v6.8b, #3          // r7.lo a3
    umlal     v10.8h, v30.8b, v1.8b             // r6.hi += 20*a3
    ext       v27.8b, v6.8b, v7.8b, #3          // r7.hi a3
    umlal     v14.8h, v28.8b, v1.8b             // r7.lo += 20*a3
    ext       v31.8b, v2.8b, v3.8b, #1          // r6.lo a1
    umlal     v16.8h, v27.8b, v1.8b             // r7.hi += 20*a3
    ext       v30.8b, v3.8b, v4.8b, #1          // r6.hi a1
    umlsl     v8.8h, v31.8b, v0.8b              // r6.lo -= 5*a1
    ext       v28.8b, v5.8b, v6.8b, #1          // r7.lo a1
    umlsl     v10.8h, v30.8b, v0.8b             // r6.hi -= 5*a1
    ext       v27.8b, v6.8b, v7.8b, #1          // r7.hi a1
    umlsl     v14.8h, v28.8b, v0.8b             // r7.lo -= 5*a1
    ext       v31.8b, v2.8b, v3.8b, #4          // r6.lo a4
    umlsl     v16.8h, v27.8b, v0.8b             // r7.hi -= 5*a1
    ext       v30.8b, v3.8b, v4.8b, #4          // r6.hi a4
    umlsl     v8.8h, v31.8b, v0.8b              // r6.lo -= 5*a4
    ext       v28.8b, v5.8b, v6.8b, #4          // r7.lo a4
    umlsl     v10.8h, v30.8b, v0.8b             // r6.hi -= 5*a4
    ext       v27.8b, v6.8b, v7.8b, #4          // r7.hi a4

    sqrshrun  v20.8b, v8.8h, #5                 // r6.lo : (sum + 16) >> 5, clipped
    umlsl     v14.8h, v28.8b, v0.8b             // r7.lo -= 5*a4
    sqrshrun  v21.8b, v10.8h, #5                // r6.hi : (sum + 16) >> 5, clipped
    umlsl     v16.8h, v27.8b, v0.8b             // r7.hi -= 5*a4
    sqrshrun  v23.8b, v14.8h, #5                // r7.lo : (sum + 16) >> 5, clipped
    st1       {v20.8b, v21.8b}, [x1], x3        // store dest r6
    sqrshrun  v24.8b, v16.8h, #5                // r7.hi : (sum + 16) >> 5, clipped
    subs      x12, x14, #1                      // zero only on the first pass of ht == 16
    st1       {v23.8b, v24.8b}, [x1], x3        // store dest r7

    beq       loop_16                           // second pass for ht == 16
    b         end_func



loop_8:                                         // wd == 8
    // Processing row0 and row1
    // The first row is loaded into v5/v6 and the second into v2/v3; results
    // are stored in load order (v14 -> r0, v8 -> r1).
    ld1       {v5.8b, v6.8b}, [x0], x2          // load r0
    add       x14, x14, #1                      // pass counter for the loop check
    ext       v28.8b, v5.8b, v6.8b, #5          // r0 a5
    ld1       {v2.8b, v3.8b}, [x0], x2          // load r1
    ext       v25.8b, v5.8b, v6.8b, #2          // r0 a2
    ext       v31.8b, v2.8b, v3.8b, #5          // r1 a5
    ext       v24.8b, v5.8b, v6.8b, #3          // r0 a3
    ext       v23.8b, v5.8b, v6.8b, #1          // r0 a1
    ext       v22.8b, v5.8b, v6.8b, #4          // r0 a4
    uaddl     v14.8h, v28.8b, v5.8b             // r0  = a0 + a5
    ext       v29.8b, v2.8b, v3.8b, #3          // r1 a3
    umlal     v14.8h, v25.8b, v1.8b             // r0 += 20*a2
    umlal     v14.8h, v24.8b, v1.8b             // r0 += 20*a3
    umlsl     v14.8h, v23.8b, v0.8b             // r0 -= 5*a1
    umlsl     v14.8h, v22.8b, v0.8b             // r0 -= 5*a4
    ext       v30.8b, v2.8b, v3.8b, #2          // r1 a2
    uaddl     v8.8h, v31.8b, v2.8b              // r1  = a0 + a5
    ext       v27.8b, v2.8b, v3.8b, #1          // r1 a1
    ext       v26.8b, v2.8b, v3.8b, #4          // r1 a4
    ld1       {v2.8b, v3.8b}, [x0], x2          // load r2
    umlal     v8.8h, v29.8b, v1.8b              // r1 += 20*a3
    umlal     v8.8h, v30.8b, v1.8b              // r1 += 20*a2
    umlsl     v8.8h, v27.8b, v0.8b              // r1 -= 5*a1
    umlsl     v8.8h, v26.8b, v0.8b              // r1 -= 5*a4
    ld1       {v5.8b, v6.8b}, [x0], x2          // load r3
    sqrshrun  v23.8b, v14.8h, #5                // r0 : (sum + 16) >> 5, clipped

    // Processing row2 and row3 (v2/v3 = r2, v5/v6 = r3)
    ext       v28.8b, v5.8b, v6.8b, #5          // r3 a5
    ext       v25.8b, v5.8b, v6.8b, #2          // r3 a2
    ext       v31.8b, v2.8b, v3.8b, #5          // r2 a5
    uaddl     v14.8h, v28.8b, v5.8b             // r3  = a0 + a5
    st1       {v23.8b}, [x1], x3                // store dest r0
    ext       v24.8b, v5.8b, v6.8b, #3          // r3 a3
    ext       v23.8b, v5.8b, v6.8b, #1          // r3 a1
    sqrshrun  v20.8b, v8.8h, #5                 // r1 : (sum + 16) >> 5, clipped
    ext       v22.8b, v5.8b, v6.8b, #4          // r3 a4
    ext       v29.8b, v2.8b, v3.8b, #3          // r2 a3
    umlal     v14.8h, v25.8b, v1.8b             // r3 += 20*a2
    umlal     v14.8h, v24.8b, v1.8b             // r3 += 20*a3
    umlsl     v14.8h, v23.8b, v0.8b             // r3 -= 5*a1
    umlsl     v14.8h, v22.8b, v0.8b             // r3 -= 5*a4
    st1       {v20.8b}, [x1], x3                // store dest r1
    uaddl     v8.8h, v31.8b, v2.8b              // r2  = a0 + a5
    ext       v30.8b, v2.8b, v3.8b, #2          // r2 a2
    ext       v27.8b, v2.8b, v3.8b, #1          // r2 a1
    ext       v26.8b, v2.8b, v3.8b, #4          // r2 a4
    ld1       {v2.8b, v3.8b}, [x0], x2          // load r4 (issued even when ht == 4)
    umlal     v8.8h, v29.8b, v1.8b              // r2 += 20*a3
    umlal     v8.8h, v30.8b, v1.8b              // r2 += 20*a2
    umlsl     v8.8h, v27.8b, v0.8b              // r2 -= 5*a1
    umlsl     v8.8h, v26.8b, v0.8b              // r2 -= 5*a4
    ld1       {v5.8b, v6.8b}, [x0], x2          // load r5 (issued even when ht == 4)
    subs      x9, x4, #4                        // ht == 4 ? flags consumed by the beq below
    sqrshrun  v23.8b, v14.8h, #5                // r3 : (sum + 16) >> 5, clipped
    ext       v28.8b, v5.8b, v6.8b, #5          // r5 a5
    ext       v25.8b, v5.8b, v6.8b, #2          // r5 a2
    ext       v31.8b, v2.8b, v3.8b, #5          // r4 a5
    uaddl     v14.8h, v28.8b, v5.8b             // r5  = a0 + a5
    ext       v24.8b, v5.8b, v6.8b, #3          // r5 a3
    sqrshrun  v20.8b, v8.8h, #5                 // r2 : (sum + 16) >> 5, clipped
    ext       v22.8b, v5.8b, v6.8b, #4          // r5 a4
    ext       v29.8b, v2.8b, v3.8b, #3          // r4 a3
    st1       {v20.8b}, [x1], x3                // store dest r2
    ext       v30.8b, v2.8b, v3.8b, #2          // r4 a2
    uaddl     v8.8h, v31.8b, v2.8b              // r4  = a0 + a5
    st1       {v23.8b}, [x1], x3                // store dest r3
    beq       end_func                          // done if ht == 4

    // Processing row4 and row5
    ext       v23.8b, v5.8b, v6.8b, #1          // r5 a1
    umlal     v14.8h, v25.8b, v1.8b             // r5 += 20*a2
    umlal     v14.8h, v24.8b, v1.8b             // r5 += 20*a3
    umlsl     v14.8h, v23.8b, v0.8b             // r5 -= 5*a1
    umlsl     v14.8h, v22.8b, v0.8b             // r5 -= 5*a4
    ext       v27.8b, v2.8b, v3.8b, #1          // r4 a1
    ext       v26.8b, v2.8b, v3.8b, #4          // r4 a4
    ld1       {v2.8b, v3.8b}, [x0], x2          // load r6
    umlal     v8.8h, v29.8b, v1.8b              // r4 += 20*a3
    umlal     v8.8h, v30.8b, v1.8b              // r4 += 20*a2
    umlsl     v8.8h, v27.8b, v0.8b              // r4 -= 5*a1
    umlsl     v8.8h, v26.8b, v0.8b              // r4 -= 5*a4
    sqrshrun  v23.8b, v14.8h, #5                // r5 : (sum + 16) >> 5, clipped
    ld1       {v5.8b, v6.8b}, [x0], x2          // load r7
    ext       v31.8b, v2.8b, v3.8b, #5          // r6 a5
    ext       v28.8b, v5.8b, v6.8b, #5          // r7 a5
    ext       v25.8b, v5.8b, v6.8b, #2          // r7 a2
    uaddl     v14.8h, v28.8b, v5.8b             // r7  = a0 + a5
    ext       v24.8b, v5.8b, v6.8b, #3          // r7 a3
    ext       v22.8b, v5.8b, v6.8b, #4          // r7 a4
    sqrshrun  v20.8b, v8.8h, #5                 // r4 : (sum + 16) >> 5, clipped
    ext       v29.8b, v2.8b, v3.8b, #3          // r6 a3
    ext       v30.8b, v2.8b, v3.8b, #2          // r6 a2
    st1       {v20.8b}, [x1], x3                // store dest r4
    ext       v27.8b, v2.8b, v3.8b, #1          // r6 a1
    uaddl     v8.8h, v31.8b, v2.8b              // r6  = a0 + a5
    ext       v26.8b, v2.8b, v3.8b, #4          // r6 a4
    umlal     v8.8h, v29.8b, v1.8b              // r6 += 20*a3
    umlal     v8.8h, v30.8b, v1.8b              // r6 += 20*a2
    umlsl     v8.8h, v27.8b, v0.8b              // r6 -= 5*a1
    umlsl     v8.8h, v26.8b, v0.8b              // r6 -= 5*a4
    // Processing row6 and row7
    st1       {v23.8b}, [x1], x3                // store dest r5
    ext       v23.8b, v5.8b, v6.8b, #1          // r7 a1
    umlal     v14.8h, v25.8b, v1.8b             // r7 += 20*a2
    umlal     v14.8h, v24.8b, v1.8b             // r7 += 20*a3
    umlsl     v14.8h, v23.8b, v0.8b             // r7 -= 5*a1
    umlsl     v14.8h, v22.8b, v0.8b             // r7 -= 5*a4
    sqrshrun  v20.8b, v8.8h, #5                 // r6 : (sum + 16) >> 5, clipped
    subs      x12, x14, #1                      // zero only on the first pass of ht == 16
    st1       {v20.8b}, [x1], x3                // store dest r6
    sqrshrun  v23.8b, v14.8h, #5                // r7 : (sum + 16) >> 5, clipped
    st1       {v23.8b}, [x1], x3                // store dest r7

    beq       loop_8                            // second pass for ht == 16

    b         end_func

loop_4:                                         // wd == 4 : 4 rows per pass
    // As in loop_8, the first row goes to v5/v6 and the second to v2/v3.
    // All 8 lanes are filtered; only the low 4 result bytes are stored.
    ld1       {v5.8b, v6.8b}, [x0], x2          // load r0
    ext       v28.8b, v5.8b, v6.8b, #5          // r0 a5
    ld1       {v2.8b, v3.8b}, [x0], x2          // load r1
    ext       v25.8b, v5.8b, v6.8b, #2          // r0 a2
    ext       v31.8b, v2.8b, v3.8b, #5          // r1 a5
    uaddl     v14.8h, v28.8b, v5.8b             // r0  = a0 + a5
    ext       v24.8b, v5.8b, v6.8b, #3          // r0 a3
    ext       v23.8b, v5.8b, v6.8b, #1          // r0 a1
    ext       v22.8b, v5.8b, v6.8b, #4          // r0 a4
    ext       v29.8b, v2.8b, v3.8b, #3          // r1 a3
    umlal     v14.8h, v25.8b, v1.8b             // r0 += 20*a2
    umlal     v14.8h, v24.8b, v1.8b             // r0 += 20*a3
    umlsl     v14.8h, v23.8b, v0.8b             // r0 -= 5*a1
    umlsl     v14.8h, v22.8b, v0.8b             // r0 -= 5*a4
    uaddl     v8.8h, v31.8b, v2.8b              // r1  = a0 + a5
    ext       v30.8b, v2.8b, v3.8b, #2          // r1 a2
    ext       v27.8b, v2.8b, v3.8b, #1          // r1 a1
    ext       v26.8b, v2.8b, v3.8b, #4          // r1 a4
    ld1       {v2.8b, v3.8b}, [x0], x2          // load r2
    umlal     v8.8h, v29.8b, v1.8b              // r1 += 20*a3
    umlal     v8.8h, v30.8b, v1.8b              // r1 += 20*a2
    umlsl     v8.8h, v27.8b, v0.8b              // r1 -= 5*a1
    umlsl     v8.8h, v26.8b, v0.8b              // r1 -= 5*a4
    ld1       {v5.8b, v6.8b}, [x0], x2          // load r3
    ext       v28.8b, v5.8b, v6.8b, #5          // r3 a5
    ext       v25.8b, v5.8b, v6.8b, #2          // r3 a2
    sqrshrun  v23.8b, v14.8h, #5                // r0 : (sum + 16) >> 5, clipped
    ext       v31.8b, v2.8b, v3.8b, #5          // r2 a5
    ext       v24.8b, v5.8b, v6.8b, #3          // r3 a3
    st1       {v23.s}[0], [x1], x3              // store dest r0 (4 bytes)
    ext       v23.8b, v5.8b, v6.8b, #1          // r3 a1
    ext       v22.8b, v5.8b, v6.8b, #4          // r3 a4
    ext       v29.8b, v2.8b, v3.8b, #3          // r2 a3
    sqrshrun  v20.8b, v8.8h, #5                 // r1 : (sum + 16) >> 5, clipped
    ext       v30.8b, v2.8b, v3.8b, #2          // r2 a2
    ext       v27.8b, v2.8b, v3.8b, #1          // r2 a1

    // Processing row2 and row3
    st1       {v20.s}[0], [x1], x3              // store dest r1 (4 bytes)
    uaddl     v14.8h, v28.8b, v5.8b             // r3  = a0 + a5
    ext       v26.8b, v2.8b, v3.8b, #4          // r2 a4
    umlal     v14.8h, v25.8b, v1.8b             // r3 += 20*a2
    umlal     v14.8h, v24.8b, v1.8b             // r3 += 20*a3
    umlsl     v14.8h, v23.8b, v0.8b             // r3 -= 5*a1
    umlsl     v14.8h, v22.8b, v0.8b             // r3 -= 5*a4
    uaddl     v8.8h, v31.8b, v2.8b              // r2  = a0 + a5
    umlal     v8.8h, v29.8b, v1.8b              // r2 += 20*a3
    umlal     v8.8h, v30.8b, v1.8b              // r2 += 20*a2
    umlsl     v8.8h, v27.8b, v0.8b              // r2 -= 5*a1
    umlsl     v8.8h, v26.8b, v0.8b              // r2 -= 5*a4
    sqrshrun  v23.8b, v14.8h, #5                // r3 : (sum + 16) >> 5, clipped
    sqrshrun  v20.8b, v8.8h, #5                 // r2 : (sum + 16) >> 5, clipped
    st1       {v20.s}[0], [x1], x3              // store dest r2 (4 bytes)
    subs      x4, x4, #8                        // zero only on the first pass of ht == 8
    st1       {v23.s}[0], [x1], x3              // store dest r3 (4 bytes)
    beq       loop_4                            // second pass for ht == 8

end_func:
    // Undo the prologue in reverse order.
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret