armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        EXPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe

        M_VARIANTS CortexA8

    IF CortexA8

;// armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
;//
;// Half-pel "diagonal" (center) luma interpolation for a 4x4 block:
;//   pass 1: the H.264 6-tap filter [1,-5,20,20,-5,1] is applied
;//           HORIZONTALLY to 9 consecutive source rows (Row0..Row8),
;//           producing 16-bit intermediates dRes0..dRes8;
;//   pass 2: the same 6-tap filter is applied VERTICALLY (Col0..Col3)
;//           on those intermediates, then the 32-bit accumulators are
;//           rounded/narrowed with a shift of 10 (two passes of gain 32
;//           => divide by 1024) and saturated to U8.
;//
;// Inputs:  pSrc/srcStep — source pixels; 9 rows are loaded, 8 bytes each
;//          (only the first 4 filtered outputs per row are used for the
;//          4-wide result).
;// Outputs: the 4 result rows are LEFT IN REGISTERS dAcc0..dAcc3
;//          (d0, d2, d4, d6); no store is performed here — presumably
;//          the "_unsafe" caller stores via pDst/dstStep, which are
;//          declared but unused in this routine (TODO confirm caller).
;//
;// Scheduling note: on this Cortex-A8 path each VMLS of the original
;// code is split into a VMUL plus a deferred VSUB (the original VMLS is
;// kept commented out), and the VSUB is interleaved with the next row's
;// work — apparently a NEON pipeline-scheduling workaround ("TeRi").
;//
;// NOTE(review): many DN/QN aliases overlap physically (e.g. qSrc01 is
;// q0 = d0:d1, reused later as dTempAcc0/dAcc0); the exact instruction
;// order is load-bearing.

        M_START armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe, r11

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2        ;// unused here (see header note)
dstStep         RN 3        ;// unused here (see header note)

;// Declare Neon registers
dCoeff5         DN 30.S16   ;// splatted constant 5
dCoeff20        DN 31.S16   ;// splatted constant 20
qCoeff5         QN 14.S32
qCoeff20        QN 15.S32

qSrc01          QN 0.U8     ;// one loaded source row (16 bytes)
dSrc0           DN 0.U8     ;// row bytes [a0 a1 a2 a3 ...]
dSrc1           DN 1.U8

;// Shifted views of the row for the 6 filter taps a..f
dSrcb           DN 4.U8
dSrcc           DN 2.U8
dSrcd           DN 3.U8
dSrce           DN 5.U8
dSrcf           DN 1.U8

qSrcb           QN 2.S16    ;// widened b+e
qSrcc           QN 1.S16    ;// widened c+d
dSrcB           DN 4.S16    ;// low half of qSrcb (first 4 lanes)
dSrcC           DN 2.S16    ;// low half of qSrcc (first 4 lanes)

;// Horizontal-pass accumulators, one per source row
qRes0           QN 5.S16
qRes1           QN 6.S16
qRes2           QN 7.S16
qRes3           QN 8.S16
qRes4           QN 9.S16
qRes5           QN 10.S16
qRes6           QN 11.S16
qRes7           QN 12.S16
qRes8           QN 13.S16

;// Low halves of qRes0..qRes8 — only 4 lanes are needed per row
dRes0           DN 10.S16
dRes1           DN 12.S16
dRes2           DN 14.S16
dRes3           DN 16.S16
dRes4           DN 18.S16
dRes5           DN 20.S16
dRes6           DN 22.S16
dRes7           DN 24.S16
dRes8           DN 26.S16

;// Vertical-pass 32-bit accumulators, one per output row
qAcc01          QN 5.S32
qAcc23          QN 6.S32
qAcc45          QN 2.S32
qAcc67          QN 3.S32
qSumBE          QN 0.S32    ;// vertical b+e
qSumCD          QN 1.S32    ;// vertical c+d

dTempAcc0       DN 0.U16
dTempAcc1       DN 2.U16
dTempAcc2       DN 4.U16
dTempAcc3       DN 6.U16

qTAcc0          QN 0.U16
qTAcc1          QN 1.U16
qTAcc2          QN 2.U16
qTAcc3          QN 3.U16

;// Final saturated U8 results: rows 0..3 in d0, d2, d4, d6
dAcc0           DN 0.U8
dAcc1           DN 2.U8
dAcc2           DN 4.U8
dAcc3           DN 6.U8

dTmp0           DN 8.S16    ;// holds deferred 5*(b+e) term
dTmp1           DN 9.S16
qTmp0           QN 4.S32

        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
        VMOV        dCoeff20, #20
        VMOV        dCoeff5, #5

        ;// Row0: horizontal 6-tap, taps a..f from byte offsets 0..5
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes0, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// prefetch next row
        VMLA        dRes0, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;       VMLS        dRes0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted below

        ;// Row1
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes1, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// prefetch next row

        VSUB        dRes0, dRes0, dTmp0         ;// deferred Acc -= 5*(b+e) (TeRi)

        VMLA        dRes1, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;       VMLS        dRes1, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted below

        ;// Row2
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes2, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// prefetch next row

        VSUB        dRes1, dRes1, dTmp0         ;// deferred Acc -= 5*(b+e)

        VMLA        dRes2, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;       VMLS        dRes2, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted below

        ;// Row3
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes3, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// prefetch next row

        VSUB        dRes2, dRes2, dTmp0         ;// deferred Acc -= 5*(b+e)

        VMLA        dRes3, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;       VMLS        dRes3, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted below

        ;// Row4
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes4, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// prefetch next row

        VSUB        dRes3, dRes3, dTmp0         ;// deferred Acc -= 5*(b+e)

        VMLA        dRes4, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;       VMLS        dRes4, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted below

        ;// Row5
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes5, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// prefetch next row

        VSUB        dRes4, dRes4, dTmp0         ;// deferred Acc -= 5*(b+e)

        VMLA        dRes5, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;       VMLS        dRes5, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted below

        ;// Row6
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes6, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// prefetch next row

        VSUB        dRes5, dRes5, dTmp0         ;// deferred Acc -= 5*(b+e)

        VMLA        dRes6, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;       VMLS        dRes6, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted below

        ;// Row7
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes7, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// prefetch last row

        VSUB        dRes6, dRes6, dTmp0         ;// deferred Acc -= 5*(b+e)

        VMLA        dRes7, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;       VMLS        dRes7, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted below

        ;// Row8 (last row — no further load)
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes8, dSrc0, dSrcf         ;// Acc = a+f

        VSUB        dRes7, dRes7, dTmp0         ;// deferred Acc -= 5*(b+e)

        VMLA        dRes8, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;       VMLS        dRes8, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted below

        VMOV        qCoeff20, #20
        VMOV        qCoeff5, #5

        ;// Vertical pass: same 6-tap filter down the 9 intermediate rows.
        ;// Here a..f are dRes(i)..dRes(i+5) for output row i.

        ;// Col0 (output row 0)
        VADDL       qAcc01, dRes0, dRes5        ;// Acc = a+f
        VADDL       qSumCD, dRes2, dRes3        ;// c+d
        VADDL       qSumBE, dRes1, dRes4        ;// b+e

        VSUB        dRes8, dRes8, dTmp0         ;// finish Row8: Acc -= 5*(b+e)

        VMLA        qAcc01, qSumCD, qCoeff20    ;// Acc += 20*(c+d)
;       VMLS        qAcc01, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// 5*(b+e), subtracted below

        ;// Col1 (output row 1)
        VADDL       qAcc23, dRes1, dRes6        ;// Acc = a+f
        VADDL       qSumCD, dRes3, dRes4        ;// c+d
        VADDL       qSumBE, dRes2, dRes5        ;// b+e
        VMLA        qAcc23, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc01, qAcc01, qTmp0       ;// deferred Acc -= 5*(b+e)

;       VMLS        qAcc23, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// 5*(b+e), subtracted below

        ;// Col2 (output row 2)
        VADDL       qAcc45, dRes2, dRes7        ;// Acc = a+f
        VADDL       qSumCD, dRes4, dRes5        ;// c+d
        VADDL       qSumBE, dRes3, dRes6        ;// b+e
        VMLA        qAcc45, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc23, qAcc23, qTmp0       ;// deferred Acc -= 5*(b+e)

;       VMLS        qAcc45, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// 5*(b+e), subtracted below

        ;// Col3 (output row 3)
        VADDL       qAcc67, dRes3, dRes8        ;// Acc = a+f
        VADDL       qSumCD, dRes5, dRes6        ;// c+d
        VADDL       qSumBE, dRes4, dRes7        ;// b+e
        VMLA        qAcc67, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc45, qAcc45, qTmp0       ;// deferred Acc -= 5*(b+e)

        VMLS        qAcc67, qSumBE, qCoeff5     ;// Acc -= 5*(b+e) (no deferral needed on last col)

        ;// Round (add 512), shift right by 10, saturate S32 -> U16 ...
        VQRSHRUN    dTempAcc0, qAcc01, #10
        VQRSHRUN    dTempAcc1, qAcc23, #10
        VQRSHRUN    dTempAcc2, qAcc45, #10
        VQRSHRUN    dTempAcc3, qAcc67, #10

        ;// ... then saturate U16 -> U8; results in d0, d2, d4, d6
        VQMOVN      dAcc0, qTAcc0
        VQMOVN      dAcc1, qTAcc1
        VQMOVN      dAcc2, qTAcc2
        VQMOVN      dAcc3, qTAcc3

        M_END

    ENDIF



        END