armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

        EXPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe

        IF CortexA8

;//-------------------------------------------------------------------------
;// armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
;//
;// Vertical half-sample luma interpolation of a 4x4 block using the H.264
;// 6-tap filter (1, -5, 20, 20, -5, 1).  Nine consecutive source rows
;// (a..i = dSrc0..dSrc8) are read; each output row r (0..3) is
;//
;//     acc = src[r] + src[r+5]                      ;// a + f
;//         - 5  * (src[r+1] + src[r+4])             ;// b + e
;//         + 20 * (src[r+2] + src[r+3])             ;// c + d
;//
;// then rounded, shifted right by 5 and saturated to unsigned 8-bit
;// (VQRSHRUN), leaving the four result rows in d0, d2, d4, d6 for the
;// caller to pick up.  pDst/dstStep are declared but not used here — no
;// store is performed by this routine.
;//
;// NOTE(review): dCoeff5 (d30) and dCoeff20 (d31) are consumed but never
;// loaded in this block; the "_unsafe" contract evidently requires the
;// caller to pre-load them (presumably with 5 and 20) — confirm against
;// the calling wrapper.
;//
;// Loads alternate between pSrc (rows 0-4) and Temp (rows 5-8) and are
;// interleaved with the arithmetic; the ";// TeRi" / stall comments mark
;// the original Cortex-A8 dual-issue scheduling, which is why each VMLA
;// is split into a VMUL plus a deferred VADD.  Do not reorder.
;//-------------------------------------------------------------------------

        M_START armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe, r11

;// Declare input registers
pSrc            RN 0                    ;// source pointer (top of the 9-row column)
srcStep         RN 1                    ;// source row stride in bytes
pDst            RN 2                    ;// unused in this routine
dstStep         RN 3                    ;// unused in this routine

Temp            RN 12                   ;// second load pointer = pSrc + 4 rows

;// Declare Neon registers
dCoeff5         DN 30.S16               ;// pre-loaded by caller (see NOTE above)
dCoeff20        DN 31.S16               ;// pre-loaded by caller (see NOTE above)

dSrc0           DN 7.U8                 ;// source rows a..i
dSrc1           DN 8.U8
dSrc2           DN 9.U8
dSrc3           DN 10.U8
dSrc4           DN 11.U8
dSrc5           DN 12.U8
dSrc6           DN 13.U8
dSrc7           DN 14.U8
dSrc8           DN 15.U8

qSumBE01        QN 8.S16                ;// widened b+e sums
qSumCD01        QN 9.S16                ;// widened c+d sums
dSumBE0         DN 16.S16
dSumCD0         DN 18.S16

qAcc01          QN 0.S16                ;// per-row 16-bit accumulators
qAcc23          QN 1.S16
qAcc45          QN 2.S16
qAcc67          QN 3.S16

dRes0           DN 0.S16                ;// low halves of the accumulators
dRes1           DN 2.S16
dRes2           DN 4.S16
dRes3           DN 6.S16

dAcc0           DN 0.U8                 ;// final saturated 8-bit rows
dAcc1           DN 2.U8
dAcc2           DN 4.U8
dAcc3           DN 6.U8


dTmp0           DN 20.S16               ;// deferred 20*(c+d) partial products
dTmp1           DN 21.S16
dTmp2           DN 22.S16
dTmp3           DN 23.S16


        VLD1        dSrc0, [pSrc], srcStep      ;// [a0 a1 a2 a3 .. ]
        ADD         Temp, pSrc, srcStep, LSL #2 ;// Temp -> row 5 (pSrc already advanced one row)
        VLD1        dSrc1, [pSrc], srcStep      ;// [b0 b1 b2 b3 .. ]
        ;// One cycle stall
        VLD1        dSrc5, [Temp], srcStep
        ;// One cycle stall
        VLD1        dSrc2, [pSrc], srcStep      ;// [c0 c1 c2 c3 .. ]
        VADDL       qAcc01, dSrc0, dSrc5        ;// Acc = a+f
        VLD1        dSrc3, [pSrc], srcStep
        ;// One cycle stall
        VLD1        dSrc6, [Temp], srcStep      ;// TeRi

        VLD1        dSrc4, [pSrc], srcStep
        VLD1        dSrc7, [Temp], srcStep      ;// TeRi
        VADDL       qSumBE01, dSrc1, dSrc4      ;// b+e
        VADDL       qSumCD01, dSrc2, dSrc3      ;// c+d
        VLD1        dSrc8, [Temp], srcStep      ;// TeRi
        VMLS        dRes0, dSumBE0, dCoeff5     ;// Acc -= 5*(b+e)
;       VMLA        dRes0, dSumCD0, dCoeff20    ;// Acc += 20*(c+d)
        VMUL        dTmp0, dSumCD0, dCoeff20    ;// Tmp = 20*(c+d), added to Acc below

;       VLD1        dSrc6, [Temp], srcStep
        VADDL       qSumBE01, dSrc2, dSrc5      ;// b+e (row 1)
        VADDL       qSumCD01, dSrc3, dSrc4      ;// c+d (row 1)
        VADDL       qAcc23, dSrc1, dSrc6        ;// Acc = a+f (row 1)
        VMLS        dRes1, dSumBE0, dCoeff5     ;// Acc -= 5*(b+e)
;       VMLA        dRes1, dSumCD0, dCoeff20    ;// Acc += 20*(c+d)
        VMUL        dTmp1, dSumCD0, dCoeff20    ;// Tmp = 20*(c+d), added to Acc below

;       VLD1        dSrc7, [Temp], srcStep
        VADDL       qSumBE01, dSrc3, dSrc6      ;// b+e (row 2)
        VADDL       qSumCD01, dSrc4, dSrc5      ;// c+d (row 2)
        VADDL       qAcc45, dSrc2, dSrc7        ;// Acc = a+f (row 2)
        VMLS        dRes2, dSumBE0, dCoeff5     ;// Acc -= 5*(b+e)
;       VMLA        dRes2, dSumCD0, dCoeff20    ;// Acc += 20*(c+d)
        VMUL        dTmp2, dSumCD0, dCoeff20    ;// Tmp = 20*(c+d), added to Acc below

;       VLD1        dSrc8, [Temp], srcStep      ;// [i0 i1 i2 i3 .. ]
        VADDL       qSumBE01, dSrc4, dSrc7      ;// b+e (row 3)
        VADDL       qAcc67, dSrc3, dSrc8        ;// Acc = a+f (row 3)
        VADDL       qSumCD01, dSrc5, dSrc6      ;// c+d (row 3)
        VMLS        dRes3, dSumBE0, dCoeff5     ;// Acc -= 5*(b+e)
        VADD        dRes0, dRes0, dTmp0         ;// fold in deferred 20*(c+d) terms
        VADD        dRes1, dRes1, dTmp1
        VADD        dRes2, dRes2, dTmp2
        VMLA        dRes3, dSumCD0, dCoeff20    ;// Acc += 20*(c+d) (row 3 only: VMLA is safe here)
;       VMUL        dTmp3, dSumCD0, dCoeff20    ;// Acc += 20*(c+d)
;       VADD        dRes3, dRes3, dTmp3

        ;// Round, shift right by 5, saturate to unsigned 8-bit
        VQRSHRUN    dAcc0, qAcc01, #5
        VQRSHRUN    dAcc1, qAcc23, #5
        VQRSHRUN    dAcc2, qAcc45, #5
        VQRSHRUN    dAcc3, qAcc67, #5

        M_END

        ENDIF



        END
