;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Description:
;// Cortex-A8 NEON kernel computing the H.264 half-pel VERTICAL luma
;// interpolation for a 4x4 block using the standard 6-tap filter
;// (1, -5, 20, 20, -5, 1).  For each output row the unclipped sum is
;//   acc = (a + f) - 5*(b + e) + 20*(c + d)
;// over six vertically adjacent source rows, then VQRSHRUN performs the
;// rounding, divide-by-32 ((acc + 16) >> 5) and unsigned saturation to 8 bits.
;//
;// NOTE(review): "_unsafe" internal helper — it appears to rely on the caller
;// having preloaded dCoeff5 = 5 and dCoeff20 = 20 (register names suggest
;// this; not visible in this file — confirm against the caller).
;// pDst/dstStep are declared but never referenced here; presumably the four
;// result rows are handed back to the caller in dAcc0..dAcc3 for storing —
;// verify against the calling wrapper.

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

        EXPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe

    IF CortexA8

        M_START armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe, r11

;// Declare input registers
pSrc            RN 0        ;// source pointer (top row of the 9-row window)
srcStep         RN 1        ;// source row stride in bytes
pDst            RN 2        ;// destination pointer (unused in this fragment)
dstStep         RN 3        ;// destination row stride (unused in this fragment)

Temp            RN 12       ;// second source pointer, 4 rows below pSrc

;// Declare Neon registers
dCoeff5         DN 30.S16   ;// filter tap 5  (assumed preloaded by caller)
dCoeff20        DN 31.S16   ;// filter tap 20 (assumed preloaded by caller)

;// Nine consecutive source rows a..i; output row n filters rows n..n+5
dSrc0           DN 7.U8     ;// row a
dSrc1           DN 8.U8     ;// row b
dSrc2           DN 9.U8     ;// row c
dSrc3           DN 10.U8    ;// row d
dSrc4           DN 11.U8    ;// row e
dSrc5           DN 12.U8    ;// row f
dSrc6           DN 13.U8    ;// row g
dSrc7           DN 14.U8    ;// row h
dSrc8           DN 15.U8    ;// row i

qSumBE01        QN 8.S16    ;// widened (b + e) term for the current row
qSumCD01        QN 9.S16    ;// widened (c + d) term for the current row
dSumBE0         DN 16.S16   ;// low half of qSumBE01
dSumCD0         DN 18.S16   ;// low half of qSumCD01

qAcc01          QN 0.S16    ;// accumulator, output row 0
qAcc23          QN 1.S16    ;// accumulator, output row 1
qAcc45          QN 2.S16    ;// accumulator, output row 2
qAcc67          QN 3.S16    ;// accumulator, output row 3

dRes0           DN 0.S16    ;// S16 views of the accumulators
dRes1           DN 2.S16
dRes2           DN 4.S16
dRes3           DN 6.S16

dAcc0           DN 0.U8     ;// final U8 results (aliases of dRes0..3)
dAcc1           DN 2.U8
dAcc2           DN 4.U8
dAcc3           DN 6.U8

;// Scratch for the VMUL half of the split multiply-accumulate (see below)
dTmp0           DN 20.S16
dTmp1           DN 21.S16
dTmp2           DN 22.S16
dTmp3           DN 23.S16

;// Loads are interleaved with arithmetic for Cortex-A8 dual-issue; pSrc
;// walks rows a..e while Temp (pSrc + 4*srcStep) walks rows f..i.
        VLD1        dSrc0, [pSrc], srcStep          ;// [a0 a1 a2 a3 .. ]
        ADD         Temp, pSrc, srcStep, LSL #2     ;// Temp = pSrc + 4*srcStep
        VLD1        dSrc1, [pSrc], srcStep          ;// [b0 b1 b2 b3 .. ]
        ;// One cycle stall
        VLD1        dSrc5, [Temp], srcStep          ;// [f0 f1 f2 f3 .. ]
        ;// One cycle stall
        VLD1        dSrc2, [pSrc], srcStep          ;// [c0 c1 c2 c3 .. ]
        VADDL       qAcc01, dSrc0, dSrc5            ;// Acc = a+f     (row 0)
        VLD1        dSrc3, [pSrc], srcStep          ;// [d0 d1 d2 d3 .. ]
        ;// One cycle stall
        VLD1        dSrc6, [Temp], srcStep          ;// [g0 g1 g2 g3 .. ] TeRi

        VLD1        dSrc4, [pSrc], srcStep          ;// [e0 e1 e2 e3 .. ]
        VLD1        dSrc7, [Temp], srcStep          ;// [h0 h1 h2 h3 .. ] TeRi
        VADDL       qSumBE01, dSrc1, dSrc4          ;// b+e           (row 0)
        VADDL       qSumCD01, dSrc2, dSrc3          ;// c+d           (row 0)
        VLD1        dSrc8, [Temp], srcStep          ;// [i0 i1 i2 i3 .. ] TeRi
        VMLS        dRes0, dSumBE0, dCoeff5         ;// Acc -= 5*(b+e)
;       VMLA        dRes0, dSumCD0, dCoeff20        ;// (replaced by VMUL+VADD
        VMUL        dTmp0, dSumCD0, dCoeff20        ;//  to shorten the Acc
                                                    ;//  dependency chain)

;       VLD1        dSrc6, [Temp], srcStep          ;// (hoisted above)
        VADDL       qSumBE01, dSrc2, dSrc5          ;// b+e           (row 1)
        VADDL       qSumCD01, dSrc3, dSrc4          ;// c+d           (row 1)
        VADDL       qAcc23, dSrc1, dSrc6            ;// Acc = a+f     (row 1)
        VMLS        dRes1, dSumBE0, dCoeff5         ;// Acc -= 5*(b+e)
;       VMLA        dRes1, dSumCD0, dCoeff20
        VMUL        dTmp1, dSumCD0, dCoeff20        ;// 20*(c+d), added later

;       VLD1        dSrc7, [Temp], srcStep          ;// (hoisted above)
        VADDL       qSumBE01, dSrc3, dSrc6          ;// b+e           (row 2)
        VADDL       qSumCD01, dSrc4, dSrc5          ;// c+d           (row 2)
        VADDL       qAcc45, dSrc2, dSrc7            ;// Acc = a+f     (row 2)
        VMLS        dRes2, dSumBE0, dCoeff5         ;// Acc -= 5*(b+e)
;       VMLA        dRes2, dSumCD0, dCoeff20
        VMUL        dTmp2, dSumCD0, dCoeff20        ;// 20*(c+d), added later

;       VLD1        dSrc8, [Temp], srcStep          ;// (hoisted above)
        VADDL       qSumBE01, dSrc4, dSrc7          ;// b+e           (row 3)
        VADDL       qAcc67, dSrc3, dSrc8            ;// Acc = a+f     (row 3)
        VADDL       qSumCD01, dSrc5, dSrc6          ;// c+d           (row 3)
        VMLS        dRes3, dSumBE0, dCoeff5         ;// Acc -= 5*(b+e)
        VADD        dRes0, dRes0, dTmp0             ;// fold in 20*(c+d), row 0
        VADD        dRes1, dRes1, dTmp1             ;// fold in 20*(c+d), row 1
        VADD        dRes2, dRes2, dTmp2             ;// fold in 20*(c+d), row 2
        VMLA        dRes3, dSumCD0, dCoeff20        ;// Acc += 20*(c+d), row 3
;       VMUL        dTmp3, dSumCD0, dCoeff20        ;// (row 3 keeps the fused
;       VADD        dRes3, dRes3, dTmp3             ;//  VMLA form)

;// (acc + 16) >> 5 with unsigned saturation to 8 bits, per the H.264
;// half-sample interpolation formula; results left in dAcc0..dAcc3.
        VQRSHRUN    dAcc0, qAcc01, #5
        VQRSHRUN    dAcc1, qAcc23, #5
        VQRSHRUN    dAcc2, qAcc45, #5
        VQRSHRUN    dAcc3, qAcc67, #5

        M_END

    ENDIF



    END