;//
;// 
;// File Name:  armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;// 
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;// 
;// 
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

        EXPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe

DEBUG_ON    SETL {FALSE}

    IF CortexA8

;//-------------------------------------------------------------------------
;// armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
;//
;// H.264 half-pel HORIZONTAL luma interpolation for a 4x4 block using the
;// standard 6-tap filter (1, -5, 20, 20, -5, 1).  For each of the 4 rows:
;//
;//     acc = (a + f) + 20*(c + d) - 5*(b + e)
;//     out = Sat((acc + 16) >> 5)            ;// via VQRSHRUN #5
;//
;// where a..f are 6 consecutive source bytes (b..f produced with VEXT
;// shifts of #1..#5 from the 16-byte row load).
;//
;// In:   pSrc    (r0) - source pointer; advanced by srcStep per row load
;//       srcStep (r1) - source row stride in bytes
;//       pDst    (r2) - declared but NOT referenced in this routine
;//       dstStep (r3) - declared but NOT referenced in this routine
;// Out:  interpolated rows left in d22 (dAcc0), d24 (dAcc2),
;//       d26 (dAcc4), d28 (dAcc6) - presumably stored by the calling
;//       wrapper; TODO confirm against the caller.
;//
;// NOTE(review): "_unsafe" suggests the caller guarantees readable bytes
;// beyond the 4x4 block (each row load reads 16 bytes) - confirm.
;//
;// NOTE(review): the commented-out VMLS instructions have been split into
;// VMUL + VSUB (marked "TeRi") for rows A-C, while row D keeps the plain
;// VMLS.  This looks like a Cortex-A8 NEON scheduling workaround - the
;// VSUB is hoisted away from the VMUL to hide latency; confirm against
;// revision history before "simplifying" it back.
;//
;// Register aliasing is intentional and load-bearing:
;//   dTemp0 (d8)  = low half of qTemp01 (q4)   - holds the c+d sums
;//   dTemp2 (d12) = low half of qTemp23 (q6)   - holds the b+e sums
;//   dRes0/2/4/6  = low halves of qRes01/23/45/67 (q11-q14)
;//   dAcc0/2/4/6 and dResult0/2/4/6 are U8/U32 views of the same d-regs.
;//-------------------------------------------------------------------------

        M_START armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe, r11

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

;// Declare Neon registers
dCoeff5         DN 30.S16                   ;// constant 5 (filter tap magnitude)
dCoeff20        DN 31.S16                   ;// constant 20 (filter tap magnitude)

;// Raw 16-byte row loads, one Q register per row A-D
qSrcA01         QN 11.U8
qSrcB01         QN 12.U8
qSrcC01         QN 13.U8
qSrcD01         QN 14.U8

dSrcA0          DN 22.U8
dSrcA1          DN 23.U8
dSrcB0          DN 24.U8
dSrcB1          DN 25.U8
dSrcC0          DN 26.U8
dSrcC1          DN 27.U8
dSrcD0          DN 28.U8
dSrcD1          DN 29.U8

;// Shifted views of the current row (filter taps b, e, f)
dSrcb           DN 12.U8
dSrce           DN 13.U8
dSrcf           DN 10.U8

;// Per-row c and d tap vectors
dSrc0c          DN 14.U8
dSrc1c          DN 16.U8
dSrc2c          DN 18.U8
dSrc3c          DN 20.U8

dSrc0d          DN 15.U8
dSrc1d          DN 17.U8
dSrc2d          DN 19.U8
dSrc3d          DN 21.U8

;// Widened partial sums: qTemp01 = c+d, qTemp23 = b+e (low halves below)
qTemp01         QN 4.S16
qTemp23         QN 6.S16
dTemp0          DN 8.S16                    ;// low half of qTemp01
dTemp2          DN 12.S16                   ;// low half of qTemp23

;// 16-bit accumulators, one Q per row; only the low D half is used
qRes01          QN 11.S16
qRes23          QN 12.S16
qRes45          QN 13.S16
qRes67          QN 14.S16

dRes0           DN 22.S16
dRes2           DN 24.S16
dRes4           DN 26.S16
dRes6           DN 28.S16

;// U8 views of the final saturated/narrowed results
dAcc0           DN 22.U8
dAcc2           DN 24.U8
dAcc4           DN 26.U8
dAcc6           DN 28.U8

dResult0        DN 22.U32
dResult2        DN 24.U32
dResult4        DN 26.U32
dResult6        DN 28.U32

;// ---- Row A -------------------------------------------------------------
        VLD1        qSrcA01, [pSrc], srcStep    ;// Load A register [a0 a1 a2 a3 ..]
        ;// One cycle stall
        VEXT        dSrcf, dSrcA0, dSrcA1, #5   ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcA0, dSrcA1, #1   ;// [b0 b1 b2 b3 ..]
;        VLD1        qSrcB01, [pSrc], srcStep    ;// Load B register [a0 a1 a2 a3 ..]
        VEXT        dSrc0c, dSrcA0, dSrcA1, #2  ;// [c0 c1 c2 c3 ..]
        VEXT        dSrc0d, dSrcA0, dSrcA1, #3  ;// [d0 d1 d2 d3 ..]
        VEXT        dSrce, dSrcA0, dSrcA1, #4   ;// [e0 e1 e2 e3 ..]
        VADDL       qRes01, dSrcA0, dSrcf       ;// Acc=a+f (widen to S16)
        VADDL       qTemp01, dSrc0c, dSrc0d     ;// c+d
        VADDL       qTemp23, dSrcb, dSrce       ;// b+e

        VLD1        qSrcB01, [pSrc], srcStep    ;// Load B register [a0 a1 a2 a3 ..]
;        VLD1        qSrcC01, [pSrc], srcStep    ;// Load C register [a0 a1 a2 a3 ..]
        VMLA        dRes0, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
;        VMLS        dRes0, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
        VMUL        dTemp0, dTemp2, dCoeff5     ;// 5*(b+e); VSUB below completes the VMLS (TeRi)

;// ---- Row B -------------------------------------------------------------
        VEXT        dSrcf, dSrcB0, dSrcB1, #5   ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcB0, dSrcB1, #1   ;// [b0 b1 b2 b3 ..]
        VEXT        dSrc1c, dSrcB0, dSrcB1, #2  ;// [c0 c1 c2 c3 ..]
        VEXT        dSrc1d, dSrcB0, dSrcB1, #3  ;// [d0 d1 d2 d3 ..]
        VEXT        dSrce, dSrcB0, dSrcB1, #4   ;// [e0 e1 e2 e3 ..]
        VADDL       qRes23, dSrcB0, dSrcf       ;// Acc=a+f

        VSUB        dRes0, dRes0, dTemp0        ;// row A: Acc -= 5*(b+e) (TeRi)

        VADDL       qTemp01, dSrc1c, dSrc1d     ;// c+d
        VADDL       qTemp23, dSrcb, dSrce       ;// b+e

        VLD1        qSrcC01, [pSrc], srcStep    ;// Load C register [a0 a1 a2 a3 ..]
;        VLD1        qSrcD01, [pSrc], srcStep    ;// Load D register [a0 a1 a2 a3 ..]

        VMLA        dRes2, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
;        VMLS        dRes2, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
        VMUL        dTemp0, dTemp2, dCoeff5     ;// 5*(b+e); completed by VSUB below (TeRi)

;// ---- Row C -------------------------------------------------------------
        VEXT        dSrcf, dSrcC0, dSrcC1, #5   ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcC0, dSrcC1, #1   ;// [b0 b1 b2 b3 ..]
        VEXT        dSrc2c, dSrcC0, dSrcC1, #2  ;// [c0 c1 c2 c3 ..]
        VEXT        dSrc2d, dSrcC0, dSrcC1, #3  ;// [d0 d1 d2 d3 ..]
        VEXT        dSrce, dSrcC0, dSrcC1, #4   ;// [e0 e1 e2 e3 ..]
        VADDL       qRes45, dSrcC0, dSrcf       ;// Acc=a+f

        VSUB        dRes2, dRes2, dTemp0        ;// row B: Acc -= 5*(b+e) (TeRi)

        VADDL       qTemp01, dSrc2c, dSrc2d     ;// c+d
        VADDL       qTemp23, dSrcb, dSrce       ;// b+e

        VLD1        qSrcD01, [pSrc], srcStep    ;// Load D register [a0 a1 a2 a3 ..]

        VMLA        dRes4, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
;        VMLS        dRes4, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
        VMUL        dTemp0, dTemp2, dCoeff5     ;// Acc -= 5*(b+e) TeRi


;// ---- Row D (last row: no further load to hide, plain VMLS is used) -----
        VEXT        dSrcf, dSrcD0, dSrcD1, #5   ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcD0, dSrcD1, #1   ;// [b0 b1 b2 b3 ..]
        VEXT        dSrc3c, dSrcD0, dSrcD1, #2  ;// [c0 c1 c2 c3 ..]
        VEXT        dSrc3d, dSrcD0, dSrcD1, #3  ;// [d0 d1 d2 d3 ..]
        VEXT        dSrce, dSrcD0, dSrcD1, #4   ;// [e0 e1 e2 e3 ..]
        VADDL       qRes67, dSrcD0, dSrcf       ;// Acc=a+f

        VSUB        dRes4, dRes4, dTemp0        ;// row C: Acc -= 5*(b+e) (TeRi)

        VADDL       qTemp01, dSrc3c, dSrc3d     ;// c+d
        VADDL       qTemp23, dSrcb, dSrce       ;// b+e
        VMLA        dRes6, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
        VMLS        dRes6, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)

;// Round, shift and saturate all four rows down to U8
        VQRSHRUN    dAcc0, qRes01, #5           ;// Acc = Sat ((Acc + 16) / 32)
        VQRSHRUN    dAcc2, qRes23, #5           ;// Acc = Sat ((Acc + 16) / 32)
        VQRSHRUN    dAcc4, qRes45, #5           ;// Acc = Sat ((Acc + 16) / 32)
        VQRSHRUN    dAcc6, qRes67, #5           ;// Acc = Sat ((Acc + 16) / 32)

        M_END

    ENDIF


    END