;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//
;// Syntax: ARM RVCT/armasm, ARMv7 NEON, Cortex-A8 variant.
;//
;// Computes the H.264/AVC "half-pel horizontal" luma interpolation for a
;// block 4 pixels wide by 4 rows high, using the standard 6-tap filter
;// (1, -5, 20, 20, -5, 1) with rounding and unsigned saturation:
;//
;//     res = Clip255((a - 5*b + 20*c + 20*d - 5*e + f + 16) >> 5)
;//
;// where, per output pixel, a..f are the six consecutive source pixels
;// centred on the half-pel position.
;//
;// "unsafe": no argument checking is done, and nothing is stored through
;// pDst here -- the four 4-pixel result rows are left in NEON registers
;// d22/d24/d26/d28 (dAcc0/dAcc2/dAcc4/dAcc6 below) for the caller to
;// consume or combine further.

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

        EXPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe

DEBUG_ON    SETL {FALSE}

    IF CortexA8

        M_START armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe, r11

;// Declare input registers
pSrc            RN 0    ;// source pointer, advanced by srcStep after each row load
srcStep         RN 1    ;// source row stride in bytes
pDst            RN 2    ;// not referenced in this routine (results stay in registers)
dstStep         RN 3    ;// not referenced in this routine

;// Declare Neon registers
;// NOTE(review): the filter constants (20 and 5, replicated across S16
;// lanes) are not loaded here -- they are assumed to be pre-loaded into
;// d31/d30 by the caller. Confirm against the call sites.
dCoeff5         DN 30.S16
dCoeff20        DN 31.S16

;// One 16-byte load per input row; each row needs 9 consecutive source
;// bytes so that VEXT can derive the b..f taps from a single load.
qSrcA01         QN 11.U8    ;// row A = d22/d23 (aliased by qRes01/dRes0 below)
qSrcB01         QN 12.U8    ;// row B = d24/d25 (aliased by qRes23/dRes2)
qSrcC01         QN 13.U8    ;// row C = d26/d27 (aliased by qRes45/dRes4)
qSrcD01         QN 14.U8    ;// row D = d28/d29 (aliased by qRes67/dRes6)

dSrcA0          DN 22.U8    ;// low/high halves of the row registers, used as
dSrcA1          DN 23.U8    ;// VEXT sources to extract the shifted tap vectors
dSrcB0          DN 24.U8
dSrcB1          DN 25.U8
dSrcC0          DN 26.U8
dSrcC1          DN 27.U8
dSrcD0          DN 28.U8
dSrcD1          DN 29.U8

;// Per-row scratch tap vectors (b, e, f are re-used every row; the c and d
;// taps get a distinct register per row so rows can overlap in the pipeline)
dSrcb           DN 12.U8
dSrce           DN 13.U8
dSrcf           DN 10.U8

dSrc0c          DN 14.U8
dSrc1c          DN 16.U8
dSrc2c          DN 18.U8
dSrc3c          DN 20.U8

dSrc0d          DN 15.U8
dSrc1d          DN 17.U8
dSrc2d          DN 19.U8
dSrc3d          DN 21.U8

;// Widened partial sums. Only the low D half (4 lanes) of each widened
;// result is carried to completion -- the block is 4 pixels wide.
qTemp01         QN 4.S16
qTemp23         QN 6.S16
dTemp0          DN 8.S16    ;// low half of qTemp01: (c+d), later re-used for 5*(b+e)
dTemp2          DN 12.S16   ;// low half of qTemp23: (b+e); aliases dSrcb (dead by then)

qRes01          QN 11.S16   ;// accumulators; alias the source rows, which are
qRes23          QN 12.S16   ;// fully consumed by the VEXTs before the VADDL
qRes45          QN 13.S16   ;// that overwrites them
qRes67          QN 14.S16

dRes0           DN 22.S16   ;// low halves of the accumulators (the 4 valid lanes)
dRes2           DN 24.S16
dRes4           DN 26.S16
dRes6           DN 28.S16

dAcc0           DN 22.U8    ;// narrowed, saturated results (same registers as dRes*)
dAcc2           DN 24.U8
dAcc4           DN 26.U8
dAcc6           DN 28.U8

dResult0        DN 22.U32   ;// U32 views of the same results, for callers that
dResult2        DN 24.U32   ;// store one 4-byte row at a time
dResult4        DN 26.U32
dResult6        DN 28.U32

;// Row A ------------------------------------------------------------------
        VLD1        qSrcA01, [pSrc], srcStep    ;// Load A register [a0 a1 a2 a3 ..]
        ;// One cycle stall
        VEXT        dSrcf, dSrcA0, dSrcA1, #5   ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcA0, dSrcA1, #1   ;// [b0 b1 b2 b3 ..]
;        VLD1        qSrcB01, [pSrc], srcStep   ;// (row-B load moved below to hide latency)
        VEXT        dSrc0c, dSrcA0, dSrcA1, #2  ;// [c0 c1 c2 c3 ..]
        VEXT        dSrc0d, dSrcA0, dSrcA1, #3  ;// [d0 d1 d2 d3 ..]
        VEXT        dSrce, dSrcA0, dSrcA1, #4   ;// [e0 e1 e2 e3 ..]
        VADDL       qRes01, dSrcA0, dSrcf      ;// Acc = a + f   (U8 widened to S16)
        VADDL       qTemp01, dSrc0c, dSrc0d    ;// c + d
        VADDL       qTemp23, dSrcb, dSrce      ;// b + e

        VLD1        qSrcB01, [pSrc], srcStep    ;// Load B register [a0 a1 a2 a3 ..]
;        VLD1        qSrcC01, [pSrc], srcStep   ;// (row-C load moved below to hide latency)
        VMLA        dRes0, dTemp0, dCoeff20    ;// Acc += 20*(c+d)
;        VMLS        dRes0, dTemp2, dCoeff5     ;// Acc -= 5*(b+e)
        ;// "TeRi" change: the VMLS is split into VMUL here plus a VSUB a few
        ;// instructions later, with independent work interleaved between
        ;// them -- presumably to hide the Cortex-A8 NEON multiplier latency.
        VMUL        dTemp0, dTemp2, dCoeff5    ;// 5*(b+e)  (TeRi)

;// Row B ------------------------------------------------------------------
        VEXT        dSrcf, dSrcB0, dSrcB1, #5   ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcB0, dSrcB1, #1   ;// [b0 b1 b2 b3 ..]
        VEXT        dSrc1c, dSrcB0, dSrcB1, #2
        VEXT        dSrc1d, dSrcB0, dSrcB1, #3
        VEXT        dSrce, dSrcB0, dSrcB1, #4
        VADDL       qRes23, dSrcB0, dSrcf      ;// Acc = a + f

        VSUB        dRes0, dRes0, dTemp0       ;// row A: Acc -= 5*(b+e)  (TeRi)

        VADDL       qTemp01, dSrc1c, dSrc1d    ;// c + d
        VADDL       qTemp23, dSrcb, dSrce      ;// b + e

        VLD1        qSrcC01, [pSrc], srcStep    ;// Load C register [a0 a1 a2 a3 ..]
;        VLD1        qSrcD01, [pSrc], srcStep   ;// (row-D load moved below to hide latency)

        VMLA        dRes2, dTemp0, dCoeff20    ;// Acc += 20*(c+d)
;        VMLS        dRes2, dTemp2, dCoeff5     ;// Acc -= 5*(b+e)
        VMUL        dTemp0, dTemp2, dCoeff5    ;// 5*(b+e)  (TeRi, see row A)

;// Row C ------------------------------------------------------------------
        VEXT        dSrcf, dSrcC0, dSrcC1, #5   ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcC0, dSrcC1, #1   ;// [b0 b1 b2 b3 ..]
        VEXT        dSrc2c, dSrcC0, dSrcC1, #2
        VEXT        dSrc2d, dSrcC0, dSrcC1, #3
        VEXT        dSrce, dSrcC0, dSrcC1, #4
        VADDL       qRes45, dSrcC0, dSrcf      ;// Acc = a + f

        VSUB        dRes2, dRes2, dTemp0       ;// row B: Acc -= 5*(b+e)  (TeRi)

        VADDL       qTemp01, dSrc2c, dSrc2d    ;// c + d
        VADDL       qTemp23, dSrcb, dSrce      ;// b + e

        VLD1        qSrcD01, [pSrc], srcStep    ;// Load D register [a0 a1 a2 a3 ..]

        VMLA        dRes4, dTemp0, dCoeff20    ;// Acc += 20*(c+d)
;        VMLS        dRes4, dTemp2, dCoeff5     ;// Acc -= 5*(b+e)
        VMUL        dTemp0, dTemp2, dCoeff5    ;// 5*(b+e)  (TeRi, see row A)

;// Row D ------------------------------------------------------------------
        VEXT        dSrcf, dSrcD0, dSrcD1, #5   ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcD0, dSrcD1, #1   ;// [b0 b1 b2 b3 ..]
        VEXT        dSrc3c, dSrcD0, dSrcD1, #2
        VEXT        dSrc3d, dSrcD0, dSrcD1, #3
        VEXT        dSrce, dSrcD0, dSrcD1, #4
        VADDL       qRes67, dSrcD0, dSrcf      ;// Acc = a + f

        VSUB        dRes4, dRes4, dTemp0       ;// row C: Acc -= 5*(b+e)  (TeRi)

        VADDL       qTemp01, dSrc3c, dSrc3d    ;// c + d
        VADDL       qTemp23, dSrcb, dSrce      ;// b + e
        VMLA        dRes6, dTemp0, dCoeff20    ;// Acc += 20*(c+d)
        VMLS        dRes6, dTemp2, dCoeff5     ;// Acc -= 5*(b+e)  (last row: no need to split)

;// Round, shift and saturate all four rows to unsigned 8-bit.
        VQRSHRUN    dAcc0, qRes01, #5          ;// Acc = Sat8((Acc + 16) >> 5)
        VQRSHRUN    dAcc2, qRes23, #5          ;// Acc = Sat8((Acc + 16) >> 5)
        VQRSHRUN    dAcc4, qRes45, #5          ;// Acc = Sat8((Acc + 16) >> 5)
        VQRSHRUN    dAcc6, qRes67, #5          ;// Acc = Sat8((Acc + 16) >> 5)

        M_END

    ENDIF


    END