;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        EXPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe

        M_VARIANTS CortexA8

    IF CortexA8

        M_START armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe, r11

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

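;// Note: only pSrc and srcStep are referenced below. pDst and dstStep are
;// declared to match the common interpolation interface; this _unsafe
;// routine appears to leave its results in registers rather than storing
;// them to pDst itself.
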
;// Declare Neon registers
dCoeff5         DN 30.S16
dCoeff20        DN 31.S16
qCoeff5         QN 14.S32
qCoeff20        QN 15.S32

qSrc01          QN 0.U8
dSrc0           DN 0.U8
dSrc1           DN 1.U8

dSrcb           DN 4.U8
dSrcc           DN 2.U8
dSrcd           DN 3.U8
dSrce           DN 5.U8
dSrcf           DN 1.U8

qSrcb           QN 2.S16
qSrcc           QN 1.S16
dSrcB           DN 4.S16
dSrcC           DN 2.S16

qRes0           QN 5.S16
qRes1           QN 6.S16
qRes2           QN 7.S16
qRes3           QN 8.S16
qRes4           QN 9.S16
qRes5           QN 10.S16
qRes6           QN 11.S16
qRes7           QN 12.S16
qRes8           QN 13.S16

dRes0           DN 10.S16
dRes1           DN 12.S16
dRes2           DN 14.S16
dRes3           DN 16.S16
dRes4           DN 18.S16
dRes5           DN 20.S16
dRes6           DN 22.S16
dRes7           DN 24.S16
dRes8           DN 26.S16

qAcc01          QN 5.S32
qAcc23          QN 6.S32
qAcc45          QN 2.S32
qAcc67          QN 3.S32
qSumBE          QN 0.S32
qSumCD          QN 1.S32

dTempAcc0       DN 0.U16
dTempAcc1       DN 2.U16
dTempAcc2       DN 4.U16
dTempAcc3       DN 6.U16

qTAcc0          QN 0.U16
qTAcc1          QN 1.U16
qTAcc2          QN 2.U16
qTAcc3          QN 3.U16

dAcc0           DN 0.U8
dAcc1           DN 2.U8
dAcc2           DN 4.U8
dAcc3           DN 6.U8

dTmp0           DN 8.S16
dTmp1           DN 9.S16
qTmp0           QN 4.S32

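;// The centre half-pel sample is produced by the H.264 6-tap filter
;// [1, -5, 20, 20, -5, 1] applied twice:
;//   1. Horizontal pass over 9 source rows (4 intermediate samples each):
;//        Res = a - 5*b + 20*c + 20*d - 5*e + f        (kept as S16 in dRes0-dRes8)
;//   2. Vertical pass over those intermediates for the 4 output rows,
;//      accumulated in S32 (qAcc01-qAcc67), then scaled by (Acc + 512) >> 10
;//      with saturation to [0, 255].
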
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
        VMOV        dCoeff20, #20
        VMOV        dCoeff5, #5

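;// Note: each row's "Acc -= 5*(b+e)" step is written as a VMUL into dTmp0
;// followed by a VSUB issued during the next row; the original single
;// instruction is kept in the commented-out VMLS lines. Presumably the split
;// avoids a Cortex-A8 NEON pipeline stall between the VMLA and a dependent VMLS.
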
        ;// Row0
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes0, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
        VMLA        dRes0, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e), subtracted below

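;// Rows 1-8 repeat the Row0 pattern; each row also completes the previous
;// row's deferred subtract (VSUB) between its load and its multiply-accumulate,
;// which helps hide the VLD1 latency.
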
        ;// Row1
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes1, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes0, dRes0, dTmp0         ;// Row0: Acc -= 5*(b+e)

        VMLA        dRes1, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes1, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e), subtracted below

        ;// Row2
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes2, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes1, dRes1, dTmp0         ;// Row1: Acc -= 5*(b+e)

        VMLA        dRes2, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes2, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e), subtracted below

        ;// Row3
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes3, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes2, dRes2, dTmp0         ;// Row2: Acc -= 5*(b+e)

        VMLA        dRes3, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes3, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e), subtracted below

        ;// Row4
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes4, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes3, dRes3, dTmp0         ;// Row3: Acc -= 5*(b+e)

        VMLA        dRes4, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes4, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e), subtracted below

        ;// Row5
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes5, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes4, dRes4, dTmp0         ;// Row4: Acc -= 5*(b+e)

        VMLA        dRes5, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes5, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e), subtracted below

        ;// Row6
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes6, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes5, dRes5, dTmp0         ;// Row5: Acc -= 5*(b+e)

        VMLA        dRes6, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes6, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e), subtracted below

        ;// Row7
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes7, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes6, dRes6, dTmp0         ;// Row6: Acc -= 5*(b+e)

        VMLA        dRes7, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes7, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e), subtracted below

        ;// Row8
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes8, dSrc0, dSrcf         ;// Acc=a+f

        VSUB        dRes7, dRes7, dTmp0         ;// Row7: Acc -= 5*(b+e)

        VMLA        dRes8, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes8, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e), subtracted below

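;// dRes0-dRes8 now hold the nine horizontally filtered rows
;// (four 16-bit intermediate samples per row) needed by the vertical pass.
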
        VMOV        qCoeff20, #20
        VMOV        qCoeff5, #5

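;// The horizontal intermediates lie roughly in [-2550, 10710], so the vertical
;// 6-tap sum can exceed the S16 range; the column accumulators are therefore
;// widened to S32 before the final scaling.
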
        ;// Col0
        VADDL       qAcc01, dRes0, dRes5        ;// Acc = a+f
        VADDL       qSumCD, dRes2, dRes3        ;// c+d
        VADDL       qSumBE, dRes1, dRes4        ;// b+e

        VSUB        dRes8, dRes8, dTmp0         ;// Row8: Acc -= 5*(b+e)

        VMLA        qAcc01, qSumCD, qCoeff20    ;// Acc += 20*(c+d)
;        VMLS        qAcc01, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// Tmp = 5*(b+e), subtracted below

        ;// Col1
        VADDL       qAcc23, dRes1, dRes6        ;// Acc = a+f
        VADDL       qSumCD, dRes3, dRes4        ;// c+d
        VADDL       qSumBE, dRes2, dRes5        ;// b+e
        VMLA        qAcc23, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc01, qAcc01, qTmp0       ;// Col0: Acc -= 5*(b+e)

;        VMLS        qAcc23, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// Tmp = 5*(b+e), subtracted below

        ;// Col2
        VADDL       qAcc45, dRes2, dRes7        ;// Acc = a+f
        VADDL       qSumCD, dRes4, dRes5        ;// c+d
        VADDL       qSumBE, dRes3, dRes6        ;// b+e
        VMLA        qAcc45, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc23, qAcc23, qTmp0       ;// Col1: Acc -= 5*(b+e)

;        VMLS        qAcc45, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// Tmp = 5*(b+e), subtracted below

        ;// Col3
        VADDL       qAcc67, dRes3, dRes8        ;// Acc = a+f
        VADDL       qSumCD, dRes5, dRes6        ;// c+d
        VADDL       qSumBE, dRes4, dRes7        ;// b+e
        VMLA        qAcc67, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc45, qAcc45, qTmp0       ;// Col2: Acc -= 5*(b+e)

        VMLS        qAcc67, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)

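;// Scale and saturate: VQRSHRUN computes saturate_u16((Acc + 512) >> 10),
;// then VQMOVN narrows with saturation to U8, clipping each of the four
;// output rows to [0, 255].
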
        VQRSHRUN    dTempAcc0, qAcc01, #10
        VQRSHRUN    dTempAcc1, qAcc23, #10
        VQRSHRUN    dTempAcc2, qAcc45, #10
        VQRSHRUN    dTempAcc3, qAcc67, #10

        VQMOVN      dAcc0, qTAcc0
        VQMOVN      dAcc1, qTAcc1
        VQMOVN      dAcc2, qTAcc2
        VQMOVN      dAcc3, qTAcc3

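;// On exit the four interpolated rows are held in dAcc0-dAcc3 (the low four
;// bytes of each register); the calling wrapper is expected to store them to
;// pDst using dstStep.
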
        M_END

    ENDIF

    END