;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
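;// Description (summary of the code below):
;// Computes the half-pel (half-horizontal, half-vertical) interpolation for a
;// 4x4 luma block. The 6-tap filter (1, -5, 20, 20, -5, 1) is first applied
;// horizontally to nine rows of source pixels, producing 16-bit intermediate
;// rows; the same filter is then applied vertically to those intermediates in
;// 32-bit accumulators, and each result is rounded, shifted right by 10 and
;// saturated down to 8-bit pixels.
;//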

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        EXPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe

        M_VARIANTS CortexA8

    IF CortexA8

        M_START armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe, r11

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

;// Declare Neon registers
dCoeff5         DN 30.S16
dCoeff20        DN 31.S16
qCoeff5         QN 14.S32
qCoeff20        QN 15.S32

qSrc01          QN 0.U8
dSrc0           DN 0.U8
dSrc1           DN 1.U8

dSrcb           DN 4.U8
dSrcc           DN 2.U8
dSrcd           DN 3.U8
dSrce           DN 5.U8
dSrcf           DN 1.U8

qSrcb           QN 2.S16
qSrcc           QN 1.S16
dSrcB           DN 4.S16
dSrcC           DN 2.S16

qRes0           QN 5.S16
qRes1           QN 6.S16
qRes2           QN 7.S16
qRes3           QN 8.S16
qRes4           QN 9.S16
qRes5           QN 10.S16
qRes6           QN 11.S16
qRes7           QN 12.S16
qRes8           QN 13.S16

dRes0           DN 10.S16
dRes1           DN 12.S16
dRes2           DN 14.S16
dRes3           DN 16.S16
dRes4           DN 18.S16
dRes5           DN 20.S16
dRes6           DN 22.S16
dRes7           DN 24.S16
dRes8           DN 26.S16

qAcc01          QN 5.S32
qAcc23          QN 6.S32
qAcc45          QN 2.S32
qAcc67          QN 3.S32
qSumBE          QN 0.S32
qSumCD          QN 1.S32

dTempAcc0       DN 0.U16
dTempAcc1       DN 2.U16
dTempAcc2       DN 4.U16
dTempAcc3       DN 6.U16

qTAcc0          QN 0.U16
qTAcc1          QN 1.U16
qTAcc2          QN 2.U16
qTAcc3          QN 3.U16

dAcc0           DN 0.U8
dAcc1           DN 2.U8
dAcc2           DN 4.U8
dAcc3           DN 6.U8

dTmp0           DN 8.S16
dTmp1           DN 9.S16
qTmp0           QN 4.S32

        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
        VMOV        dCoeff20, #20
        VMOV        dCoeff5, #5

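;// Horizontal pass: for each of the nine source rows, the 6-tap filter is
;// evaluated as Res = (a+f) + 20*(c+d) - 5*(b+e), where a is the loaded row
;// and b..f are the same row shifted by 1..5 pixels via VEXT. The results are
;// kept as 16-bit intermediates in dRes0-dRes8. Each VMLS has been replaced
;// by a VMUL into dTmp0 plus a VSUB that is deferred into the next block, so
;// the subtraction is scheduled after the following row's load.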
        ;// Row0
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes0, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// load next row [a0 a1 a2 a3 ..]
        VMLA        dRes0, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e)

        ;// Row1
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes1, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// load next row [a0 a1 a2 a3 ..]

        VSUB        dRes0, dRes0, dTmp0         ;// Row0 Acc -= 5*(b+e) (deferred)

        VMLA        dRes1, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes1, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e)

        ;// Row2
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes2, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// load next row [a0 a1 a2 a3 ..]

        VSUB        dRes1, dRes1, dTmp0         ;// Row1 Acc -= 5*(b+e) (deferred)

        VMLA        dRes2, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes2, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e)

        ;// Row3
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes3, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// load next row [a0 a1 a2 a3 ..]

        VSUB        dRes2, dRes2, dTmp0         ;// Row2 Acc -= 5*(b+e) (deferred)

        VMLA        dRes3, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes3, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e)

        ;// Row4
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes4, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// load next row [a0 a1 a2 a3 ..]

        VSUB        dRes3, dRes3, dTmp0         ;// Row3 Acc -= 5*(b+e) (deferred)

        VMLA        dRes4, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes4, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e)

        ;// Row5
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes5, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// load next row [a0 a1 a2 a3 ..]

        VSUB        dRes4, dRes4, dTmp0         ;// Row4 Acc -= 5*(b+e) (deferred)

        VMLA        dRes5, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes5, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e)

        ;// Row6
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes6, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// load next row [a0 a1 a2 a3 ..]

        VSUB        dRes5, dRes5, dTmp0         ;// Row5 Acc -= 5*(b+e) (deferred)

        VMLA        dRes6, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes6, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e)

        ;// Row7
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes7, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// load next row [a0 a1 a2 a3 ..]

        VSUB        dRes6, dRes6, dTmp0         ;// Row6 Acc -= 5*(b+e) (deferred)

        VMLA        dRes7, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes7, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e)

        ;// Row8
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes8, dSrc0, dSrcf         ;// Acc = a+f

        VSUB        dRes7, dRes7, dTmp0         ;// Row7 Acc -= 5*(b+e) (deferred)

        VMLA        dRes8, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes8, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e)

        VMOV        qCoeff20, #20
        VMOV        qCoeff5, #5

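;// Vertical pass: the same 6-tap filter is now applied to the 16-bit
;// intermediate rows in 32-bit accumulators. Each block labelled Col n
;// combines six consecutive intermediates dRes[n]..dRes[n+5] to produce one
;// row of the 4x4 output (all four lanes at once). As in the horizontal
;// pass, each VMLS is split into a VMUL into qTmp0 and a deferred VSUB.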
        ;// Col0
        VADDL       qAcc01, dRes0, dRes5        ;// Acc = a+f
        VADDL       qSumCD, dRes2, dRes3        ;// c+d
        VADDL       qSumBE, dRes1, dRes4        ;// b+e

        VSUB        dRes8, dRes8, dTmp0         ;// Row8 Acc -= 5*(b+e) (deferred)

        VMLA        qAcc01, qSumCD, qCoeff20    ;// Acc += 20*(c+d)
;        VMLS        qAcc01, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// Tmp = 5*(b+e)

        ;// Col1
        VADDL       qAcc23, dRes1, dRes6        ;// Acc = a+f
        VADDL       qSumCD, dRes3, dRes4        ;// c+d
        VADDL       qSumBE, dRes2, dRes5        ;// b+e
        VMLA        qAcc23, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc01, qAcc01, qTmp0       ;// Col0 Acc -= 5*(b+e) (deferred)

;        VMLS        qAcc23, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// Tmp = 5*(b+e)

        ;// Col2
        VADDL       qAcc45, dRes2, dRes7        ;// Acc = a+f
        VADDL       qSumCD, dRes4, dRes5        ;// c+d
        VADDL       qSumBE, dRes3, dRes6        ;// b+e
        VMLA        qAcc45, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc23, qAcc23, qTmp0       ;// Col1 Acc -= 5*(b+e) (deferred)

;        VMLS        qAcc45, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// Tmp = 5*(b+e)

        ;// Col3
        VADDL       qAcc67, dRes3, dRes8        ;// Acc = a+f
        VADDL       qSumCD, dRes5, dRes6        ;// c+d
        VADDL       qSumBE, dRes4, dRes7        ;// b+e
        VMLA        qAcc67, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc45, qAcc45, qTmp0       ;// Col2 Acc -= 5*(b+e) (deferred)

        VMLS        qAcc67, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)

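;// Rounding and narrowing: VQRSHRUN performs a rounding right shift by 10
;// (the combined scale of the two filter passes, 32*32 = 1024) with an
;// unsigned saturating narrow to 16 bits; VQMOVN then saturates each row
;// down to 8-bit pixels.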
        VQRSHRUN    dTempAcc0, qAcc01, #10
        VQRSHRUN    dTempAcc1, qAcc23, #10
        VQRSHRUN    dTempAcc2, qAcc45, #10
        VQRSHRUN    dTempAcc3, qAcc67, #10

        VQMOVN      dAcc0, qTAcc0
        VQMOVN      dAcc1, qTAcc1
        VQMOVN      dAcc2, qTAcc2
        VQMOVN      dAcc3, qTAcc3
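;// Note: no store to pDst is performed in this "_unsafe" routine; the
;// interpolated 4x4 block is left in dAcc0-dAcc3 (d0, d2, d4, d6),
;// presumably for the calling function to write out.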

        M_END

    ENDIF



    END