1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   12290
21;// Date:       Wednesday, April 9, 2008
22;//
23;//
24;//
25;//
26
27        INCLUDE omxtypes_s.h
28        INCLUDE armCOMM_s.h
29
30        EXPORT armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
31
32        M_VARIANTS CortexA8
33
34    IF CortexA8
;//-----------------------------------------------------------------------------
;// armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe   (NEON implementation)
;//
;// Runs a 6-tap filter with taps (1, -5, 20, 20, -5, 1) down the columns of
;// nine source rows, then runs the same taps along each of the four resulting
;// 16-bit rows, and finally scales by 1/1024 with rounding and saturation.
;// The result is a 4x4 block of unsigned 8-bit values.
;// NOTE(review): the name suggests this is the H.264 luma half-sample position
;// that is interpolated in both directions - confirm against the caller/spec.
;//
;// In:    pSrc    (r0)   address of the first source row that is read (row A);
;//                       rows A..I are read, 16 bytes from each
;//        srcStep (r1)   byte offset from one source row to the next
;//        pDst    (r2)   aliased below but never referenced in this routine
;//        dstStep (r3)   aliased below but never referenced in this routine
;// Out:   result rows 0..3 in the low four U8 lanes of d0, d2, d4, d6.
;//        Nothing is stored to memory here - presumably the caller writes the
;//        rows out (TODO confirm).
;// Side effects visible in this code:
;//        r0  is advanced by 5*srcStep, r12 is used as a second row pointer,
;//        d0-d31 are all overwritten.
;// NOTE(review): M_START/M_END are macros from armCOMM_s.h. The "r11" argument
;//        presumably selects the core registers to preserve. No D-register
;//        argument is given although d8-d15 are written, so callers should not
;//        rely on them surviving - confirm in armCOMM_s.h.
;//-----------------------------------------------------------------------------
35        M_START armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe, r11
36
37;// Declare input registers
38pSrc            RN 0
39srcStep         RN 1
40pDst            RN 2
41dstStep         RN 3
42
43;// Declare Neon registers
;// Filter constants. d30/d31 are given two typed aliases each: the U8 view is
;// used by the vertical pass, the S16 view by the horizontal pass. The S16
;// values are loaded only after the last use of the corresponding U8 view.
44dTCoeff5        DN 30.U8
45dTCoeff20       DN 31.U8
46dCoeff5         DN 30.S16
47dCoeff20        DN 31.S16
48
;// Nine source rows A..I, 16 bytes each, one per Q register (Q0..Q8).
49qSrcA01         QN 0.U8
50qSrcB23         QN 1.U8
51qSrcC45         QN 2.U8
52qSrcD67         QN 3.U8
53qSrcE89         QN 4.U8
54qSrcF1011       QN 5.U8
55qSrcG1213       QN 6.U8
56qSrcH1415       QN 7.U8
57qSrcI1617       QN 8.U8
58
;// Low halves of the source rows: bytes 0..7 of each row.
59dSrcA0          DN 0.U8
60dSrcB2          DN 2.U8
61dSrcC4          DN 4.U8
62dSrcD6          DN 6.U8
63dSrcE8          DN 8.U8
64dSrcF10         DN 10.U8
65dSrcG12         DN 12.U8
66dSrcH14         DN 14.U8
67dSrcI16         DN 16.U8
68
;// High halves of the source rows: bytes 8..15 of each row.
69dSrcA1          DN 1.U8
70dSrcB3          DN 3.U8
71dSrcC5          DN 5.U8
72dSrcD7          DN 7.U8
73dSrcE9          DN 9.U8
74dSrcF11         DN 11.U8
75dSrcG13         DN 13.U8
76dSrcH15         DN 15.U8
77dSrcI17         DN 17.U8
78
;// Vertical-pass outputs for columns 0..7 of the four intermediate rows
;// P, Q, R, S (eight S16 lanes each) in Q9..Q12.
79qTempP01        QN 9.S16
80qTempQ01        QN 10.S16
81qTempR01        QN 11.S16
82qTempS01        QN 12.S16
83
;// Vertical-pass outputs for columns 8..15. These reuse Q0..Q3, i.e. the
;// registers of source rows A..D, so each one may only be written once its
;// source row has been fully consumed.
84qTempP23        QN 0.S16
85qTempQ23        QN 1.S16
86qTempR23        QN 2.S16
87qTempS23        QN 3.S16
88
;// D-register views of the intermediate rows for the horizontal pass:
;//   X0 = columns 0..3, X1 = columns 4..7, X2 = columns 8..11.
89dTempP0         DN 18.S16
90dTempP1         DN 19.S16
91dTempP2         DN 0.S16
92
93dTempQ0         DN 20.S16
94dTempQ1         DN 21.S16
95dTempQ2         DN 2.S16
96
97dTempR0         DN 22.S16
98dTempR1         DN 23.S16
99dTempR2         DN 4.S16
100
101dTempS0         DN 24.S16
102dTempS1         DN 25.S16
103dTempS2         DN 6.S16
104
;// Scratch for the shifted windows of an intermediate row (see the
;// horizontal pass): B = lanes 1..4, C = lanes 2..5, D = lanes 3..6,
;// F = lanes 5..8.
105dTempB0         DN 26.S16
106dTempC0         DN 27.S16
107dTempD0         DN 28.S16
108dTempF0         DN 29.S16
109
;// Results after the rounding shift, as U16 (low D register of Q0..Q3).
110dTempAcc0       DN 0.U16
111dTempAcc1       DN 2.U16
112dTempAcc2       DN 4.U16
113dTempAcc3       DN 6.U16
114
;// Final U8 results (same D registers as above).
115dAcc0           DN 0.U8
116dAcc1           DN 2.U8
117dAcc2           DN 4.U8
118dAcc3           DN 6.U8
119
;// 32-bit horizontal accumulators, one per output row. These reuse Q0..Q3
;// again, so each is first written only after the X2 view living in the same
;// Q register has been read.
120qAcc0           QN 0.S32
121qAcc1           QN 1.S32
122qAcc2           QN 2.S32
123qAcc3           QN 3.S32
124
;// U16 view of Q0..Q3 used as the source of the final narrowing.
125qTAcc0          QN 0.U16
126qTAcc1          QN 1.U16
127qTAcc2          QN 2.U16
128qTAcc3          QN 3.U16
129
;// Extra scratch. dTmp is d8, which is also dSrcE8, so it is written only
;// after the last read of dSrcE8. qTmp is not referenced in this routine.
130qTmp            QN 4.S16
131dTmp            DN 8.S16
132
;//-----------------------------------------------------------------------------
;// Load the nine rows and run the vertical pass. Loads and arithmetic for the
;// different rows are interleaved, so instruction order matters wherever a
;// register is reused (see the alias notes above).
;//   P = A - 5*B + 20*C + 20*D - 5*E + F
;//   Q = B - 5*C + 20*D + 20*E - 5*F + G
;//   R = C - 5*D + 20*E + 20*F - 5*G + H
;//   S = D - 5*E + 20*F + 20*G - 5*H + I
;//-----------------------------------------------------------------------------
133        VLD1        qSrcA01, [pSrc], srcStep                 ;// row A = [a0 a1 a2 a3 .. a15]; pSrc now at row B
134        ADD         r12, pSrc, srcStep, LSL #2               ;// r12 = row B + 4*srcStep = row F
135        VMOV        dTCoeff5, #5                             ;// byte lanes of d30 = 5
136        VMOV        dTCoeff20, #20                           ;// byte lanes of d31 = 20
137        VLD1        qSrcF1011, [r12], srcStep                ;// row F
138        VLD1        qSrcB23, [pSrc], srcStep                 ;// row B = [b0 b1 b2 b3 .. b15]
139
140        VLD1        qSrcG1213, [r12], srcStep                ;// row G
141        VADDL       qTempP01, dSrcA0, dSrcF10                ;// P[0..7]   = A + F
142        VLD1        qSrcC45, [pSrc], srcStep                 ;// row C = [c0 c1 c2 c3 .. c15]
143        VADDL       qTempP23, dSrcA1, dSrcF11                ;// P[8..15]  = A + F (overwrites row A in Q0)
144        VLD1        qSrcD67, [pSrc], srcStep                 ;// row D
145        VADDL       qTempQ01, dSrcB2, dSrcG12                ;// Q[0..7]   = B + G
146        VLD1        qSrcE89, [pSrc], srcStep                 ;// row E
147
148        ;//t0 - remaining taps of row P; work for rows Q and R is interleaved
149        VMLAL       qTempP01, dSrcC4, dTCoeff20              ;// P[0..7]  += 20*C
150
151        VLD1        qSrcH1415, [r12], srcStep                ;// row H
152
153        VMLAL       qTempP23, dSrcC5, dTCoeff20              ;// P[8..15] += 20*C
154
155        VLD1        qSrcI1617, [r12], srcStep                 ;// row I = [i0 i1 i2 i3 .. ]
156
157        VMLAL       qTempP01, dSrcD6, dTCoeff20              ;// P[0..7]  += 20*D
158        VMLAL       qTempQ01, dSrcD6, dTCoeff20              ;// Q[0..7]  += 20*D
159        VMLSL       qTempP23, dSrcB3, dTCoeff5               ;// P[8..15] -= 5*B
160
161        VADDL       qTempR01, dSrcC4, dSrcH14                ;// R[0..7]   = C + H
162
163        VMLSL       qTempP01, dSrcB2, dTCoeff5               ;// P[0..7]  -= 5*B (last read of row B low half)
164
165        VADDL       qTempQ23, dSrcB3, dSrcG13                ;// Q[8..15]  = B + G (overwrites row B in Q1)
166
167        VMLAL       qTempP23, dSrcD7, dTCoeff20              ;// P[8..15] += 20*D
168        VMLAL       qTempQ01, dSrcE8, dTCoeff20              ;// Q[0..7]  += 20*E
169
170        VMLSL       qTempP01, dSrcE8, dTCoeff5               ;// P[0..7]  -= 5*E  -> P[0..7] complete
171        VMLAL       qTempQ23, dSrcD7, dTCoeff20              ;// Q[8..15] += 20*D
172
173        VMLSL       qTempP23, dSrcE9, dTCoeff5               ;// P[8..15] -= 5*E  -> P[8..15] complete
174
175        ;//t1 - remaining taps of row Q; work for rows R and S is interleaved
176
177        VMLAL       qTempR01, dSrcE8, dTCoeff20              ;// R[0..7]  += 20*E
178        VMLSL       qTempQ01, dSrcC4, dTCoeff5               ;// Q[0..7]  -= 5*C
179        VMLSL       qTempQ23, dSrcC5, dTCoeff5               ;// Q[8..15] -= 5*C (last read of row C)
180        VADDL       qTempR23, dSrcC5, dSrcH15                ;// R[8..15]  = C + H (overwrites row C in Q2)
181
182        VMLAL       qTempR01, dSrcF10, dTCoeff20             ;// R[0..7]  += 20*F
183        VMLSL       qTempQ01, dSrcF10, dTCoeff5              ;// Q[0..7]  -= 5*F  -> Q[0..7] complete
184        VMLAL       qTempQ23, dSrcE9, dTCoeff20              ;// Q[8..15] += 20*E
185        VMLAL       qTempR23, dSrcE9, dTCoeff20              ;// R[8..15] += 20*E
186        VADDL       qTempS01, dSrcD6, dSrcI16                ;// S[0..7]   = D + I
187
188
189        VMLSL       qTempR01, dSrcD6, dTCoeff5               ;// R[0..7]  -= 5*D
190        VMLSL       qTempQ23, dSrcF11, dTCoeff5              ;// Q[8..15] -= 5*F  -> Q[8..15] complete
191        VMLSL       qTempR23, dSrcD7, dTCoeff5               ;// R[8..15] -= 5*D (last read of row D)
192
193        ;//t2 - remaining taps of row R; work for row S is interleaved
194        VADDL       qTempS23, dSrcD7, dSrcI17                ;// S[8..15]  = D + I (overwrites row D in Q3)
195        VMLAL       qTempS01, dSrcF10, dTCoeff20             ;// S[0..7]  += 20*F
196        VMLSL       qTempR01, dSrcG12, dTCoeff5              ;// R[0..7]  -= 5*G  -> R[0..7] complete
197        VMLSL       qTempR23, dSrcG13, dTCoeff5              ;// R[8..15] -= 5*G
198
199        VMLAL       qTempS23, dSrcF11, dTCoeff20             ;// S[8..15] += 20*F
200        VMLAL       qTempS01, dSrcG12, dTCoeff20             ;// S[0..7]  += 20*G
201        VEXT        dTempB0, dTempP0, dTempP1, #1            ;// horizontal pass starts: B = P[1..4]
202        VMLAL       qTempR23, dSrcF11, dTCoeff20             ;// R[8..15] += 20*F -> R[8..15] complete
203
204
205        ;//t3 - remaining taps of row S, interleaved with window setup for row P
206        VMLAL       qTempS23, dSrcG13, dTCoeff20             ;// S[8..15] += 20*G (last use of the U8 view of d31)
207        VMLSL       qTempS01, dSrcE8, dTCoeff5               ;// S[0..7]  -= 5*E (last read of dSrcE8 / d8)
208        VEXT        dTempC0, dTempP0, dTempP1, #2            ;// C = P[2..5]
209        VMOV        dCoeff20, #20                            ;// d31 now holds 20 in 16-bit lanes
210        VMLSL       qTempS23, dSrcE9, dTCoeff5               ;// S[8..15] -= 5*E
211        VMLSL       qTempS01, dSrcH14, dTCoeff5              ;// S[0..7]  -= 5*H  -> S[0..7] complete
212        VEXT        dTempF0, dTempP1, dTempP2, #1            ;// F = P[5..8]
213        VEXT        dTempD0, dTempP0, dTempP1, #3            ;// D = P[3..6]
214        VMLSL       qTempS23, dSrcH15, dTCoeff5              ;// S[8..15] -= 5*H  -> S complete (last use of the U8 view of d30)
215
;//-----------------------------------------------------------------------------
;// Horizontal pass. For an intermediate row X (lanes x0..x8) and i = 0..3:
;//   acc[i] = x[i] - 5*x[i+1] + 20*x[i+2] + 20*x[i+3] - 5*x[i+4] + x[i+5]
;// evaluated as  X0 + F + 20*(C + D) - 5*(B + X1)  with 32-bit accumulation.
;// Window setup for the next row is interleaved with the current row.
;//-----------------------------------------------------------------------------
216        VADDL       qAcc0, dTempP0, dTempF0                  ;// acc0  = P[0..3] + P[5..8] (overwrites P[8..15] in Q0)
217        VADD        dTempC0, dTempC0, dTempD0                ;// C = P[2..5] + P[3..6]
218        ;//h
219        VMOV        dCoeff5, #5                              ;// d30 now holds 5 in 16-bit lanes
220
221        ;// res0 - output row 0, from intermediate row P
222        VADD        dTempB0, dTempB0, dTempP1                ;// B = P[1..4] + P[4..7]
223        VMLAL       qAcc0, dTempC0, dCoeff20                 ;// acc0 += 20*C
224        VEXT        dTempC0, dTempQ0, dTempQ1, #2            ;// C = Q[2..5]
225        VEXT        dTempD0, dTempQ0, dTempQ1, #3            ;// D = Q[3..6]
226        VEXT        dTempF0, dTempQ1, dTempQ2, #1            ;// F = Q[5..8]
227        VMLSL       qAcc0, dTempB0, dCoeff5                  ;// acc0 -= 5*B  -> acc0 complete
228
229        ;// res1 - output row 1, from intermediate row Q
230        VEXT        dTempB0, dTempQ0, dTempQ1, #1            ;// B = Q[1..4]
231        VADDL       qAcc1, dTempQ0, dTempF0                  ;// acc1  = Q[0..3] + Q[5..8] (overwrites Q[8..15] in Q1)
232        VADD        dTempC0, dTempC0, dTempD0                ;// C = Q[2..5] + Q[3..6]
233        VADD        dTempB0, dTempB0, dTempQ1                ;// B = Q[1..4] + Q[4..7]
234        VEXT        dTempD0, dTempR0, dTempR1, #3            ;// D = R[3..6]
235        VMLAL       qAcc1, dTempC0, dCoeff20                 ;// acc1 += 20*C
236        VEXT        dTempF0, dTempR1, dTempR2, #1            ;// F = R[5..8]
237        VEXT        dTempC0, dTempR0, dTempR1, #2            ;// C = R[2..5]
238        VEXT        dTmp, dTempR0, dTempR1, #1               ;// dTmp = R[1..4]; dTempB0 is still pending for acc1
239        VADDL       qAcc2, dTempR0, dTempF0                  ;// acc2  = R[0..3] + R[5..8] (overwrites R[8..15] in Q2)
240        VMLSL       qAcc1, dTempB0, dCoeff5                  ;// acc1 -= 5*B  -> acc1 complete
241;        VEXT        dTempB0, dTempR0, dTempR1, #1            (disabled: the same extract is done into dTmp above)
242        VADD        dTempC0, dTempC0, dTempD0                ;// C = R[2..5] + R[3..6]
243
244        ;// res2 - output row 2, from intermediate row R
245        VADD        dTempB0, dTmp, dTempR1                   ;// B = R[1..4] + R[4..7]
246        VEXT        dTempD0, dTempS0, dTempS1, #3            ;// D = S[3..6]
247        VMLAL       qAcc2, dTempC0, dCoeff20                 ;// acc2 += 20*C
248;        VADD        dTempB0, dTempB0, dTempR1                (disabled: replaced by the dTmp-based VADD above)
249
250        ;// res3 - output row 3, from intermediate row S
251        VEXT        dTempC0, dTempS0, dTempS1, #2            ;// C = S[2..5]
252        VEXT        dTempF0, dTempS1, dTempS2, #1            ;// F = S[5..8]
253        VADD        dTempC0, dTempC0, dTempD0                ;// C = S[2..5] + S[3..6]
254        VEXT        dTmp, dTempS0, dTempS1, #1               ;// dTmp = S[1..4]
255        VADDL       qAcc3, dTempS0, dTempF0                  ;// acc3  = S[0..3] + S[5..8] (overwrites S[8..15] in Q3)
256        VMLSL       qAcc2, dTempB0, dCoeff5                  ;// acc2 -= 5*B  -> acc2 complete
257        VMLAL       qAcc3, dTempC0, dCoeff20                 ;// acc3 += 20*C
258        VADD        dTmp, dTmp, dTempS1                      ;// dTmp = S[1..4] + S[4..7]
259        VMLSL       qAcc3, dTmp, dCoeff5                     ;// acc3 -= 5*dTmp -> acc3 complete
260
;// Scale: rounding shift right by 10 (each 6-tap pass has gain 32, and
;// 32*32 = 1024), saturating the signed 32-bit sums to unsigned 16 bits.
261        VQRSHRUN    dTempAcc0, qAcc0, #10
262        VQRSHRUN    dTempAcc1, qAcc1, #10
263        VQRSHRUN    dTempAcc2, qAcc2, #10
264        VQRSHRUN    dTempAcc3, qAcc3, #10
265
;// Saturating narrow U16 -> U8. Each source is a full Q register whose low D
;// half holds the four values produced above; the high D half still holds
;// leftover accumulator bits. Only lanes 0..3 of each dAccN are results.
266        VQMOVN      dAcc0, qTAcc0
267        VQMOVN      dAcc1, qTAcc1
268        VQMOVN      dAcc2, qTAcc2
269        VQMOVN      dAcc3, qTAcc3
270
271        M_END
272
273    ENDIF
274
275
276
277
278
279    END
280
281