;// armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//

27        INCLUDE omxtypes_s.h
28        INCLUDE armCOMM_s.h
29
30        M_VARIANTS CortexA8
31
32        EXPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
33
34DEBUG_ON    SETL {FALSE}
35
36    IF CortexA8
37
38        M_START armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe, r11
39
40;// Declare input registers
41pSrc            RN 0
42srcStep         RN 1
43pDst            RN 2
44dstStep         RN 3
45
46;// Declare Neon registers
47dCoeff5         DN 30.S16
48dCoeff20        DN 31.S16
49
50qSrcA01         QN 11.U8
51qSrcB01         QN 12.U8
52qSrcC01         QN 13.U8
53qSrcD01         QN 14.U8
54
55dSrcA0          DN 22.U8
56dSrcA1          DN 23.U8
57dSrcB0          DN 24.U8
58dSrcB1          DN 25.U8
59dSrcC0          DN 26.U8
60dSrcC1          DN 27.U8
61dSrcD0          DN 28.U8
62dSrcD1          DN 29.U8
63
64dSrcb           DN 12.U8
65dSrce           DN 13.U8
66dSrcf           DN 10.U8
67
68dSrc0c          DN 14.U8
69dSrc1c          DN 16.U8
70dSrc2c          DN 18.U8
71dSrc3c          DN 20.U8
72
73dSrc0d          DN 15.U8
74dSrc1d          DN 17.U8
75dSrc2d          DN 19.U8
76dSrc3d          DN 21.U8
77
78qTemp01         QN 4.S16
79qTemp23         QN 6.S16
80dTemp0          DN 8.S16
81dTemp2          DN 12.S16
82
83qRes01          QN 11.S16
84qRes23          QN 12.S16
85qRes45          QN 13.S16
86qRes67          QN 14.S16
87
88dRes0           DN 22.S16
89dRes2           DN 24.S16
90dRes4           DN 26.S16
91dRes6           DN 28.S16
92
93dAcc0           DN 22.U8
94dAcc2           DN 24.U8
95dAcc4           DN 26.U8
96dAcc6           DN 28.U8
97
98dResult0        DN 22.U32
99dResult2        DN 24.U32
100dResult4        DN 26.U32
101dResult6        DN 28.U32
102
103        VLD1        qSrcA01, [pSrc], srcStep    ;// Load A register [a0 a1 a2 a3 ..]
104        ;// One cycle stall
105        VEXT        dSrcf, dSrcA0, dSrcA1, #5   ;// [f0 f1 f2 f3 ..]
106        VEXT        dSrcb, dSrcA0, dSrcA1, #1   ;// [b0 b1 b2 b3 ..]
107;        VLD1        qSrcB01, [pSrc], srcStep    ;// Load B register [a0 a1 a2 a3 ..]
108        VEXT        dSrc0c, dSrcA0, dSrcA1, #2
109        VEXT        dSrc0d, dSrcA0, dSrcA1, #3
110        VEXT        dSrce, dSrcA0, dSrcA1, #4
111        VADDL       qRes01, dSrcA0, dSrcf       ;// Acc=a+f
112        VADDL       qTemp01, dSrc0c, dSrc0d     ;// c+d
113        VADDL       qTemp23, dSrcb, dSrce       ;// b+e
114
115        VLD1        qSrcB01, [pSrc], srcStep    ;// Load B register [a0 a1 a2 a3 ..]
116;        VLD1        qSrcC01, [pSrc], srcStep    ;// Load C register [a0 a1 a2 a3 ..]
117        VMLA        dRes0, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
118;        VMLS        dRes0, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
119        VMUL        dTemp0, dTemp2, dCoeff5 ;// TeRi
120
121        VEXT        dSrcf, dSrcB0, dSrcB1, #5   ;// [f0 f1 f2 f3 ..]
122        VEXT        dSrcb, dSrcB0, dSrcB1, #1   ;// [b0 b1 b2 b3 ..]
123        VEXT        dSrc1c, dSrcB0, dSrcB1, #2
124        VEXT        dSrc1d, dSrcB0, dSrcB1, #3
125        VEXT        dSrce, dSrcB0, dSrcB1, #4
126        VADDL       qRes23, dSrcB0, dSrcf       ;// Acc=a+f
127
128        VSUB        dRes0, dRes0, dTemp0    ;// TeRi
129
130        VADDL       qTemp01, dSrc1c, dSrc1d     ;// c+d
131        VADDL       qTemp23, dSrcb, dSrce       ;// b+e
132
133        VLD1        qSrcC01, [pSrc], srcStep    ;// Load C register [a0 a1 a2 a3 ..]
134;        VLD1        qSrcD01, [pSrc], srcStep    ;// Load D register [a0 a1 a2 a3 ..]
135
136        VMLA        dRes2, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
137;        VMLS        dRes2, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
138        VMUL        dTemp0, dTemp2, dCoeff5 ;// TeRi
139
140        VEXT        dSrcf, dSrcC0, dSrcC1, #5   ;// [f0 f1 f2 f3 ..]
141        VEXT        dSrcb, dSrcC0, dSrcC1, #1   ;// [b0 b1 b2 b3 ..]
142        VEXT        dSrc2c, dSrcC0, dSrcC1, #2
143        VEXT        dSrc2d, dSrcC0, dSrcC1, #3
144        VEXT        dSrce, dSrcC0, dSrcC1, #4
145        VADDL       qRes45, dSrcC0, dSrcf       ;// Acc=a+f
146
147        VSUB        dRes2, dRes2, dTemp0  ;// TeRi
148
149        VADDL       qTemp01, dSrc2c, dSrc2d     ;// c+d
150        VADDL       qTemp23, dSrcb, dSrce       ;// b+e
151
152        VLD1        qSrcD01, [pSrc], srcStep    ;// Load D register [a0 a1 a2 a3 ..]
153
154        VMLA        dRes4, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
155;        VMLS        dRes4, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
156        VMUL        dTemp0, dTemp2, dCoeff5      ;// Acc -= 5*(b+e) TeRi
157
158
159        VEXT        dSrcf, dSrcD0, dSrcD1, #5   ;// [f0 f1 f2 f3 ..]
160        VEXT        dSrcb, dSrcD0, dSrcD1, #1   ;// [b0 b1 b2 b3 ..]
161        VEXT        dSrc3c, dSrcD0, dSrcD1, #2
162        VEXT        dSrc3d, dSrcD0, dSrcD1, #3
163        VEXT        dSrce, dSrcD0, dSrcD1, #4
164        VADDL       qRes67, dSrcD0, dSrcf       ;// Acc=a+f
165
166        VSUB        dRes4, dRes4, dTemp0 ;// TeRi
167
168        VADDL       qTemp01, dSrc3c, dSrc3d     ;// c+d
169        VADDL       qTemp23, dSrcb, dSrce       ;// b+e
170        VMLA        dRes6, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
171        VMLS        dRes6, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
172
173        VQRSHRUN    dAcc0, qRes01, #5           ;// Acc = Sat ((Acc + 16) / 32)
174        VQRSHRUN    dAcc2, qRes23, #5           ;// Acc = Sat ((Acc + 16) / 32)
175        VQRSHRUN    dAcc4, qRes45, #5           ;// Acc = Sat ((Acc + 16) / 32)
176        VQRSHRUN    dAcc6, qRes67, #5           ;// Acc = Sat ((Acc + 16) / 32)
177
178        M_END
179
180    ENDIF
181
182
183    END
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243