;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

        EXPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe

DEBUG_ON    SETL {FALSE}

    IF CortexA8

;//-------------------------------------------------------------------------
;// armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
;//
;// Applies a 6-tap horizontal filter to four consecutive source rows:
;//
;//     Acc = (a + f) + 20*(c + d) - 5*(b + e)
;//     Out = Sat((Acc + 16) / 32)              (VQRSHRUN #5)
;//
;// where a..f are six horizontally adjacent bytes of a row and 'a' is the
;// byte pSrc points at when the row is loaded.
;//
;// In:    r0  pSrc      address of byte 'a' of the first row; 16 bytes are
;//                     loaded from each row
;//        r1  srcStep   added to pSrc after each row load (post-increment)
;//        r2  pDst      named below but not referenced by this routine
;//        r3  dstStep   named below but not referenced by this routine
;//        d30 dCoeff5   read only, never written here: the caller must
;//        d31 dCoeff20  preload them (S16 lanes; the operation comments
;//                      imply the values 5 and 20)
;//
;// Out:   d22, d24, d26, d28 (dAcc0/2/4/6) hold rows 0..3 as U8.
;//        Only bytes 0..3 of each are fully filtered: the multiply and
;//        subtract steps operate on the low D half of each 16-bit
;//        accumulator, so bytes 4..7 are just the rounded (a + f) term.
;//        Nothing is stored to memory by this routine.
;//        r0 (pSrc) is left advanced by 4*srcStep.
;//
;// Writes: d8-d10, d12-d29.
;// NOTE(review): d8-d15 are written, but only r11 is passed to M_START;
;//        whether those registers get preserved depends on the M_START
;//        macro and/or the caller - confirm against armCOMM_s.h.
;//-------------------------------------------------------------------------

        M_START armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe, r11

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

;// Declare Neon registers
;// Filter coefficients (set up outside this routine)
dCoeff5         DN 30.S16
dCoeff20        DN 31.S16

;// 16-byte source rows A..D (one Q register per row) ...
qSrcA01         QN 11.U8
qSrcB01         QN 12.U8
qSrcC01         QN 13.U8
qSrcD01         QN 14.U8

;// ... and the D-register halves of those same Q registers
dSrcA0          DN 22.U8
dSrcA1          DN 23.U8
dSrcB0          DN 24.U8
dSrcB1          DN 25.U8
dSrcC0          DN 26.U8
dSrcC1          DN 27.U8
dSrcD0          DN 28.U8
dSrcD1          DN 29.U8

;// Shifted views of the current row (taps b, e, f); reused for every row
dSrcb           DN 12.U8
dSrce           DN 13.U8
dSrcf           DN 10.U8

;// Tap c of rows 0..3 (a separate register per row)
dSrc0c          DN 14.U8
dSrc1c          DN 16.U8
dSrc2c          DN 18.U8
dSrc3c          DN 20.U8

;// Tap d of rows 0..3 (a separate register per row)
dSrc0d          DN 15.U8
dSrc1d          DN 17.U8
dSrc2d          DN 19.U8
dSrc3d          DN 21.U8

;// 16-bit temporaries: qTemp01 = c+d, qTemp23 = b+e.
;// dTemp0/dTemp2 are their low halves. qTemp23 (d12,d13) occupies the same
;// registers as dSrcb/dSrce, which are dead once b+e has been formed.
qTemp01         QN 4.S16
qTemp23         QN 6.S16
dTemp0          DN 8.S16
dTemp2          DN 12.S16

;// 16-bit accumulators. They occupy the same registers as the source rows,
;// so each row's source is overwritten when its a+f sum is written.
qRes01          QN 11.S16
qRes23          QN 12.S16
qRes45          QN 13.S16
qRes67          QN 14.S16

;// Low halves of the accumulators (the four lanes that get filtered)
dRes0           DN 22.S16
dRes2           DN 24.S16
dRes4           DN 26.S16
dRes6           DN 28.S16

;// Narrowed 8-bit results, same registers as dRes0/2/4/6
dAcc0           DN 22.U8
dAcc2           DN 24.U8
dAcc4           DN 26.U8
dAcc6           DN 28.U8

;// 32-bit views of the result registers (not referenced in this routine)
dResult0        DN 22.U32
dResult2        DN 24.U32
dResult4        DN 26.U32
dResult6        DN 28.U32

        ;// ---- Row 0: extract taps, form a+f, c+d, b+e ---------------------
        VLD1        qSrcA01, [pSrc], srcStep    ;// Load row A [a0 a1 a2 a3 ..]; pSrc += srcStep
        ;// One cycle stall
        VEXT        dSrcf, dSrcA0, dSrcA1, #5   ;// [f0 f1 f2 f3 ..] = row bytes 5..12
        VEXT        dSrcb, dSrcA0, dSrcA1, #1   ;// [b0 b1 b2 b3 ..] = row bytes 1..8
        VEXT        dSrc0c, dSrcA0, dSrcA1, #2  ;// [c0 c1 c2 c3 ..] = row bytes 2..9
        VEXT        dSrc0d, dSrcA0, dSrcA1, #3  ;// [d0 d1 d2 d3 ..] = row bytes 3..10
        VEXT        dSrce, dSrcA0, dSrcA1, #4   ;// [e0 e1 e2 e3 ..] = row bytes 4..11
        VADDL       qRes01, dSrcA0, dSrcf       ;// Acc=a+f (overwrites row A; all taps already extracted)
        VADDL       qTemp01, dSrc0c, dSrc0d     ;// c+d
        VADDL       qTemp23, dSrcb, dSrce       ;// b+e

        ;// Row 1 load is issued between row 0's arithmetic steps.
        VLD1        qSrcB01, [pSrc], srcStep    ;// Load row B [a0 a1 a2 a3 ..]; pSrc += srcStep
        VMLA        dRes0, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
        ;// For rows 0..2 the original single-instruction form
        ;//     VMLS dRes0, dTemp2, dCoeff5     ;// Acc -= 5*(b+e)
        ;// is split (change tagged "TeRi"): the product goes into dTemp0 here
        ;// and is subtracted by a VSUB placed after the next row's taps.
        VMUL        dTemp0, dTemp2, dCoeff5     ;// dTemp0 = 5*(b+e) for row 0

        ;// ---- Row 1 taps; finish row 0 ------------------------------------
        VEXT        dSrcf, dSrcB0, dSrcB1, #5   ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcB0, dSrcB1, #1   ;// [b0 b1 b2 b3 ..]
        VEXT        dSrc1c, dSrcB0, dSrcB1, #2  ;// [c0 c1 c2 c3 ..]
        VEXT        dSrc1d, dSrcB0, dSrcB1, #3  ;// [d0 d1 d2 d3 ..]
        VEXT        dSrce, dSrcB0, dSrcB1, #4   ;// [e0 e1 e2 e3 ..]
        VADDL       qRes23, dSrcB0, dSrcf       ;// Acc=a+f

        ;// Must precede the next write to qTemp01, which reuses dTemp0.
        VSUB        dRes0, dRes0, dTemp0        ;// Row 0: Acc -= 5*(b+e)

        VADDL       qTemp01, dSrc1c, dSrc1d     ;// c+d
        VADDL       qTemp23, dSrcb, dSrce       ;// b+e

        VLD1        qSrcC01, [pSrc], srcStep    ;// Load row C [a0 a1 a2 a3 ..]; pSrc += srcStep

        VMLA        dRes2, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
        VMUL        dTemp0, dTemp2, dCoeff5     ;// dTemp0 = 5*(b+e) for row 1 (VMLS split, as for row 0)

        ;// ---- Row 2 taps; finish row 1 ------------------------------------
        VEXT        dSrcf, dSrcC0, dSrcC1, #5   ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcC0, dSrcC1, #1   ;// [b0 b1 b2 b3 ..]
        VEXT        dSrc2c, dSrcC0, dSrcC1, #2  ;// [c0 c1 c2 c3 ..]
        VEXT        dSrc2d, dSrcC0, dSrcC1, #3  ;// [d0 d1 d2 d3 ..]
        VEXT        dSrce, dSrcC0, dSrcC1, #4   ;// [e0 e1 e2 e3 ..]
        VADDL       qRes45, dSrcC0, dSrcf       ;// Acc=a+f

        VSUB        dRes2, dRes2, dTemp0        ;// Row 1: Acc -= 5*(b+e)

        VADDL       qTemp01, dSrc2c, dSrc2d     ;// c+d
        VADDL       qTemp23, dSrcb, dSrce       ;// b+e

        VLD1        qSrcD01, [pSrc], srcStep    ;// Load row D [a0 a1 a2 a3 ..]; pSrc += srcStep

        VMLA        dRes4, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
        VMUL        dTemp0, dTemp2, dCoeff5     ;// dTemp0 = 5*(b+e) for row 2; subtracted by the VSUB below


        ;// ---- Row 3 taps; finish row 2 ------------------------------------
        VEXT        dSrcf, dSrcD0, dSrcD1, #5   ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcD0, dSrcD1, #1   ;// [b0 b1 b2 b3 ..]
        VEXT        dSrc3c, dSrcD0, dSrcD1, #2  ;// [c0 c1 c2 c3 ..]
        VEXT        dSrc3d, dSrcD0, dSrcD1, #3  ;// [d0 d1 d2 d3 ..]
        VEXT        dSrce, dSrcD0, dSrcD1, #4   ;// [e0 e1 e2 e3 ..]
        VADDL       qRes67, dSrcD0, dSrcf       ;// Acc=a+f

        VSUB        dRes4, dRes4, dTemp0        ;// Row 2: Acc -= 5*(b+e)

        ;// The last row keeps the direct VMLA/VMLS pair (no VMUL+VSUB split).
        VADDL       qTemp01, dSrc3c, dSrc3d     ;// c+d
        VADDL       qTemp23, dSrcb, dSrce       ;// b+e
        VMLA        dRes6, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
        VMLS        dRes6, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)

        ;// ---- Round, shift and saturate to unsigned 8-bit -----------------
        ;// Each narrowing writes the low D register of its own Q source.
        VQRSHRUN    dAcc0, qRes01, #5           ;// Acc = Sat ((Acc + 16) / 32)
        VQRSHRUN    dAcc2, qRes23, #5           ;// Acc = Sat ((Acc + 16) / 32)
        VQRSHRUN    dAcc4, qRes45, #5           ;// Acc = Sat ((Acc + 16) / 32)
        VQRSHRUN    dAcc6, qRes67, #5           ;// Acc = Sat ((Acc + 16) / 32)

        M_END

    ENDIF


    END
