armVCM4P10_DeblockingLuma_unsafe_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  armVCM4P10_DeblockingLuma_unsafe_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   12290
21;// Date:       Wednesday, April 9, 2008
22;//
23;//
24;//
25;//
26
27        INCLUDE omxtypes_s.h
28        INCLUDE armCOMM_s.h
29
30        M_VARIANTS CortexA8
31
32
33    IF  CortexA8
34
;// Register aliases used by the two deblocking routines in this file.
;// Several aliases name the same physical register; each overlap is noted
;// where it is declared.  Aliases that share a register are alternative
;// labels for one storage location, so writing one overwrites the others.
;// A NEON Qn register overlays the pair D(2n), D(2n+1).
35pThresholds RN 5
36
37;// Pixels: p0-p3 in D4-D7, q0-q3 in D8-D11, unsigned 8-bit lanes
38dP_0        DN D4.U8
39dP_1        DN D5.U8
40dP_2        DN D6.U8
41dP_3        DN D7.U8
42dQ_0        DN D8.U8
43dQ_1        DN D9.U8
44dQ_2        DN D10.U8
45dQ_3        DN D11.U8
46
47
48;// Filtering Decision (values supplied by the caller; only dApflg/dAqflg are written in this file)
49dAlpha      DN D0.U8
50
51dFilt       DN D16.U8
52dAqflg      DN D12.U8
53dApflg      DN D17.U8
54
55dAp0q0      DN D13.U8
56
57;// bSLT4 (dTemp and dMask_1 are also used by the bSGE4 routine)
58dTC0        DN D18.U8
59dTC1        DN D19.U8
60dTC01       DN D18.U8   ;// same register as dTC0
61
62dTCs        DN D31.S8   ;// signed view of D31
63dTC         DN D31.U8   ;// unsigned view of the same register as dTCs
64
65dMask_0     DN D14.U8
66dMask_1     DN D15.U8
67
68dTemp       DN D19.U8   ;// same register as dTC1
69
70;// Computing P0,Q0
71qDq0p0      QN Q10.S16
72qDp1q1      QN Q11.S16
73qDelta      QN Q10.S16  ; reuse qDq0p0
74dDelta      DN D20.S8   ;// low half of Q10 (qDq0p0/qDelta)
75
76
77;// Computing P1,Q1
78dRp0q0      DN D24.U8
79
80dMaxP       DN D23.U8   ;// high half of Q11 (qDp1q1)
81dMinP       DN D22.U8   ;// low half of Q11 (qDp1q1)
82
83dMaxQ       DN D19.U8   ;// same register as dTC1 and dTemp
84dMinQ       DN D21.U8   ;// high half of Q10 (qDq0p0/qDelta)
85
86dDeltaP     DN D26.U8
87dDeltaQ     DN D27.U8
88
89qP_0n       QN Q14.S16  ;// overlays D28, D29
90qQ_0n       QN Q12.S16  ;// overlays D24, D25
91
92dQ_0n       DN D24.U8   ;// same register as dRp0q0; low half of Q12
93dQ_1n       DN D25.U8   ;// high half of Q12
94dP_0n       DN D29.U8   ;// high half of Q14
95dP_1n       DN D30.U8
96
97;// bSGE4 (Q10-Q13 and D28 are reused for the P-side and then the Q-side sums)
98
99qSp0q0      QN Q10.U16
100
101qSp2q1      QN Q11.U16
102qSp0q0p1    QN Q12.U16
103qSp3p2      QN Q13.U16
104dHSp0q1     DN D28.U8
105
106qSq2p1      QN Q11.U16  ;// same register as qSp2q1
107qSp0q0q1    QN Q12.U16  ;// same register as qSp0q0p1
108qSq3q2      QN Q13.U16  ;!! same register as qSp3p2
109dHSq0p1     DN D28.U8   ;!! same register as dHSp0q1
110
111qTemp1      QN Q11.U16  ;!!;qSp2q1
112qTemp2      QN Q12.U16  ;!!;qSp0q0p1
113
114dP_0t       DN D28.U8   ;!!;dHSp0q1
115dQ_0t       DN D22.U8   ;!!;Temp1 (low half of Q11)
116
117dP_0n       DN D29.U8   ;// repeats the earlier dP_0n definition (same register)
118dP_1n       DN D30.U8   ;// repeats the earlier dP_1n definition (same register)
119dP_2n       DN D31.U8   ;// same register as dTC/dTCs
120
121dQ_0n       DN D24.U8   ;!!;Temp2 (low half of Q12); repeats the earlier definition
122dQ_1n       DN D25.U8   ;!!;Temp2 (high half of Q12); repeats the earlier definition
123dQ_2n       DN D28.U8   ;!!;same register as dP_0t/dHSp0q1/dHSq0p1 (dQ_0t is D22)
124
125;// Register usage for - armVCM4P10_DeblockingLumabSLT4_unsafe
;//
;// Computes candidate replacement pixels P0, P1, Q0, Q1 for one luma edge
;// from p0-p2 / q0-q2, using a delta clipped to +/-dTC for P0/Q0 and a
;// clamp to [x1 - dTC01, x1 + dTC01] for P1/Q1.  Lanes whose mask bits are
;// clear receive the original pixel value.  Nothing is stored to memory;
;// results are left in registers.
126;//
127;// Inputs - Pixels             - p0-p3: D4-D7, q0-q3: D8-D11 (p3 and q3 are not read here)
128;//        - Filter masks       - filt: D16, aqflg: D12, apflg: D17
129;//        - Additional Params  - pThresholds: r5 (two consecutive elements are loaded)
;//        - Constants          - dMask_0: D14, dMask_1: D15 are read but not written here
;//                               (presumably all-0 and all-1 byte lanes - confirm with caller)
130;//
131;// Outputs - Pixels            - P0-P1: D29-D30, Q0-Q1: D24-D25
132;//         - Additional Params - pThresholds: r5 (advanced by the two post-incrementing loads)
133
134;// Registers Corrupted         - D18-D31
135
136
137        M_START armVCM4P10_DeblockingLumabSLT4_unsafe
138
139
140        ;// qDp1q1-11 = p1 - q1, widened to 16-bit lanes
141        VSUBL       qDp1q1, dP_1, dQ_1
142        VLD1        {dTC0[]}, [pThresholds]!  ;// first threshold element, replicated to all lanes
143        ;// qDq0p0-10 = q0 - p0, widened to 16-bit lanes
144        VSUBL       qDq0p0, dQ_0, dP_0
145        VLD1        {dTC1[]}, [pThresholds]!  ;// second threshold element, replicated to all lanes
146
147        ;// qDp1q1 = (p1 - q1) >> 2
148        VSHR        qDp1q1, qDp1q1, #2
149
150        ;// dTC01 = lanes 0-3 taken from dTC0, lanes 4-7 taken from dTC1
151        ;// dTC01-18
152        VEXT        dTC01, dTC0, dTC1, #4
153        ;// dTemp-19 = dApflg & dMask_1 (D19 is free: dTC1 was consumed by the VEXT above)
154        VAND        dTemp, dApflg, dMask_1
155
156        VBIF        dTC01, dMask_0, dFilt     ;// where dFilt bits are clear, dTC01 takes dMask_0
157
158
159        ;// delta = (((q0-p0)<<2) + (p1-q1) + 4) >> 3;
160        ;// qDelta = (((p1-q1) >> 2) + (q0-p0) + 1) >> 1   (qDp1q1 already holds (p1-q1) >> 2)
161
162        ;// qDelta-qDq0p0-10
163        VRHADD      qDelta, qDp1q1, qDq0p0
164        VRHADD      dRp0q0, dP_0, dQ_0        ;// dRp0q0-24 = (p0 + q0 + 1) >> 1
165        VADD        dTC, dTC01, dTemp
166
167        ;// dTC = dTC01 + (dApflg & dMask_1) + (dAqflg & dMask_1)
168
169        VAND        dTemp, dAqflg, dMask_1
170        VQADD       dMaxP, dP_1, dTC01        ;// D23 is free: qDp1q1 was consumed by the VRHADD above
171        VQMOVN      dDelta, qDelta            ;// saturating narrow of qDelta to signed 8-bit lanes
172        VADD        dTC, dTC, dTemp
173
174        ;// dMaxP = QADD(dP_1, dTC01)
175        ;// dMinP = QSUB(dP_1, dTC01)
176
177        ;// dMaxP-d23
178        ;// dMinP-d22
179        VQSUB       dMinP, dP_1, dTC01
180
181        ;// dDelta-d20
182
183        ;// dMaxQ = QADD(dQ_1, dTC01)
184        ;// dMinQ = QSUB(dQ_1, dTC01)
185
186        ;// dMaxQ-19 (overwrites dTemp, whose last use is the VADD into dTC above)
187        ;// dMinQ-21
188        VQADD       dMaxQ, dQ_1, dTC01
189        VHADD       dDeltaP, dRp0q0, dP_2     ;// dDeltaP-26 = (dRp0q0 + p2) >> 1
190        VMIN        dDelta, dDelta, dTCs      ;// upper clip: dDelta = min(dDelta, dTC)
191
192        ;// dTCs = -dTC, the lower clip bound applied to dDelta by the VMAX further down
193        VNEG        dTCs, dTCs
194
195        VQSUB       dMinQ, dQ_1, dTC01        ;// D21 is free: qDelta was already narrowed into D20
196
197        ;// delta = (p2 + ((p0+q0+1)>>1) - (p1<<1))>>1;
198        ;// delta = armClip(-tC0, tC0, delta);
199        ;// pQ0[-2*Step] = (OMX_U8)(p1 + delta);
200
201        ;// dDeltaP = (dP_2 + dRp0q0)>>1;
202        ;// dP_1n = armClip(dP_1 - dTC01, dP_1 + dTC01, dDeltaP);
203        ;// dP_1n = armClip(MinP, MaxP, dDeltaP);
204
205        ;// delta = (q2 + ((p0+q0+1)>>1) - (q1<<1))>>1;
206        ;// delta = armClip(-tC0, tC0, delta);
207        ;// pQ0[1*Step] = (OMX_U8)(q1 + delta);
208
209        ;// dDeltaQ = (dQ_2 + dRp0q0)>>1;
210        ;// dQ_1n = armClip(dQ_1 - dTC01, dQ_1 + dTC01, dDeltaQ);
211        ;// dQ_1n = armClip(MinQ, MaxQ, dDeltaQ);
212
213        ;// dDeltaQ-27 = (dRp0q0 + q2) >> 1
214        VHADD       dDeltaQ, dRp0q0, dQ_2
215
216        ;// dDeltaP-26 was computed earlier, interleaved with the dTC clip
217
218        ;// dP_0n - 29
219        ;// dP_1n - 30
220        ;// dQ_0n - 24
221        ;// dQ_1n - 25
222
223        ;// dP_1n = max(dDeltaP, dMinP); the min against dMaxP follows below
224        ;// dDelta = max(dDelta, -dTC) completes the clip of dDelta to [-dTC, dTC]
225
226        VMAX        dP_1n, dDeltaP, dMinP
227        VMAX        dDelta, dDelta, dTCs
228
229        ;// pQ0[-1*Step] = (OMX_U8)armClip(0, 255, p0 + delta);
230        ;// pQ0[0*Step] = (OMX_U8)armClip(0, 255, q0 - delta);
231
232        ;// dP_0n = (OMX_U8)armClip(0, 255, dP_0 + dDelta);
233        ;// dQ_0n = (OMX_U8)armClip(0, 255, dQ_0 - dDelta);
234
235        ;// qP_0n - 14
236        ;// qQ_0n - 12 (overwrites dRp0q0 in D24; its last use is the VHADD into dDeltaQ above)
237
238        VMOVL       qP_0n, dP_0
239        VMOVL       qQ_0n, dQ_0
240
241        VADDW       qP_0n, qP_0n, dDelta      ;// p0 + delta in 16-bit lanes
242        VSUBW       qQ_0n, qQ_0n, dDelta      ;// q0 - delta in 16-bit lanes
243
244        VQMOVUN     dP_0n, qP_0n              ;// saturate to unsigned 8-bit
245        VQMOVUN     dQ_0n, qQ_0n              ;// saturate to unsigned 8-bit
246
247        VMAX        dQ_1n, dDeltaQ, dMinQ     ;// D25 is free only after qQ_0n has been narrowed
248
249        VMIN        dP_1n, dP_1n, dMaxP
250        VMIN        dQ_1n, dQ_1n, dMaxQ
251        VBIF        dP_0n, dP_0, dFilt        ;// keep original p0 where dFilt bits are clear
252
253        VBIF        dP_1n, dP_1, dApflg       ;// keep original p1 where dApflg bits are clear
254        VBIF        dQ_0n, dQ_0, dFilt        ;// keep original q0 where dFilt bits are clear
255        VBIF        dQ_1n, dQ_1, dAqflg       ;// keep original q1 where dAqflg bits are clear
256
257        M_END
258
259;// Register usage for - armVCM4P10_DeblockingLumabSGE4_unsafe()
;//
;// Computes candidate replacement pixels P0-P2 and Q0-Q2 for one luma edge
;// from p0-p3 / q0-q3.  apflg and aqflg are first narrowed by the test
;// dAp0q0 < (alpha >> 2) + 2*dMask_1; the narrowed flags then select, per
;// lane, between the multi-tap results, the dP_0t/dQ_0t alternatives and
;// the original pixels.  Nothing is stored to memory; results are left in
;// registers.
260;//
261;// Inputs - Pixels             - p0-p3: D4-D7, q0-q3: D8-D11
262;//        - Filter masks       - filt: D16, aqflg: D12, apflg: D17
263;//        - Additional Params  - alpha: D0, dMask_1: D15
;//                             - dAp0q0: D13 is also read (per the comments below, armAbs(p0-q0))
264;//
265;// Outputs - Pixels            - P0-P2: D29-D31, Q0-Q2: D24,D25,D28
266
267;// Registers Corrupted         - D18-D31
;//                             - aqflg (D12) and apflg (D17) are also overwritten by the VANDs below
268
269        M_START armVCM4P10_DeblockingLumabSGE4_unsafe
270
271
272        ;// ap<beta && armAbs(p0-q0)<((alpha>>2)+2)
273        ;// aq<beta && armAbs(p0-q0)<((alpha>>2)+2)
274
275        ;// ( dApflg & dAp0q0 < (dAlpha >> 2 + 2) )
276        ;// ( dAqflg & dAp0q0 < (dAlpha >> 2 + 2) )
277
278        ;// ( dApflg = dApflg & dAp0q0 < (dTemp + dMask_1 + dMask_1) )
279        ;// ( dAqflg = dAqflg & dAp0q0 < (dTemp + dMask_1 + dMask_1) )
280
281        ;// P Filter
282
283        VSHR        dTemp, dAlpha, #2         ;// dTemp = alpha >> 2
284        VADD        dTemp, dTemp, dMask_1
285
286        ;// qSp0q0-10 = q0 + p0, widened to 16-bit lanes
287        VADDL       qSp0q0, dQ_0, dP_0
288        VADD        dTemp, dTemp, dMask_1     ;// dTemp = (alpha >> 2) + 2*dMask_1
289
290        ;// qSp2q1-11 = p2 + q1
291        ;// qSp0q0p1-12 = p0 + q0 + p1
292        VADDL       qSp2q1, dP_2, dQ_1
293        VADDW       qSp0q0p1, qSp0q0, dP_1
294
295        VCGT        dTemp, dTemp, dAp0q0      ;// per-lane mask: dTemp > dAp0q0
296        VSHR        qSp2q1, #1                ;// qSp2q1 = (p2 + q1) >> 1, in place
297
298        ;// pQ0[-1*Step] = (OMX_U8)((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3);
299        ;// pQ0[-1*Step] = ( ( (p0 + q0 + p1) + (p2 + q1)>>1 ) >> 1 + 1 ) >> 1
300
301        ;// dP_0n = ( ( (qSp0q0 + dP_1) + qSp2q1>>1 ) >> 1 + 1 ) >> 1
302        ;// dP_0n = ( ( qSp0q0p1 + qSp2q1>>1 ) >> 1 + 1 ) >> 1
303        ;// dP_0n = ( qTemp1 + 1 ) >> 1
304
305        ;// pQ0[-2*Step] = (OMX_U8)((p2 + p1 + p0 + q0 + 2)>>2);
306
307        ;// dP_1n = (OMX_U8)((dP_2 + qSp0q0p1 + 2)>>2);
308        ;// dP_1n = (OMX_U8)((qTemp2 + 2)>>2);
309
310        ;// pQ0[-3*Step] = (OMX_U8)((2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3);
311        ;// pQ0[-3*Step] = (OMX_U8)(( (p3 + p2) + (p1 + p0 + q0 + p2) >> 1 + 2)>>2);
312
313        ;// dP_2n = (OMX_U8)(( qSp3p2 + (dP_2 + qSp0q0p1) >> 1 + 2) >> 2);
314        ;// dP_2n = (OMX_U8)(( qSp3p2 + qTemp2 >> 1 + 2) >> 2);
315
316        ;// qTemp1-qSp2q1-11 = (qSp0q0p1 + qSp2q1) >> 1
317        ;// qTemp2-qSp0q0p1-12 = p0 + q0 + p1 + p2
318        VHADD       qTemp1, qSp0q0p1, qSp2q1
319        VADDW       qTemp2, qSp0q0p1, dP_2
320
321        ;// qSp3p2-13 = p3 + p2
322        VADDL       qSp3p2, dP_3, dP_2
323
324        VAND        dApflg, dApflg, dTemp     ;// narrows apflg in place (D17 is modified)
325        VHADD       dHSp0q1, dP_0, dQ_1       ;// dHSp0q1-28 = (p0 + q1) >> 1
326        VSRA        qSp3p2, qTemp2, #1        ;// qSp3p2 += qTemp2 >> 1
327        ;// dAqflg is narrowed by the same comparison mask (D12 is modified)
328        VAND        dAqflg, dAqflg, dTemp
329
330        ;// dP_0n-29 = (qTemp1 + 1) >> 1, narrowed with saturation
331        ;// dP_0t-dHSp0q1-28 = (dHSp0q1 + p1 + 1) >> 1, used where dApflg is clear
332        VQRSHRN     dP_0n, qTemp1, #1
333        VRHADD      dP_0t, dHSp0q1, dP_1
334
335        ;// dP_1n-30 = (qTemp2 + 2) >> 2, narrowed with saturation
336        VQRSHRN     dP_1n, qTemp2, #2
337
338        VADDL       qSq2p1, dQ_2, dP_1        ;// Q11 reused for the Q side: q2 + p1
339        VADDW       qSp0q0q1, qSp0q0, dQ_1    ;// Q12 reused for the Q side: p0 + q0 + q1
340
341        VBIF        dP_0n, dP_0t, dApflg      ;// where dApflg bits are clear, P0 takes dP_0t
342
343        ;// Q Filter
344
345        ;// pQ0[0*Step] = (OMX_U8)((q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4)>>3);
346        ;// pQ0[0*Step] = ( ( (p0 + q0 + q1) + (q2 + p1)>>1 ) >> 1 + 1 ) >> 1
347
348        ;// dQ_0n = ( ( (qSp0q0 + dQ_1) + qSq2p1>>1 ) >> 1 + 1 ) >> 1
349        ;// dQ_0n = ( ( qSp0q0q1 + qSq2p1>>1 ) >> 1 + 1 ) >> 1
350        ;// dQ_0n = ( qTemp1 + 1 ) >> 1
351
352        ;// pQ0[1*Step] = (OMX_U8)((q2 + q1 + q0 + p0 + 2)>>2);
353
354        ;// dQ_1n = (OMX_U8)((dQ_2 + qSp0q0q1 + 2)>>2);
355        ;// dQ_1n = (OMX_U8)((qTemp2 + 2)>>2);
356
357        ;// pQ0[2*Step] = (OMX_U8)((2*q3 + 3*q2 + q1 + q0 + p0 + 4)>>3);
358        ;// pQ0[2*Step] = (OMX_U8)(( (q3 + q2) + (q1 + p0 + q0 + q2) >> 1 + 2)>>2);
359
360        ;// dQ_2n = (OMX_U8)(( qSq3q2 + (dQ_2 + qSp0q0q1) >> 1 + 2) >> 2);
361        ;// dQ_2n = (OMX_U8)(( qSq3q2 + qTemp2 >> 1 + 2) >> 2);
362
363        ;// qTemp1-qSq2p1-11
364        ;// qTemp2-qSp0q0q1-12
365        ;// qSq2p1-11
366        ;// qSp0q0q1-12
367
368
369        ;// qTemp2-qSp0q0q1-12
370        ;// qTemp1-qSq2p1-11
371        ;// qSq3q2-13
372        ;// dP_2n-31 = (qSp3p2 + 2) >> 2; must be taken before Q13 is reused for qSq3q2
373
374        VQRSHRN     dP_2n, qSp3p2, #2
375        VADDL       qSq3q2, dQ_3, dQ_2
376
377        VSHR        qSq2p1, #1                ;// qSq2p1 = (q2 + p1) >> 1, in place
378
379        VHADD       qTemp1, qSp0q0q1, qSq2p1
380        VADDW       qTemp2, qSp0q0q1, dQ_2    ;// qTemp2 = p0 + q0 + q1 + q2
381
382        ;// dHSq0p1-28 = (q0 + p1) >> 1 (D28 is free: dP_0t was consumed by the VBIF above)
383        VHADD       dHSq0p1, dQ_0, dP_1
384
385        VBIF        dP_0n, dP_0, dFilt        ;// keep original p0 where dFilt bits are clear
386        VBIF        dP_1n, dP_1, dApflg       ;// keep original p1 where dApflg bits are clear
387
388        VSRA        qSq3q2, qTemp2, #1        ;// qSq3q2 += qTemp2 >> 1
389
390        ;// dQ_1-Temp2-25
391        ;// dQ_0-Temp2-24; dQ_1n must be narrowed from Q12 first, because writing dQ_0n overwrites D24 (low half of Q12)
392        VQRSHRN     dQ_1n, qTemp2, #2
393        VQRSHRN     dQ_0n, qTemp1, #1
394
395        ;// dQ_0t-Temp1-22 = (dHSq0p1 + q1 + 1) >> 1, used where dAqflg is clear
396        VRHADD      dQ_0t, dHSq0p1, dQ_1
397        VBIF        dQ_1n, dQ_1, dAqflg       ;// keep original q1 where dAqflg bits are clear
398
399        VBIF        dP_2n, dP_2, dApflg       ;// keep original p2 where dApflg bits are clear
400        VBIF        dQ_0n, dQ_0t, dAqflg      ;// where dAqflg bits are clear, Q0 takes dQ_0t
401        VQRSHRN     dQ_2n, qSq3q2, #2         ;// D28 is free: dHSq0p1 was consumed by the VRHADD above
402        VBIF        dQ_0n, dQ_0, dFilt        ;// keep original q0 where dFilt bits are clear
403        VBIF        dQ_2n, dQ_2, dAqflg       ;// keep original q2 where dAqflg bits are clear
404
405        M_END
406
407    ENDIF
408
409
410        END
411