;//
;//
;// File Name:  armVCM4P10_DeblockingLuma_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS



    IF  ARM1136JS

MASK_1  EQU 0x01010101

;// Declare input registers

pQ0        RN 0
StepArg    RN 1
tC0Arg     RN 2
alpha      RN 6

beta       RN 14
bS         RN 14
tC0        RN 14
ptC0       RN 1

;// Declare Local/Temporary variables

;// Pixels
p_0     RN 3
p_1     RN 5
p_2     RN 4
p_3     RN 2
q_0     RN 8
q_1     RN 9
q_2     RN 10
q_3     RN 12


;// Filtering

ap0q0   RN 1
filt    RN 2

m00     RN 7
m01     RN 11

apflg   RN 0
aqflg   RN 6

tC      RN 1


;//Declarations for bSLT4 kernel

pos     RN 7
neg     RN 12

P0a     RN 1
P1a     RN 8
Q0a     RN 7
Q1a     RN 4

u1      RN 3
max     RN 12
min     RN 2



;//Declarations for bSGE4 kernel

q_3b    RN 9
p_3b    RN 0
apqflg  RN 12

P0b     RN 6
P1b     RN 7
P2b     RN 1

Q0b     RN 9
Q1b     RN 0
Q2b     RN 2

;// Miscellaneous

a       RN 0
t0      RN 3
t1      RN 12
t2      RN 7
t3      RN 11
t4      RN 4
t5      RN 1
t8      RN 6
t9      RN 14
t10     RN 5
t11     RN 9

;// Register usage for - armVCM4P10_DeblockingLumabSLT4_unsafe()
;//
;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
;//        - 2 - filt, 0 - apflg, 6 - aqflg
;//        - 11 - m01, 7 - tC0
;//
;// Outputs - 1,8,7,11 - Output Pixels(P0a,P1a,Q0a,Q1a)
;//
;// Registers Corrupted - 0-3,5-12,14


        M_START armVCM4P10_DeblockingLumabSLT4_unsafe, lr

        ;// Since beta <= 18 and alpha <= 255 we know
        ;// -254 <= p0-q0 <= 254
        ;//  -17 <= q1-q0 <= 17
        ;//  -17 <= p1-p0 <= 17

        ;// delta = Clip3( -tC, tC, ((((q0-p0)<<2) + (p1-q1) + 4)>>3))
        ;//
        ;//    Calculate A = (((q0-p0)<<2) + (p1-q1) + 4)>>3
        ;//                = (4*q0 - 4*p0 + p1 - q1 + 4)>>3
        ;//                = ((p1-p0) - (q1-q0) - 3*(p0-q0) + 4)>>3
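        ;// Note (added): the instructions below evaluate this per byte lane,
        ;// i.e. four pixels at a time, using the ARMv6 SIMD byte operations
        ;// (USUB8/SSUB8/SHSUB8/SADD8).  An illustrative scalar C sketch of
        ;// the same filter step (clip3() and these names are assumed here,
        ;// not part of this file):
        ;//
        ;//     int A     = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3;
        ;//     int delta = clip3(-tC, tC, A);
        ;//     P0 = clip3(0, 255, p0 + delta);
        ;//     Q0 = clip3(0, 255, q0 - delta);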

        USUB8   t1, p_1, p_0
        MUL     tC0, t2, m01

        USUB8   t2, q_1, q_0
        SSUB8   t1, t1, t2

        USUB8   t2, p_0, q_0
        AND     t2, t2, m01
        SHSUB8  t1, t1, t2
        UHSUB8  t5, p_0, q_0
        SSUB8   t1, t1, t2
        SHSUB8  t1, t1, t5
        MOV     m00, #0
        SADD8   t1, t1, m01
        SHSUB8  t1, t1, t5

        ;// tC = tC0
        ;// if (ap < beta) tC++;
        ;// if (aq < beta) tC++;
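        ;// Note (added): tC0 was replicated into all four byte lanes above by
        ;// the MUL against m01 (0x01010101); apflg/aqflg are assumed to hold
        ;// 0 or 1 per lane, so the two saturating byte adds (UQADD8) below
        ;// perform both conditional increments for all four pixels at once.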
        USUB8   t5, filt, m01
        SEL     tC0, tC0, m00
        UQADD8  tC, tC0, apflg
        SSUB8   t1, t1, m00
        UQADD8  tC, tC, aqflg

        ;// Split into positive and negative part and clip
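        ;// Note (added): Clip3(-tC, tC, delta) is realised per byte lane by
        ;// splitting delta into its positive and negative parts and clamping
        ;// each against tC; roughly, for each pixel (illustrative names):
        ;//     pos = min(max( delta, 0), tC);
        ;//     neg = min(max(-delta, 0), tC);
        ;//     P0  = sat(p0 + pos - neg);  Q0 = sat(q0 - pos + neg);
        ;// where sat() stands for the UQADD8/UQSUB8 saturation.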
        SEL     pos, t1, m00
        USUB8   neg, pos, t1
        USUB8   t3, pos, tC
        SEL     pos, tC, pos
        USUB8   t3, neg, tC
        SEL     neg, tC, neg

        ;//Reload m01
        LDR     m01,=MASK_1

        UQADD8  P0a, p_0, pos
        UQSUB8  Q0a, q_0, pos
        UQSUB8  P0a, P0a, neg
        UQADD8  Q0a, Q0a, neg

        ;// Choose to store the filtered
        ;// value or the original pixel
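        ;// Note (added): USUB8 against m01 (0x01 per byte) sets the GE flags
        ;// in the lanes where filt is non-zero, so the SELs keep the filtered
        ;// byte there and fall back to the original pixel elsewhere.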
        USUB8   t1, filt, m01
        SEL     P0a, P0a, p_0
        SEL     Q0a, Q0a, q_0

        ;// delta = (p2 + ((p0+q0+1)>>1) - (p1<<1))>>1;
        ;// u1 = (p0 + q0 + 1)>>1
        ;// u1 = ( (q_0 - p_0')>>1 ) ^ 0x80
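        ;// Note (added): (p0 + q0 + 1)>>1 would overflow an 8-bit lane, so it
        ;// is built as ((q0 - (255 - p0))>>1) ^ 0x80: MVN forms 255 - p0,
        ;// UHSUB8 halves the difference, and the XOR with 0x80 adds back the
        ;// missing 128, giving the rounded per-byte average.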
        MVN     p_0, p_0
        UHSUB8  u1, q_0, p_0
        UQADD8  max, p_1, tC0
        EOR     u1, u1, m01, LSL #7

        ;// Calculate A = (p2+u1)>>1
        ;// Then delta = Clip3( -tC0, tC0, A - p1)
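        ;// Note (added): p1 + Clip3(-tC0, tC0, A - p1) is computed directly
        ;// as A clamped to the range [p1 - tC0, p1 + tC0]; max/min below are
        ;// those saturating bounds (UQADD8/UQSUB8) and the USUB8 + SEL pairs
        ;// apply the clamp per byte lane.  The same pattern is reused for Q1.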

        ;// Clip P1
        UHADD8  P1a, p_2, u1
        UQSUB8  min, p_1, tC0
        USUB8   t4, P1a, max
        SEL     P1a, max, P1a
        USUB8   t4, P1a, min
        SEL     P1a, P1a, min

        ;// Clip Q1
        UHADD8  Q1a, q_2, u1
        UQADD8  max, q_1, tC0
        UQSUB8  min, q_1, tC0
        USUB8   t0, Q1a, max
        SEL     Q1a, max, Q1a
        USUB8   t0, Q1a, min
        SEL     Q1a, Q1a, min

        ;// Choose to store the filtered
        ;// value or the original pixel
        USUB8   t0, apflg, m01
        SEL     P1a, P1a, p_1
        USUB8   t0, aqflg, m01
        SEL     t3, Q1a, q_1

        M_END

;// Register usage for - armVCM4P10_DeblockingLumabSGE4_unsafe()
;//
;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
;//        - 2 - filt, 0 - apflg,aqflg
;//        - 1 - ap0q0, 6 - alpha
;//        - 7 - m00, 11 - m01
;//
;// Outputs - 6,7,1,9,0,2 - Output Pixels(P0b,P1b,P2b, Q0b,Q1b,Q2b)
;//
;// Registers Corrupted - 0-3,5-12,14

        M_START armVCM4P10_DeblockingLumabSGE4_unsafe, lr

        ;// apflg = apflg && |p0-q0|<((alpha>>2)+2)
        ;// aqflg = aqflg && |p0-q0|<((alpha>>2)+2)
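        ;// Note (added): the threshold (alpha>>2)+2 is formed per byte lane
        ;// by two halving adds against zero followed by adding m01, LSL #1
        ;// (0x02 per byte); USUB8 then compares |p0-q0| against it and SEL
        ;// clears the packed ap/aq flags in the lanes that fail the test,
        ;// leaving the gated flags in apqflg.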

        M_ARG   pDummy,4
        M_ARG   pQ_3,4
        M_ARG   pP_3,4

        UHADD8  alpha, alpha, m00
        USUB8   t9, p_2, p_0    ;//t9 = dp2p0
        UHADD8  alpha, alpha, m00
        ADD     alpha, alpha, m01, LSL #1
        USUB8   ap0q0, ap0q0, alpha
        SEL     apqflg, m00, apflg

        ;// P0 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
        ;//    = ((p2-p0) + 2*(p1-p0) + (q1-q0) + 3*(q0-p0) + 8*p0 + 4)>>3
        ;//    = p0 + (((p2-p0) + 2*(p1-p0) + (q1-q0) - 3*(p0-q0) + 4)>>3)

        ;// P1 = (p2 + p1 + q0 + p0 + 2)>>2
        ;//    = p0 + (((p2-p0) + (p1-p0) - (p0-q0) + 2)>>2)

        ;// P2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3
        ;//    = (2*(p3-p0) + 3*(p2-p0) + (p1-p0) - (p0-q0) + 8*p0 + 4)>>3
        ;//    = p0 + (((p3-p0) + (p2-p0) + t2 + 2)>>2)
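        ;// Note (added): an illustrative scalar C sketch of the P side of the
        ;// bS==4 luma filter implemented below (standard equations, matching
        ;// the derivations above; the names are not part of this file):
        ;//
        ;//     if (ap < beta && abs(p0 - q0) < (alpha>>2) + 2) {
        ;//         P0 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3;
        ;//         P1 = (p2 + p1 + p0 + q0 + 2) >> 2;
        ;//         P2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3;
        ;//     } else {
        ;//         P0 = (2*p1 + p0 + q1 + 2) >> 2;  /* P1, P2 unchanged */
        ;//     }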

        ;// Compute P0b
        USUB8   t2, p_0, q_0
        SSUB8   t5, t9, t2

        USUB8   t8, q_1, q_0
        SHADD8  t8, t5, t8

        USUB8   t9, p_1, p_0
        SADD8   t8, t8, t9
        SHSUB8  t8, t8, t2
        SHADD8  t5, t5, t9
        SHADD8  t8, t8, m01
        SHADD8  t9, t5, m01
        SADD8   P0b, p_0, t8
        ;// P0b ready

        ;// Compute P1b
        M_LDR   p_3b, pP_3
        SADD8   P1b, p_0, t9
        ;// P1b ready

        ;// Compute P2b
        USUB8   t9, p_2, p_0
        SADD8   t5, t5, t9
        UHSUB8  t9, p_3b, p_0
        EOR     a, p_3b, p_0
        AND     a, a, m01
        SHADD8  t5, t5, a
        UHADD8  a, p_0, q_1
        SADD8   t5, t5, m01
        SHADD8  t5, t5, t9
        MVN     t9, p_1
        SADD8   P2b, p_0, t5
        ;// P2b ready

        UHSUB8  a, a, t9
        ORR     t9, apqflg, m01
        USUB8   t9, apqflg, t9

        EOR     a, a, m01, LSL #7
        SEL     P0b, P0b, a
        SEL     P1b, P1b, p_1
        SEL     P2b, P2b, p_2

        USUB8   t4, filt, m01
        SEL     P0b, P0b, p_0


        ;// Q0 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4)>>3
        ;//    = ((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 8*q0 + 4)>>3
        ;//    = q0 + (((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 4)>>3)

        ;// Q1 = (q2 + q1 + p0 + q0 + 2)>>2
        ;//    = q0 + (((q2-q0) + (q1-q0) + (p0-q0) + 2)>>2)

        ;// Q2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4)>>3
        ;//    = (2*(q3-q0) + 3*(q2-q0) + (q1-q0) + (p0-q0) + 8*q0 + 4)>>3
        ;//    = q0 + (((q3-q0) + (q2-q0) + t2 + 2)>>2)
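        ;// Note (added): the Q side below mirrors the P side with p and q
        ;// exchanged; where the aq part of apqflg fails, the fallback
        ;// Q0 = (2*q1 + q0 + p1 + 2) >> 2 is selected and Q1/Q2 keep their
        ;// original values.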


        ;// Compute Q0b Q1b
        USUB8   t4, q_2, q_0
        USUB8   a, p_0, q_0
        USUB8   t9, p_1, p_0
        SADD8   t0, t4, a
        SHADD8  t9, t0, t9
        UHADD8  t10, q_0, p_1
        SADD8   t9, t9, a
        USUB8   a, q_1, q_0
        SHADD8  t9, t9, a
        SHADD8  t0, t0, a
        SHADD8  t9, t9, m01
        SHADD8  a, t0, m01
        SADD8   t9, q_0, t9
        ;// Q0b ready - t9

        MOV     t4, #0
        UHADD8  apqflg, apqflg, t4

        SADD8   Q1b, q_0, a
        ;// Q1b ready

        USUB8   t4, apqflg, m01
        SEL     Q1b, Q1b, q_1
        MVN     t11, q_1
        UHSUB8  t10, t10, t11
        M_LDR   q_3b, pQ_3
        EOR     t10, t10, m01, LSL #7
        SEL     t9, t9, t10

        ;// Compute Q2b
        USUB8   t4, q_2, q_0
        SADD8   t4, t0, t4
        EOR     t0, q_3b, q_0
        AND     t0, t0, m01
        SHADD8  t4, t4, t0
        UHSUB8  t10, q_3b, q_0
        SADD8   t4, t4, m01
        SHADD8  t4, t4, t10

        USUB8   t10, filt, m01
        SEL     Q0b, t9, q_0

        SADD8   t4, q_0, t4
        ;// Q2b ready - t4

        USUB8   t10, apqflg, m01
        SEL     Q2b, t4, q_2

        M_END

    ENDIF

        END