1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  armVCM4P10_DeblockingLuma_unsafe_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   9641
21;// Date:       Thursday, February 7, 2008
22;//
23;//
24;//
25;//
26
27        INCLUDE omxtypes_s.h
28        INCLUDE armCOMM_s.h
29
30        M_VARIANTS ARM1136JS
31
32
33
34    IF  ARM1136JS
35
;// Per-lane constant: a 1 in each of the 4 byte lanes, used by the
;// ARMv6 SIMD byte instructions (each 32-bit register holds 4 pixels).
36MASK_1  EQU 0x01010101
37
38;// Declare input registers
39
;// NOTE(review): many symbolic names below alias the same physical
;// register (e.g. beta/bS/tC0 are all r14; t3 and m01 are both r11;
;// t2 and m00 are both r7).  The aliases have disjoint lifetimes, but
;// reassigning one name silently clobbers the others - keep this table
;// in mind when reading the kernels.
40pQ0        RN 0
41StepArg    RN 1
42tC0Arg     RN 2
43alpha      RN 6
44
45beta       RN 14
46bS         RN 14
47tC0        RN 14
48ptC0       RN 1
49
50;// Declare Local/Temporary variables
51
52;// Pixels
;// p_0..p_3 / q_0..q_3: 4 packed pixels each side of the edge,
;// one pixel per byte lane.
53p_0     RN 3
54p_1     RN 5
55p_2     RN 4
56p_3     RN 2
57q_0     RN 8
58q_1     RN 9
59q_2     RN 10
60q_3     RN 12
61
62
63;// Filtering
64
65ap0q0   RN 1
66filt    RN 2
67
;// m00 = 0x00000000, m01 = 0x01010101 (per-lane 0 and 1 masks).
68m00     RN 7
69m01     RN 11
70
71apflg   RN 0
72aqflg   RN 6
73
74tC      RN 1
75
76
77;//Declarations for bSLT4 kernel
78
79pos     RN 7
80neg     RN 12
81
82P0a     RN 1
83P1a     RN 8
84Q0a     RN 7
85Q1a     RN 4
86
87u1      RN 3
88max     RN 12
89min     RN 2
90
91
92
93;//Declarations for bSGE4 kernel
94
95q_3b    RN 9
96p_3b    RN 0
97apqflg  RN 12
98
99P0b     RN 6
100P1b     RN 7
101P2b     RN 1
102
103Q0b     RN 9
104Q1b     RN 0
105Q2b     RN 2
106
107;// Miscellanous
108
;// General scratch names; note the aliasing with the names above
;// (a=apflg=r0, t2=m00=r7, t3=m01=r11, t9=tC0=r14, ...).
109a       RN 0
110t0      RN 3
111t1      RN 12
112t2      RN 7
113t3      RN 11
114t4      RN 4
115t5      RN 1
116t8      RN 6
117t9      RN 14
118t10     RN 5
119t11     RN 9
120
121;// Register usage for - armVCM4P10_DeblockingLumabSLT4_unsafe()
122;//
123;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
124;//        - 2 - filt, 0 - apflg, 6 - aqflg
125;//        - 11 - m01, 7 - tC0
126;//
127;// Outputs - 1,8,7,11 - Output Pixels(P0a,P1a,Q0a,Q1a)
128;//
129;// Registers Corrupted - 0-3,5-12,14
130
131
132        M_START armVCM4P10_DeblockingLumabSLT4_unsafe, lr
133
        ;// Weak-filter (bS < 4) kernel of the H.264 luma deblocking
        ;// filter.  Each 32-bit register holds 4 pixels (one per byte
        ;// lane), so the ARMv6 SIMD byte instructions filter 4 edge
        ;// positions in parallel.  NOTE: SEL consumes the GE[3:0] flags
        ;// set by the nearest preceding USUB8/SSUB8/SADD8, so the exact
        ;// instruction ordering below is load-bearing, not cosmetic.

134        ;// Since beta <= 18 and alpha <= 255 we know
135        ;// -254 <= p0-q0 <= 254
136        ;//  -17 <= q1-q0 <= 17
137        ;//  -17 <= p1-p0 <= 17
138
139        ;// delta = Clip3( -tC, tC, ((((q0-p0)<<2) + (p1-q1) + 4)>>3))
140        ;//
141        ;//    Calculate A = (((q0-p0)<<2) + (p1-q1) + 4)>>3
142        ;//                = (4*q0 - 4*p0 + p1 - q1 + 4)>>3
143        ;//                = ((p1-p0) - (q1-q0) - 3*(p0-q0) + 4)>>3
144
145        USUB8   t1, p_1, p_0
        ;// t2 aliases r7, which carries the 8-bit tC0 input (see header);
        ;// multiplying by m01 (0x01010101) replicates it into all 4 lanes.
146        MUL     tC0, t2, m01
147
148        USUB8   t2, q_1, q_0
149        SSUB8   t1, t1, t2
150
        ;// Accumulate A per lane; the halving forms (SHSUB8/UHSUB8) keep
        ;// the partial sums inside signed-byte range (bounds above).
151        USUB8   t2, p_0, q_0
152        AND     t2, t2, m01
153        SHSUB8  t1, t1, t2
154        UHSUB8  t5, p_0, q_0
155        SSUB8   t1, t1, t2
156        SHSUB8  t1, t1, t5
157        MOV     m00, #0
158        SADD8   t1, t1, m01
159        SHSUB8  t1, t1, t5
160
161        ;// tC = tC0
162        ;// if (ap < beta) tC++;
163        ;// if (aq < beta) tC++;
        ;// GE := (filt >= 1) per lane; lanes with filtering disabled get
        ;// tC0 = 0, i.e. a zero clipping window.  apflg/aqflg lanes hold
        ;// 0 or 1, so the two saturating adds implement the tC++ above.
164        USUB8   t5, filt, m01
165        SEL     tC0, tC0, m00
166        UQADD8  tC, tC0, apflg
        ;// Subtracting zero leaves t1 (= delta) unchanged but sets
        ;// GE[i] = (lane i >= 0, signed) for the SEL below.
167        SSUB8   t1, t1, m00
168        UQADD8  tC, tC, aqflg
169
170        ;// Split into positive and negative part and clip
        ;// pos = min(max(delta,0), tC); neg = min(-min(delta,0), tC)
171        SEL     pos, t1, m00
172        USUB8   neg, pos, t1
173        USUB8   t3, pos, tC
174        SEL     pos, tC, pos
175        USUB8   t3, neg, tC
176        SEL     neg, tC, neg
177
178        ;//Reload m01
        ;// (t3 used above aliases r11 = m01, so the mask was clobbered)
179        LDR     m01,=MASK_1
180
        ;// P0a = clip(p0 + delta), Q0a = clip(q0 - delta) via
        ;// saturating byte adds/subtracts.
181        UQADD8  P0a, p_0, pos
182        UQSUB8  Q0a, q_0, pos
183        UQSUB8  P0a, P0a, neg
184        UQADD8  Q0a, Q0a, neg
185
186        ;// Choose to store the filtered
187        ;// value or the original pixel
        ;// GE := (filt >= 1) per lane selects filtered vs. original.
188        USUB8   t1, filt, m01
189        SEL     P0a, P0a, p_0
190        SEL     Q0a, Q0a, q_0
191
192        ;// delta = (p2 + ((p0+q0+1)>>1) - (p1<<1))>>1;
193        ;// u1 = (p0 + q0 + 1)>>1
194        ;// u1 = ( (q_0 - p_0')>>1 ) ^ 0x80
        ;// Complementing p0 turns the rounded average into a halving
        ;// subtract that cannot overflow a lane; EOR 0x80808080
        ;// (m01 << 7) removes the bias afterwards.
195        MVN     p_0, p_0
196        UHSUB8  u1, q_0, p_0
197        UQADD8  max, p_1, tC0
198        EOR     u1, u1, m01 ,LSL #7
199
200        ;// Calculate A = (p2+u1)>>1
201        ;// Then delta = Clip3( -tC0, tC0, A - p1)
202
203        ;// Clip P1
        ;// P1a = (p2+u1)>>1 clamped to the window [p1-tC0, p1+tC0].
204        UHADD8  P1a, p_2, u1
205        UQSUB8  min, p_1, tC0
206        USUB8   t4, P1a, max
207        SEL     P1a, max, P1a
208        USUB8   t4, P1a, min
209        SEL     P1a, P1a, min
210
211        ;// Clip Q1
        ;// Same clamp on the q side: window [q1-tC0, q1+tC0].
212        UHADD8  Q1a, q_2, u1
213        UQADD8  max, q_1, tC0
214        UQSUB8  min, q_1, tC0
215        USUB8   t0, Q1a, max
216        SEL     Q1a, max, Q1a
217        USUB8   t0, Q1a, min
218        SEL     Q1a, Q1a, min
219
220        ;// Choose to store the filtered
221        ;// value or the original pixel
        ;// P1 is only updated in lanes where ap < beta (apflg), Q1 only
        ;// where aq < beta (aqflg).  Q1 result is returned in t3 (r11),
        ;// matching the "Outputs" list in the header.
222        USUB8   t0, apflg, m01
223        SEL     P1a, P1a, p_1
224        USUB8   t0, aqflg, m01
225        SEL     t3, Q1a, q_1
226
227        M_END
228
229;// Register usage for - armVCM4P10_DeblockingLumabSGE4_unsafe()
230;//
231;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
232;//        - 2 - filt, 0 - apflg,aqflg
233;//        - 1 - ap0q0, 6 - alpha
234;//        - 7 - m00, 11 - m01
235;//
236;// Outputs - 6,7,1,9,0,2 - Output Pixels(P0b,P1b,P2b, Q0b,Q1b,Q2b)
237;//
238;// Registers Corrupted - 0-3,5-12,14
239
240        M_START armVCM4P10_DeblockingLumabSGE4_unsafe, lr
241
        ;// Strong-filter (bS == 4) kernel of the H.264 luma deblocking
        ;// filter; 4 pixels per register, one per byte lane.  As in the
        ;// bSLT4 kernel, SEL depends on the GE flags set by the nearest
        ;// preceding USUB8/SSUB8/SADD8 - instruction order matters.
        ;// r0 carries both flags packed per lane (apflg in bit 0, aqflg
        ;// in bit 1): bit 0 is tested via the ORR/USUB8 pair below, and
        ;// the register is halved (UHADD8 with zero) before the q-side
        ;// tests so that bit 1 moves down to bit 0.

242        ;// apflg = apflg && |p0-q0|<((alpha>>2)+2)
243        ;// aqflg = aqflg && |p0-q0|<((alpha>>2)+2)
244
        ;// Stack-argument offsets.  pP_3/pQ_3 are loaded with M_LDR below
        ;// and then used directly as packed pixel words - presumably the
        ;// caller stored the 4 p3/q3 bytes there; verify against caller.
245        M_ARG   pDummy,4
246        M_ARG   pQ_3,4
247        M_ARG   pP_3,4
248
        ;// Build threshold (alpha>>2)+2 in every lane (m00 is zero, so
        ;// each UHADD8 is a per-lane >>1), then clear the flag lanes
        ;// where |p0-q0| is not below it: GE := (ap0q0 >= thresh), and
        ;// SEL keeps 0 for those lanes, apflg bits otherwise.
249        UHADD8  alpha, alpha, m00
250        USUB8   t9, p_2, p_0    ;//t9 = dp2p0
251        UHADD8  alpha, alpha, m00
252        ADD     alpha, alpha, m01, LSL #1
253        USUB8   ap0q0, ap0q0, alpha
254        SEL     apqflg, m00, apflg
255
256        ;// P0 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
257        ;//    = ((p2-p0) + 2*(p1-p0) + (q1-q0) + 3*(q0-p0) + 8*p0 + 4)>>3
258        ;//    = p0 + (((p2-p0) + 2*(p1-p0) + (q1-q0) - 3*(p0-q0) + 4)>>3)
259
260        ;// P1 = (p2 + p1 + q0 + p0 + 2)>>2
261        ;//    = p0 + (((p2-p0) + (p1-p0) - (p0-q0) + 2)>>2)
262
263        ;// P2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3
264        ;//    = (2*(p3-p0) + 3*(p2-p0) + (p1-p0) - (p0-q0) + 8*p0 + 4)>>3
265        ;//    = p0 + (((p3-p0) + (p2-p0) + t2 + 2)>>2)
266
267        ;// Compute P0b
268        USUB8   t2, p_0, q_0
269        SSUB8   t5, t9, t2
270
271        USUB8   t8, q_1, q_0
272        SHADD8  t8, t5, t8
273
274        USUB8   t9, p_1, p_0
275        SADD8   t8, t8, t9
276        SHSUB8  t8, t8, t2
        ;// t5 doubles as the running sum for P1/P2 (see formulas above).
277        SHADD8  t5, t5, t9
278        SHADD8  t8, t8, m01
279        SHADD8  t9, t5, m01
280        SADD8   P0b, p_0, t8
281        ;// P0b ready
282
283        ;// Compute P1b
        ;// p_3b aliases r0; the packed flags were already copied to
        ;// apqflg (r12), so r0 is free to receive the p3 pixels now.
284        M_LDR   p_3b, pP_3
285        SADD8   P1b, p_0, t9
286        ;// P1b ready
287
288        ;// Compute P2b
289        USUB8   t9, p_2, p_0
290        SADD8   t5, t5, t9
        ;// (p3+p0)>>1 with rounding correction: UHSUB8 of the halves plus
        ;// the carry bit recovered via EOR/AND with m01.
291        UHSUB8  t9, p_3b, p_0
292        EOR     a, p_3b, p_0
293        AND     a, a, m01
294        SHADD8  t5, t5, a
295        UHADD8  a, p_0, q_1
296        SADD8   t5, t5, m01
297        SHADD8  t5, t5, t9
298        MVN     t9, p_1
299        SADD8   P2b, p_0, t5
300        ;// P2b ready
301
        ;// Fallback P0 for lanes failing the ap test:
        ;// a = (2*p1 + p0 + q1 + 2)>>2, built with the complement trick
        ;// (MVN + UHSUB8, bias removed by the EOR 0x80808080 below).
302        UHSUB8  a, a, t9
        ;// GE := (apqflg >= apqflg|1) per lane - true only where bit 0
        ;// (the ap flag) is set; gates the P0b/P1b/P2b selections below.
303        ORR     t9, apqflg, m01
304        USUB8   t9, apqflg, t9
305
306        EOR     a, a, m01, LSL #7
307        SEL     P0b, P0b, a
308        SEL     P1b, P1b, p_1
309        SEL     P2b, P2b, p_2
310
        ;// Lanes with filtering disabled (filt == 0) keep the original p0.
311        USUB8   t4, filt, m01
312        SEL     P0b, P0b, p_0
313
314
315        ;// Q0 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4)>>3
316        ;//    = ((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 8*q0 + 4)>>3
317        ;//    = q0 + (((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 4)>>3)
318
319        ;// Q1 = (q2 + q1 + p0 + q0 + 2)>>2
320        ;//    = q0 + (((q2-q0) + (q1-q0) + (p0-q0) + 2)>>2)
321
322        ;// Q2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4)>>3
323        ;//    = (2*(q3-q0) + 3*(q2-q0) + (q1-q0) + (p0-q0) + 8*q0 + 4)>>3
324        ;//    = q0 + (((q3-q0) + (q2-q0) + t2 + 2)>>2)
325
326
327        ;// Compute Q0b Q1b
328        USUB8   t4, q_2, q_0
329        USUB8   a, p_0, q_0
330        USUB8   t9, p_1, p_0
331        SADD8   t0, t4, a
332        SHADD8  t9, t0, t9
        ;// t10 starts the fallback Q0 average (p1+q0)>>1, finished at
        ;// the MVN/UHSUB8/EOR sequence below.
333        UHADD8  t10, q_0, p_1
334        SADD8   t9, t9, a
335        USUB8   a, q_1, q_0
336        SHADD8  t9, t9, a
337        SHADD8  t0, t0, a
338        SHADD8  t9, t9, m01
339        SHADD8  a, t0, m01
340        SADD8   t9, q_0, t9
341        ;// Q0b ready - t9
342
        ;// Halve the packed flags so the aq flag (bit 1) drops into
        ;// bit 0 for the q-side selections.
343        MOV     t4, #0
344        UHADD8  apqflg, apqflg, t4
345
346        SADD8   Q1b, q_0, a
347        ;// Q1b ready
348
        ;// GE := (aq flag set) per lane; keep original q1 elsewhere.
349        USUB8   t4, apqflg, m01
350        SEL     Q1b, Q1b, q_1
        ;// Fallback Q0 = (2*q1 + q0 + p1 + 2)>>2 (complement trick again);
        ;// selected into t9 where the aq test failed.
351        MVN     t11, q_1
352        UHSUB8  t10, t10, t11
353        M_LDR   q_3b, pQ_3
354        EOR     t10, t10, m01, LSL #7
355        SEL     t9, t9, t10
356
357        ;// Compute Q2b
358        USUB8   t4, q_2, q_0
359        SADD8   t4, t0, t4
        ;// (q3+q0)>>1 with rounding carry via EOR/AND, as on the p side.
360        EOR     t0, q_3b, q_0
361        AND     t0, t0, m01
362        SHADD8  t4, t4, t0
363        UHSUB8  t10, q_3b, q_0
364        SADD8   t4, t4, m01
365        SHADD8  t4, t4, t10
366
        ;// filt gate: lanes with filtering disabled keep the original q0.
367        USUB8   t10, filt, m01
368        SEL     Q0b, t9, q_0
369
370        SADD8   t4, q_0, t4
371        ;// Q2b ready - t4
372
        ;// aq gate for Q2: only lanes passing the aq test are updated.
373        USUB8   t10, apqflg, m01
374        SEL     Q2b, t4, q_2
375
376        M_END
377
378    ENDIF
379
380        END
381