omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s revision 0c1bc742181ded4930842b46e9507372f0b1b963
1;//
2;//
3;// File Name:  omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s
4;// OpenMAX DL: v1.0.2
5;// Revision:   9641
6;// Date:       Thursday, February 7, 2008
7;//
8;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
9;//
10;//
11;//
12
13        INCLUDE omxtypes_s.h
14        INCLUDE armCOMM_s.h
15
16        M_VARIANTS ARM1136JS
17
18
19        IF ARM1136JS
20
21
;// Constants used by the filter loop.
;//   MASK_0     - all-zero word; loaded into m00 and used as the SEL operand
;//                that clears byte lanes of filt / tC.
;//   MASK_1     - 0x01 in every byte lane; multiplied with alpha/beta to
;//                replicate them across four lanes, and loaded into m01 as the
;//                per-lane "filter enabled" flag and +1 constant.
;//   MASK_2     - selects byte 1 (bits 15:8) while transposing pixel rows.
;//   LOOP_COUNT - loop state kept in XY and doubled with ADDS per iteration:
;//                0x50000000 -> 0xA0000000 (C=0) -> 0x40000000 (C=1, Z=0)
;//                -> 0x80000000 (C=0) -> 0 (C=1, Z=1).
;//                Carry set leaves LoopX, a zero result leaves LoopY,
;//                giving 2 x 2 iterations in total.
22MASK_0      EQU 0x00000000
23MASK_1      EQU 0x01010101
24MASK_2      EQU 0x0000ff00
25LOOP_COUNT  EQU 0x50000000
26
;// Register alias table.
;// NOTE: most physical registers carry several aliases (for example r2 is
;// pAlphaArg, bS, row0, tunpk0, tunpk7 and filt; r6 is pThresholds, alpha,
;// alpha0, P_0 and t4). An alias is therefore only meaningful between the
;// instruction that writes it and the next write to the same physical
;// register through any other alias. Values that must survive longer are
;// spilled to the stack slots declared with M_ALLOC and reloaded.
27;// Declare input registers
28
29pSrcDst     RN 0
30srcdstStep  RN 1
31pAlphaArg   RN 2
32pBetaArg    RN 3
33
34pThresholds RN 6
35pBS         RN 9
;// pQ0 aliases pSrcDst (both r0); it is declared a second time further down
;// with the same register number.
36pQ0         RN 0
37bS          RN 2
38bSTemp      RN 10
39
;// alpha/alpha0 and beta/beta0 share registers; alpha1/beta1 are only used
;// during setup before r8/r9 are reused as XY/q_0 and pBS/q_1.
40alpha       RN 6
41alpha0      RN 6
42alpha1      RN 8
43
44beta        RN 7
45beta0       RN 7
46beta1       RN 9
47
48;// Declare Local/Temporary variables
49
50;// Pixels
;// Each holds one pixel column, one byte lane per row (four rows).
51p_0         RN 3
52p_1         RN 5
53q_0         RN 8
54q_1         RN 9
55
56;// Unpacking
57mask        RN 11
58
;// row0..row3: halfwords loaded from the p side of the edge.
59row0        RN 2
60row1        RN 4
61row2        RN 5
62row3        RN 3
63
;// row4..row7: halfwords loaded from the q side of the edge.
64row4        RN 8
65row5        RN 9
66row6        RN 10
67row7        RN 12
68
;// tunpk*: intermediate two-row packs produced while transposing rows.
69tunpk0      RN 2
70tunpk2      RN 10
71tunpk3      RN 12
72
73tunpk4      RN 4
74tunpk5      RN 5
75tunpk6      RN 14
76tunpk7      RN 2
77
78;// Filtering
79
;// The three difference temporaries all live in r12 and are used one at a time.
80dp0q0       RN 12
81dp1p0       RN 12
82dq1q0       RN 12
83
84ap0q0       RN 4
;// filt: per-lane filter enable (0x01 = filter this row, 0x00 = leave as is).
85filt        RN 2
86
;// m00 / m01 hold MASK_0 / MASK_1 during the filtering decision and kernels.
87m00         RN 14
88m01         RN 11
89
;// Repeated declaration of pQ0 (same register as above); Step aliases
;// srcdstStep and is not referenced by the code in this file section.
90pQ0         RN 0
91Step        RN 1
92
93;// Output
94
;// Filtered p0 / q0 columns; they overwrite alpha / beta (r6 / r7), which are
;// reloaded from the stack at the end of every iteration.
95P_0         RN 6
96Q_0         RN 7
97
98;//Declarations for bSLT4 kernel
99
100tC          RN 12
101tC0         RN 5
102tC1         RN 12
103pos         RN 5
104neg         RN 9
105
106;//Declarations for bSGE4 kernel
107
108
109;// Miscellaneous
;// XY holds the combined LoopX/LoopY counter (see LOOP_COUNT).
110XY          RN 8
111
112a           RN 10
113t1          RN 10
114t2          RN 12
115t3          RN 14
116t4          RN 6
117t5          RN 5
118
119
;//-----------------------------------------------------------------------
;// omxVCM4P10_FilterDeblockingChroma_VerEdge_I
;//
;// In-place deblocking of vertical chroma edges, four rows at a time using
;// the ARMv6 byte-lane SIMD instructions.
;//
;// In:   r0       = pSrcDst  (points at q0 of row 0; p0/p1 are at -1/-2,
;//                            q1 at +1, as the loads and stores below show)
;//       r1       = srcdstStep (byte distance between rows)
;//       r2       = pAlpha   (two bytes: [0] for the first edge, [1] second)
;//       r3       = pBeta    (two bytes, same indexing as pAlpha)
;//       [stack]  = pThresholds, pBS
;// Out:  r0       = OMX_Sts_NoErr
;//
;// Loop structure: XY (seeded with LOOP_COUNT) drives 2 x 2 iterations.
;// The inner step (LoopX) moves 4 pixels to the right and uses alpha1/beta1
;// for its second pass; the outer step (LoopY) moves 4 rows down and goes
;// back to alpha0/beta0. LoopY and LoopX label the same address.
;//
;// Per iteration the code reads one halfword of bS (two strength bytes: low
;// byte gates rows 0-1, high byte gates rows 2-3) and, on the bS<4 path,
;// two threshold bytes.
;//-----------------------------------------------------------------------
120        ;// Allocate stack memory
        ;// ppThresholds : current thresholds pointer
        ;// pAlphaBeta0/1: alpha,beta pairs replicated into all four byte lanes
        ;// pXYBS        : loop counter XY
        ;// ppBS         : pBS after the post-incremented bS load
        ;// NOTE(review): pXYBS is declared with 4 bytes but is accessed with
        ;// M_STRD / M_LDRD (XY plus pBS). The loop only makes sense if the second
        ;// word coincides with ppBS, so that the pBS reloaded with XY is the
        ;// post-incremented one. Confirm against the M_ALLOC layout rules in
        ;// armCOMM_s.h.
121        M_ALLOC4 ppThresholds,4
122        M_ALLOC8 pAlphaBeta0,8
123        M_ALLOC8 pAlphaBeta1,8
124        M_ALLOC8 pXYBS,4
125        M_ALLOC4 ppBS,4
126
127        ;// Function header
        ;// NOTE(review): the second M_START argument presumably names the highest
        ;// callee-saved register to preserve; the body also writes r12 and r14.
        ;// Confirm against armCOMM_s.h.
128        M_START omxVCM4P10_FilterDeblockingChroma_VerEdge_I, r11
129
130        ;//Input arguments on the stack
131        M_ARG   ppThresholdsArg, 4
132        M_ARG   ppBSArg, 4
133
        ;// Load alpha/beta bytes and replicate each into all four byte lanes by
        ;// multiplying with 0x01010101. The thresholds pointer is copied to its
        ;// local slot before r6 is reused for alpha0.
134        LDRB    alpha1, [pAlphaArg,#1]
135        LDRB    beta1,  [pBetaArg,#1]
136        M_LDR   pThresholds, ppThresholdsArg
137        LDR     a,=MASK_1
138        LDRB    beta0,  [pBetaArg]
139        M_STR   pThresholds, ppThresholds
140        LDRB    alpha0, [pAlphaArg]
141
142        MUL     alpha1, alpha1, a
143        MUL     beta1, beta1, a
144        MUL     alpha0, alpha0, a
145        MUL     beta0, beta0, a
146
        ;// alpha1/beta1 are saved first because r9 (beta1) is about to become pBS.
        ;// alpha0/beta0 also stay live in r6/r7 for the first iteration.
147        M_STRD  alpha1, beta1, pAlphaBeta1
148        M_LDR   pBS, ppBSArg
149        M_STRD  alpha0, beta0, pAlphaBeta0
150
        ;// Seed the combined loop counter and save it together with pBS.
151        LDR     XY,=LOOP_COUNT
152        M_STRD  XY, pBS, pXYBS
153
154
155LoopY
156LoopX
157;//---------------Load Pixels-------------------
158
159;//----------------Pack q0-q1-----------------------
        ;// Fetch the two bS bytes for this 4-row segment and step pBS by 8.
        ;// The CMP result is consumed by the BEQ.W three instructions later; the
        ;// intervening loads/stores do not change the flags.
160        LDRH    bS, [pBS], #8
161        LDR     mask, =MASK_2
162
163        M_LDRH  row4, [pQ0], srcdstStep
164        CMP     bS, #0
165        M_STR   pBS, ppBS
166        M_LDRH  row5, [pQ0], srcdstStep
        ;// Both bS bytes zero: nothing to filter. pQ0 has advanced two rows here.
167        BEQ.W   NoFilterBS0
168        LDRH    row6, [pQ0]
169        LDRH    row7, [pQ0, srcdstStep]
170
        ;// Lane diagrams are written most-significant byte first.
        ;// Assuming little-endian halfword loads, byte 0 is the pixel at pQ0 (q0)
        ;// and byte 1 the pixel at pQ0+1 (q1) - confirm for big-endian builds.
171        ;// row4 = [0 0 r0q1 r0q0]
172        ;// row5 = [0 0 r1q1 r1q0]
173        ;// row6 = [0 0 r2q1 r2q0]
174        ;// row7 = [0 0 r3q1 r3q0]
175
        ;// tunpk4/6 collect byte 1 of two rows, tunpk5/7 collect byte 0.
        ;// tunpk7 is r2, so bS is overwritten here and re-read later via ppBS.
176        AND     tunpk4, mask, row4
177        AND     tunpk5, mask, row4, LSL#8
178        UXTAB   tunpk4, tunpk4, row5, ROR#8
179        UXTAB   tunpk5, tunpk5, row5
180        AND     tunpk6, mask, row6
181        AND     tunpk7, mask, row6, LSL#8
182        UXTAB   tunpk6, tunpk6, row7, ROR#8
183        UXTAB   tunpk7, tunpk7, row7
184
185        ;// tunpk4 = [0 0 r0q1 r1q1]
186        ;// tunpk5 = [0 0 r0q0 r1q0]
187        ;// tunpk6 = [0 0 r2q1 r3q1]
188        ;// tunpk7 = [0 0 r2q0 r3q0]
189
        ;// Rewind to row 0 and step two bytes left, onto p1 of row 0.
190        SUB     pQ0, pQ0, srcdstStep, LSL #1
191        SUB     pQ0, pQ0, #2
192
193        PKHBT   q_1, tunpk6, tunpk4, LSL#16
194        PKHBT   q_0, tunpk7, tunpk5, LSL#16
195
196        ;// q_0 = [r0q0 r1q0 r2q0 r3q0]
197        ;// q_1 = [r0q1 r1q1 r2q1 r3q1]
198
199
200;//----------------Pack p0-p1-----------------------
201
        ;// Same transpose for the p side. After these loads pQ0 sits two rows
        ;// down and two bytes left of its value at the top of the iteration.
202        M_LDRH  row0, [pQ0], srcdstStep
203        M_LDRH  row1, [pQ0], srcdstStep
204        LDRH    row2, [pQ0]
205        LDRH    row3, [pQ0, srcdstStep]
206
207        ;// row0 = [0 0 r0p0 r0p1]
208        ;// row1 = [0 0 r1p0 r1p1]
209        ;// row2 = [0 0 r2p0 r2p1]
210        ;// row3 = [0 0 r3p0 r3p1]
211
212        AND     tunpk2, mask, row0
213        AND     tunpk6, mask, row0, LSL#8
214        UXTAB   tunpk2, tunpk2, row1, ROR#8
215        UXTAB   tunpk6, tunpk6, row1
216
217        AND     tunpk0, mask, row2
218        AND     tunpk3, mask, row2, LSL#8
219        UXTAB   tunpk0, tunpk0, row3, ROR#8
220        UXTAB   tunpk3, tunpk3, row3
221
222        ;// tunpk2 = [0 0 r0p0 r1p0]
223        ;// tunpk6 = [0 0 r0p1 r1p1]
224        ;// tunpk0 = [0 0 r2p0 r3p0]
225        ;// tunpk3 = [0 0 r2p1 r3p1]
226
        ;// bSTemp first receives the saved (already advanced) pBS; the bS
        ;// halfword itself is re-read from [bSTemp, #-8] below.
227        PKHBT   p_0, tunpk0, tunpk2, LSL#16
228        M_LDR   bSTemp, ppBS
229        PKHBT   p_1, tunpk3, tunpk6, LSL#16
230
231        ;// p_0 = [r0p0 r1p0 r2p0 r3p0]
232        ;// p_1 = [r0p1 r1p1 r2p1 r3p1]
233
234;//--------------Filtering Decision -------------------
        ;// filt starts as 0x01 in every lane. Lanes are cleared where the
        ;// matching bS byte is zero and where any of the alpha/beta tests fail.
        ;// The GE flags from this USUB8 stay valid until the next USUB8, because
        ;// LDR/LDRH/MOV/TST do not modify them.
235        USUB8   dp0q0, p_0, q_0
236        LDR     m01, =MASK_1
237        LDRH    bSTemp, [bSTemp ,#-8]
238        MOV     m00, #MASK_0                ;//  00000000 mask
239
        ;// High bS byte zero -> keep only the upper two lanes (rows 0-1) shifted
        ;// out of the lower ones; low bS byte zero -> clear the upper two lanes.
        ;// The final TST sets Z when bit 2 of bSTemp is clear; that Z flag is
        ;// consumed by "BEQ bSLT4" after the threshold tests, none of which
        ;// change N/Z/C/V.
240        MOV     filt, m01
241        TST     bSTemp, #0xff00
242        MOVEQ   filt, filt, LSL #16
243        TST     bSTemp, #0xff
244        MOVEQ   filt, filt, LSR #16
245        TST     bSTemp, #4
246
247        ;// Check |p0-q0|<Alpha
        ;// SEL picks q0-p0 where q0>=p0, else p0-q0, giving the absolute
        ;// difference; the second USUB8/SEL clears filt where it is >= alpha.
248        USUB8   a, q_0, p_0
249        SEL     ap0q0, a, dp0q0
250        USUB8   a, ap0q0, alpha
251        SEL     filt, m00, filt
252
253        ;// Check |p1-p0|<Beta
254        USUB8   dp1p0, p_1, p_0
255        USUB8   a, p_0, p_1
256        SEL     a, a, dp1p0
257        USUB8   a, a, beta
258        SEL     filt, m00, filt
259
260        ;// Check |q1-q0|<Beta
261        USUB8   dq1q0, q_1, q_0
262        USUB8   a, q_0, q_1
263        SEL     a, a, dq1q0
264        USUB8   a, a, beta
265        SEL     filt, m00, filt
266
        ;// Z still comes from "TST bSTemp, #4": take the bS<4 kernel when bit 2
        ;// of the bS halfword is clear.
267        BEQ     bSLT4
268;//-------------------Filter--------------------
269bSGE4
270        ;//---------bSGE4 Execution---------------
        ;// The CMP result is used by the BEQ below; M_LDR and UHADD8 leave
        ;// N/Z/C/V untouched.
271        CMP     filt, #0
272
273        M_LDR   pThresholds, ppThresholds
274
275        ;// Compute P0b
        ;// t1 = ((p0+q1)>>1 + p1 + 1)>>1 per lane, built from UHADD8, a halving
        ;// subtract of ~p1 and a flip of bit 7 (EOR with 0x80808080).
276        UHADD8  t1, p_0, q_1
277        BEQ     NoFilterFilt0
278        MVN     t2, p_1
279        UHSUB8  t1, t1, t2
        ;// Sets GE in every lane where filt is 0x01; consumed by the two SELs.
280        USUB8   t2, filt, m01
281        EOR     t1, t1, m01, LSL #7
282
        ;// This kernel reads no thresholds but still advances the pointer by 4,
        ;// matching the two byte loads (+1, +3) in the bSLT4 kernel.
283        ADD     pThresholds,pThresholds, #4
284
285        ;// Compute Q0b
286        UHADD8  t2, q_0, p_1
287        MVN     t3, q_1
288        UHSUB8  t2, t2, t3
289        M_STR   pThresholds, ppThresholds
        ;// Keep the filtered value only in enabled lanes, original pixel elsewhere.
290        SEL     P_0, t1, p_0
291        EOR     t2, t2, m01, LSL #7
292        SEL     Q_0, t2, q_0
293
294        B       StoreResultAndExit
295
296;//---------- Exit of LoopX --------------
297;//---- for the case of no filtering -----
298
299NoFilterFilt0
        ;// Reached after the p-side loads: undo the 2-byte left offset so pQ0
        ;// matches the state expected at NoFilterBS0 (two rows down, on q0).
300        ADD     pQ0, pQ0, #2
301NoFilterBS0
        ;// Rewind two rows, move 4 pixels right, and skip this segment's
        ;// threshold bytes.
302        M_LDR   pThresholds, ppThresholds
303        SUB     pQ0, pQ0, srcdstStep, LSL #1
304        ADD     pQ0, pQ0, #4
305        ADD     pThresholds, pThresholds, #4
306        ;// Load counter for LoopX
307        M_LDRD  XY, pBS, pXYBS
308        M_STR   pThresholds, ppThresholds
        ;// alpha1/beta1 apply to the second LoopX pass.
309        M_LDRD  alpha, beta, pAlphaBeta1
310
311        ;// Align the pointer
        ;// Doubling XY: carry clear -> another LoopX pass (LoopY is the same
        ;// address as LoopX); carry set -> ExitLoopY, where Z from this ADDS
        ;// decides whether the outer loop continues.
312        ADDS    XY, XY, XY
313        M_STR   XY, pXYBS
314        BCC     LoopY
315        B       ExitLoopY
316
317bSLT4
318        ;//---------bSLT4 Execution---------------
        ;// The CMP result is used by "BEQ NoFilterFilt0" below; the two USUB8
        ;// in between only change the GE flags.
319        M_LDR   pThresholds, ppThresholds
320        CMP     filt, #0
321
322
323        ;// Since beta <= 18 and alpha <= 255 we know
324        ;// -254 <= p0-q0 <= 254
325        ;//  -17 <= q1-q0 <= 17
326        ;//  -17 <= p1-p0 <= 17
327
328        ;// delta = Clip3( -tC, tC, ((((q0-p0)<<2) + (p1-q1) + 4)>>3))
329        ;//
330        ;//    Calculate A = (((q0-p0)<<2) + (p1-q1) + 4)>>3
331        ;//                = (4*q0 - 4*p0 + p1 - q1 + 4)>>3
332        ;//                = ((p1-p0) - (q1-q0) - 3*(p0-q0) + 4)>>3
333
334        USUB8   t1, p_1, p_0
335        USUB8   t2, q_1, q_0
336        BEQ     NoFilterFilt0
337
        ;// Two threshold bytes are read (offsets 0 and 1) and the pointer is
        ;// advanced by 4 in total. The arithmetic is interleaved with building
        ;// tC = [tC0 tC0 tC1 tC1] + 1 per lane (saturating), zeroed in lanes
        ;// where filt is 0.
        ;// Register reuse to keep in mind: t4 is r6 (pThresholds, already stored
        ;// back), t5 is r5 (tC0, consumed by the first ORR), tC/tC1 are r12 (t2,
        ;// consumed by the first SSUB8).
        ;// The halving subtractions presumably keep every intermediate of A
        ;// within a signed byte - confirm against the range notes above.
338        LDRB    tC0, [pThresholds], #1
339        SSUB8   t1, t1, t2
340        LDRB    tC1, [pThresholds], #3
341        M_STR   pThresholds, ppThresholds
342        UHSUB8  t4, p_0, q_0
343        ORR     tC, tC1, tC0, LSL #16
344        USUB8   t5, p_0, q_0
345        AND     t5, t5, m01
346        SHSUB8  t1, t1, t5
347        ORR     tC, tC, LSL #8
348        SSUB8   t1, t1, t5
349        SHSUB8  t1, t1, t4
350        UQADD8  tC, tC, m01
351        SADD8   t1, t1, m01
        ;// GE := lanes where filt is 0x01; used by the SEL on tC two lines down
        ;// (SHSUB8 does not alter GE).
352        USUB8   t5, filt, m01
353        SHSUB8  t1, t1, t4
354        SEL     tC, tC, m00
355
356        ;// Split into positive and negative part and clip
357
        ;// pos = max(t1, 0), neg = max(-t1, 0); each is then limited to tC and
        ;// applied with saturating adds/subtracts to p0 and q0.
358        SSUB8   t1, t1, m00
359        SEL     pos, t1, m00
360        USUB8   neg, pos, t1
361        USUB8   t3, pos, tC
362        SEL     pos, tC, pos
363        USUB8   t3, neg, tC
364        SEL     neg, tC, neg
365        UQADD8  P_0, p_0, pos
366        UQSUB8  Q_0, q_0, pos
367        UQSUB8  P_0, P_0, neg
368        UQADD8  Q_0, Q_0, neg
369
370        ;// Choose to store the filtered
371        ;// value or the original pixel
372        USUB8   t1, filt, m01
373        SEL     P_0, P_0, p_0
374        SEL     Q_0, Q_0, q_0
375
376StoreResultAndExit
377
378        ;//---------Store result---------------
379
380        ;// P_0 = [r0p0 r1p0 r2p0 r3p0]
381        ;// Q_0 = [r0q0 r1q0 r2q0 r3q0]
382
        ;// pQ0 is two rows down and two bytes left of the edge here. Rewind to
        ;// row 0 and point at p0 (edge - 1); q0 is then at [pQ0, #1].
383        SUB     pQ0, pQ0, srcdstStep, LSL #1
384        ADD        pQ0, pQ0, #1
385
        ;// Row 0: top byte lane of Q_0 / P_0 (STRB stores the low byte of t1).
386        MOV     t1, Q_0, LSR #24
387        STRB    t1, [pQ0, #1]
388        MOV     t1, P_0, LSR #24
389        M_STRB  t1, [pQ0], srcdstStep
390
        ;// Row 1.
391        MOV     t1, Q_0, LSR #16
392        STRB    t1, [pQ0, #1]
393        MOV     t1, P_0, LSR #16
394        M_STRB  t1, [pQ0], srcdstStep
395
        ;// Rows 2 and 3. The write-back form "[pQ0, #1]!" leaves pQ0 on the q0
        ;// column of row 2.
396        MOV     t1, P_0, LSR #8
397        STRB    t1, [pQ0]
398        STRB    P_0, [pQ0, srcdstStep]
399        MOV     t1, Q_0, LSR #8
400        STRB    t1, [pQ0, #1]!
401        STRB    Q_0, [pQ0, srcdstStep]
402
        ;// Reload loop state; P_0/Q_0 (r6/r7) are overwritten with alpha1/beta1
        ;// for the second LoopX pass.
403        M_LDRD  XY, pBS, pXYBS
404        M_LDRD  alpha, beta, pAlphaBeta1
405
        ;// Back to row 0 and 4 pixels to the right.
406        SUB     pQ0, pQ0, srcdstStep, LSL #1
407        ADD     pQ0, pQ0, #4
408
        ;// Carry clear -> second LoopX pass; carry set -> fall into ExitLoopY.
409        ADDS    XY, XY, XY
410        M_STR   XY, pXYBS
411        BCC     LoopX
412
413;//-------- Common Exit of LoopY -----------------
414        ;// Align the pointers
415
416ExitLoopY
417
        ;// After two LoopX passes: pQ0 moved +8, pBS +16, pThresholds +8.
        ;// Net effect of the adjustments below: pQ0 goes 4 rows down at the
        ;// original column, pBS +2 and pThresholds +2 relative to the start of
        ;// this LoopY pass. None of these instructions change the flags, so the
        ;// BNE still sees Z from the last "ADDS XY, XY, XY".
418        M_LDR   pThresholds, ppThresholds
419        SUB     pQ0, pQ0, #8
420        ADD     pQ0, pQ0, srcdstStep, LSL #2
421        SUB     pBS, pBS, #14
422        SUB     pThresholds, pThresholds, #6
423        M_STR   pThresholds, ppThresholds
424
        ;// The first LoopX pass of the next LoopY pass uses alpha0/beta0 again.
425        M_LDRD  alpha, beta, pAlphaBeta0
426
427        BNE     LoopY
428        MOV     r0, #OMX_Sts_NoErr
429;//-----------------End Filter--------------------
430
431        M_END
432
433        ENDIF
434
435        END
436
437
438