omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   9641
21;// Date:       Thursday, February 7, 2008
22;//
23;//
24;//
25;//
26
27        INCLUDE omxtypes_s.h
28        INCLUDE armCOMM_s.h
29
30        M_VARIANTS ARM1136JS
31
32
33        IF ARM1136JS
34
35
;// Constants for the packed (four 8-bit lanes per register) arithmetic below.
36MASK_0      EQU 0x00000000    ;// all lanes zero; loaded into m00 and used as the "cleared" operand of SEL
37MASK_1      EQU 0x01010101    ;// 1 in every byte lane; multiplying a byte by it replicates that byte into all lanes
38MASK_2      EQU 0x0000ff00    ;// keeps bits [15:8] when unpacking the loaded halfwords
39LOOP_COUNT  EQU 0x50000000    ;// loop state: doubled with ADDS on every pass; carry-out leaves the inner loop, a zero result ends the outer loop
40
41;// Declare input registers
;//
;// NOTE: the names below are aliases (RN) for physical registers and many of
;// them share the same register (for example r6 is pThresholds, alpha,
;// alpha0, P_0 and t4). A name is therefore only meaningful while no other
;// alias of the same register has been written; the instruction order in the
;// function body depends on this.
42
43pSrcDst     RN 0
44srcdstStep  RN 1
45pAlphaArg   RN 2
46pBetaArg    RN 3
47
48pThresholds RN 6    ;// same register as alpha/alpha0, P_0 and t4
49pBS         RN 9    ;// same register as beta1, q_1, row5 and neg
50pQ0         RN 0    ;// same register as pSrcDst; this is the name the code uses for the pixel pointer
51bS          RN 2
52bSTemp      RN 10
53
;// alpha/beta occupy the pair r6/r7 and alpha1/beta1 the pair r8/r9; the
;// code stores and reloads them pairwise with M_STRD / M_LDRD.
54alpha       RN 6
55alpha0      RN 6
56alpha1      RN 8
57
58beta        RN 7
59beta0       RN 7
60beta1       RN 9
61
62;// Declare Local/Temporary variables
63
64;// Pixels
;// Each register holds one pixel position for four rows, one row per byte lane.
65p_0         RN 3
66p_1         RN 5
67q_0         RN 8
68q_1         RN 9
69
70;// Unpacking
71mask        RN 11
72
;// row0..row3 receive the p-side halfword loads, row4..row7 the q-side ones.
73row0        RN 2
74row1        RN 4
75row2        RN 5
76row3        RN 3
77
78row4        RN 8
79row5        RN 9
80row6        RN 10
81row7        RN 12
82
83tunpk0      RN 2
84tunpk2      RN 10
85tunpk3      RN 12
86
87tunpk4      RN 4
88tunpk5      RN 5
89tunpk6      RN 14   ;// r14 is used as a scratch register (also m00 and t3)
90tunpk7      RN 2
91
92;// Filtering
93
;// The three difference temporaries share r12; each is consumed by the SEL
;// that follows it before the next one is computed.
94dp0q0       RN 12
95dp1p0       RN 12
96dq1q0       RN 12
97
98ap0q0       RN 4
99filt        RN 2    ;// per-lane filter enable flags (0 or 1 in each byte)
100
101m00         RN 14
102m01         RN 11
103
104pQ0         RN 0    ;// second declaration of pQ0, identical to the one above
105Step        RN 1    ;// alias of srcdstStep; not referenced by the code visible in this file
106
107;// Output
108
109P_0         RN 6
110Q_0         RN 7
111
112;//Declarations for bSLT4 kernel
113
114tC          RN 12
115tC0         RN 5    ;// overwrites p_1 (also r5) once loaded
116tC1         RN 12
117pos         RN 5
118neg         RN 9    ;// overwrites q_1 (also r9) once written
119
120;//Declarations for bSGE4 kernel
121
122
123;// Miscellaneous
124XY          RN 8    ;// loop state; paired with pBS (r9) for M_STRD / M_LDRD
125
126a           RN 10
127t1          RN 10
128t2          RN 12
129t3          RN 14
130t4          RN 6
131t5          RN 5
132
133
;//----------------------------------------------------------------------
;// omxVCM4P10_FilterDeblockingChroma_VerEdge_I
;//
;// In-place deblocking across vertical edges. Each pass of the loop reads
;// four rows of the pixels p1 p0 | q0 q1 around one edge (pQ0 points at q0,
;// the p pixels are read from pQ0-2), decides per row whether to filter, and
;// writes back p0 and q0 only.
;//
;// Inputs (from the register and M_ARG declarations in this file):
;//   r0    pSrcDst / pQ0    pixel pointer
;//   r1    srcdstStep       byte offset between successive rows
;//   r2    pAlphaArg        two alpha bytes, [0] and [1]
;//   r3    pBetaArg         two beta bytes,  [0] and [1]
;//   stack ppThresholdsArg  pointer to the threshold (tC0) bytes
;//   stack ppBSArg          pointer to the boundary-strength (bS) bytes
;// Returns:
;//   r0 = OMX_Sts_NoErr
;//
;// Loop structure (driven by XY = LOOP_COUNT, doubled with ADDS each pass):
;//   four passes in total, as two inner passes per outer iteration.
;//   Pass 1 and 3 use alpha[0]/beta[0], pass 2 and 4 use alpha[1]/beta[1].
;//   Per pass: pQ0 += 4, pBS += 8, thresholds pointer += 4.
;//   Per outer iteration: pQ0 moves back 8 and down 4 rows, pBS -= 14,
;//   thresholds pointer -= 6.
;//
;// NOTE(review): r14 is used as a scratch register (tunpk6/m00/t3);
;// presumably M_START/M_END save and restore lr - confirm in armCOMM_s.h.
;//----------------------------------------------------------------------
134        ;// Allocate stack memory
           ;// NOTE(review): pXYBS is declared with size 4 but is accessed with
           ;// M_STRD/M_LDRD (XY and pBS together), and pBS is also written on
           ;// its own through ppBS. This only works if ppBS occupies the word
           ;// directly after pXYBS - confirm against the M_ALLOC macros.
135        M_ALLOC4 ppThresholds,4
136        M_ALLOC8 pAlphaBeta0,8
137        M_ALLOC8 pAlphaBeta1,8
138        M_ALLOC8 pXYBS,4
139        M_ALLOC4 ppBS,4
140
141        ;// Function header
142        M_START omxVCM4P10_FilterDeblockingChroma_VerEdge_I, r11
143
144        ;//Input arguments on the stack
145        M_ARG   ppThresholdsArg, 4
146        M_ARG   ppBSArg, 4
147
           ;// Read alpha[0..1] and beta[0..1] and copy the thresholds pointer
           ;// argument into its local slot. pThresholds shares r6 with alpha0,
           ;// so it is stored before alpha0 is loaded.
148        LDRB    alpha1, [pAlphaArg,#1]
149        LDRB    beta1,  [pBetaArg,#1]
150        M_LDR   pThresholds, ppThresholdsArg
151        LDR     a,=MASK_1
152        LDRB    beta0,  [pBetaArg]
153        M_STR   pThresholds, ppThresholds
154        LDRB    alpha0, [pAlphaArg]
155
           ;// Replicate each byte into all four lanes (x * 0x01010101)
156        MUL     alpha1, alpha1, a
157        MUL     beta1, beta1, a
158        MUL     alpha0, alpha0, a
159        MUL     beta0, beta0, a
160
           ;// Save both pairs. alpha0/beta0 also stay live in alpha/beta
           ;// (r6/r7) for the first pass. pBS shares r9 with beta1, so it is
           ;// loaded only after alpha1/beta1 have been stored.
161        M_STRD  alpha1, beta1, pAlphaBeta1
162        M_LDR   pBS, ppBSArg
163        M_STRD  alpha0, beta0, pAlphaBeta0
164
           ;// Initialise the loop state and save it together with pBS
165        LDR     XY,=LOOP_COUNT
166        M_STRD  XY, pBS, pXYBS
167
168
           ;// LoopY and LoopX label the same address; the inner/outer
           ;// distinction is made by the exit code, not by the entry point.
169LoopY
170LoopX
171;//---------------Load Pixels-------------------
172
173;//----------------Pack q0-q1-----------------------
           ;// Read two bS bytes as one halfword and advance pBS by 8
174        LDRH    bS, [pBS], #8
175        LDR     mask, =MASK_2
176
           ;// The flags from CMP are consumed by BEQ.W below; the macros in
           ;// between are relied upon not to change them. When the branch is
           ;// taken pQ0 has already advanced by two rows.
177        M_LDRH  row4, [pQ0], srcdstStep
178        CMP     bS, #0
179        M_STR   pBS, ppBS
180        M_LDRH  row5, [pQ0], srcdstStep
181        BEQ.W   NoFilterBS0
182        LDRH    row6, [pQ0]
183        LDRH    row7, [pQ0, srcdstStep]
184
185        ;// row4 = [0 0 r0q0 r0q1]
186        ;// row5 = [0 0 r1q0 r1q1]
187        ;// row6 = [0 0 r2q0 r2q1]
188        ;// row7 = [0 0 r3q0 r3q1]
189
           ;// tunpk4/tunpk6 collect bits [15:8] of each halfword, tunpk5/tunpk7
           ;// collect bits [7:0]; two rows are packed per register.
           ;// NOTE(review): tunpk4/tunpk6 feed q_1 and tunpk5/tunpk7 feed q_0
           ;// below, so the q0/q1 labels in the row4..row7 and tunpk4..tunpk7
           ;// comments of this section look swapped relative to the convention
           ;// used in the p-side comments - confirm.
190        AND     tunpk4, mask, row4
191        AND     tunpk5, mask, row4, LSL#8
192        UXTAB   tunpk4, tunpk4, row5, ROR#8
193        UXTAB   tunpk5, tunpk5, row5
194        AND     tunpk6, mask, row6
195        AND     tunpk7, mask, row6, LSL#8
196        UXTAB   tunpk6, tunpk6, row7, ROR#8
197        UXTAB   tunpk7, tunpk7, row7
198
199        ;// tunpk4 = [0 0 r0q0 r1q0]
200        ;// tunpk5 = [0 0 r0q1 r1q1]
201        ;// tunpk6 = [0 0 r2q0 r3q0]
202        ;// tunpk7 = [0 0 r2q1 r3q1]
203
           ;// Back to the first row, then two bytes left to reach the p side
204        SUB     pQ0, pQ0, srcdstStep, LSL #1
205        SUB     pQ0, pQ0, #2
206
207        PKHBT   q_1, tunpk6, tunpk4, LSL#16
208        PKHBT   q_0, tunpk7, tunpk5, LSL#16
209
210        ;// q_0 = [r0q0 r1q0 r2q0 r3q0]
211        ;// q_1 = [r0q1 r1q1 r2q1 r3q1]
212
213
214;//----------------Pack p0-p1-----------------------
215
           ;// row0 shares r2 with bS, so bS is lost here; it is re-read through
           ;// ppBS in the filtering decision below.
216        M_LDRH  row0, [pQ0], srcdstStep
217        M_LDRH  row1, [pQ0], srcdstStep
218        LDRH    row2, [pQ0]
219        LDRH    row3, [pQ0, srcdstStep]
220
221        ;// row0 = [0 0 r0p0 r0p1]
222        ;// row1 = [0 0 r1p0 r1p1]
223        ;// row2 = [0 0 r2p0 r2p1]
224        ;// row3 = [0 0 r3p0 r3p1]
225
226        AND     tunpk2, mask, row0
227        AND     tunpk6, mask, row0, LSL#8
228        UXTAB   tunpk2, tunpk2, row1, ROR#8
229        UXTAB   tunpk6, tunpk6, row1
230
231        AND     tunpk0, mask, row2
232        AND     tunpk3, mask, row2, LSL#8
233        UXTAB   tunpk0, tunpk0, row3, ROR#8
234        UXTAB   tunpk3, tunpk3, row3
235
236        ;// tunpk2 = [0 0 r0p0 r1p0]
237        ;// tunpk6 = [0 0 r0p1 r1p1]
238        ;// tunpk0 = [0 0 r2p0 r3p0]
239        ;// tunpk3 = [0 0 r2p1 r3p1]
240
241        PKHBT   p_0, tunpk0, tunpk2, LSL#16
242        M_LDR   bSTemp, ppBS
243        PKHBT   p_1, tunpk3, tunpk6, LSL#16
244
245        ;// p_0 = [r0p0 r1p0 r2p0 r3p0]
246        ;// p_1 = [r0p1 r1p1 r2p1 r3p1]
247
248;//--------------Filtering Decision -------------------
           ;// bSTemp holds the saved (already advanced) pBS, so offset -8
           ;// re-reads the halfword that was loaded at the top of this pass.
249        USUB8   dp0q0, p_0, q_0
250        LDR     m01, =MASK_1
251        LDRH    bSTemp, [bSTemp ,#-8]
252        MOV     m00, #MASK_0                ;//  00000000 mask
253
           ;// Start with every lane enabled, then drop the lower two lanes
           ;// when the high bS byte is zero and the upper two lanes when the
           ;// low bS byte is zero. The final TST sets Z for the BEQ bSLT4
           ;// further down; no instruction in between writes the NZCV flags
           ;// (USUB8 only writes the GE flags, SEL only reads them).
254        MOV     filt, m01
255        TST     bSTemp, #0xff00
256        MOVEQ   filt, filt, LSL #16
257        TST     bSTemp, #0xff
258        MOVEQ   filt, filt, LSR #16
259        TST     bSTemp, #4
260
           ;// Pattern for each check: build the absolute difference with
           ;// USUB8 + SEL, subtract the limit with USUB8 so that GE is set in
           ;// lanes where difference >= limit, then SEL clears filt there.
261        ;// Check |p0-q0|<Alpha
262        USUB8   a, q_0, p_0
263        SEL     ap0q0, a, dp0q0
264        USUB8   a, ap0q0, alpha
265        SEL     filt, m00, filt
266
267        ;// Check |p1-p0|<Beta
268        USUB8   dp1p0, p_1, p_0
269        USUB8   a, p_0, p_1
270        SEL     a, a, dp1p0
271        USUB8   a, a, beta
272        SEL     filt, m00, filt
273
274        ;// Check |q1-q0|<Beta
275        USUB8   dq1q0, q_1, q_0
276        USUB8   a, q_0, q_1
277        SEL     a, a, dq1q0
278        USUB8   a, a, beta
279        SEL     filt, m00, filt
280
           ;// Z still comes from TST bSTemp, #4 above
281        BEQ     bSLT4
282;//-------------------Filter--------------------
283bSGE4
284        ;//---------bSGE4 Execution---------------
           ;// The flags from this CMP are consumed by BEQ NoFilterFilt0 below
285        CMP     filt, #0
286
287        M_LDR   pThresholds, ppThresholds
288
289        ;// Compute P0b
           ;// Halving add of p0 and q1, halving subtract of NOT(p1), then flip
           ;// bit 7 of each lane; presumably this evaluates
           ;// (2*p1 + p0 + q1 + 2) >> 2 per lane - confirm against the spec.
290        UHADD8  t1, p_0, q_1
291        BEQ     NoFilterFilt0
292        MVN     t2, p_1
293        UHSUB8  t1, t1, t2
           ;// Sets GE in lanes where filt >= 1; the numeric result in t2 is
           ;// discarded. The two SEL instructions below use these GE flags.
294        USUB8   t2, filt, m01
295        EOR     t1, t1, m01, LSL #7
296
           ;// This path reads no thresholds but still steps the pointer by 4
297        ADD     pThresholds,pThresholds, #4
298
299        ;// Compute Q0b
           ;// Same sequence as P0b with the roles of p and q exchanged.
           ;// P_0 shares r6 with pThresholds, hence the store before SEL P_0.
300        UHADD8  t2, q_0, p_1
301        MVN     t3, q_1
302        UHSUB8  t2, t2, t3
303        M_STR   pThresholds, ppThresholds
304        SEL     P_0, t1, p_0
305        EOR     t2, t2, m01, LSL #7
306        SEL     Q_0, t2, q_0
307
308        B       StoreResultAndExit
309
310;//---------- Exit of LoopX --------------
311;//---- for the case of no filtering -----
312
           ;// NoFilterFilt0: reached after the p-side loads, so first undo the
           ;// two-byte step to the left. NoFilterBS0: pQ0 is two rows below its
           ;// start; move it back up and on to the next edge (+4).
313NoFilterFilt0
314        ADD     pQ0, pQ0, #2
315NoFilterBS0
316        M_LDR   pThresholds, ppThresholds
317        SUB     pQ0, pQ0, srcdstStep, LSL #1
318        ADD     pQ0, pQ0, #4
319        ADD     pThresholds, pThresholds, #4
320        ;// Load counter for LoopX
321        M_LDRD  XY, pBS, pXYBS
322        M_STR   pThresholds, ppThresholds
           ;// alpha shares r6 with pThresholds, so this load comes after the store
323        M_LDRD  alpha, beta, pAlphaBeta1
324
325        ;// Align the pointer
           ;// Double the loop state: carry clear means another inner pass.
           ;// The Z flag set here is what BNE LoopY tests at ExitLoopY.
326        ADDS    XY, XY, XY
327        M_STR   XY, pXYBS
328        BCC     LoopY
329        B       ExitLoopY
330
331bSLT4
332        ;//---------bSLT4 Execution---------------
333        M_LDR   pThresholds, ppThresholds
           ;// The flags from this CMP are consumed by BEQ NoFilterFilt0 below
334        CMP     filt, #0
335
336
337        ;// Since beta <= 18 and alpha <= 255 we know
338        ;// -254 <= p0-q0 <= 254
339        ;//  -17 <= q1-q0 <= 17
340        ;//  -17 <= p1-p0 <= 17
341
342        ;// delta = Clip3( -tC, tC, ((((q0-p0)<<2) + (p1-q1) + 4)>>3))
343        ;//
344        ;//    Calculate A = (((q0-p0)<<2) + (p1-q1) + 4)>>3
345        ;//                = (4*q0 - 4*p0 + p1 - q1 + 4)>>3
346        ;//                = ((p1-p0) - (q1-q0) - 3*(p0-q0) + 4)>>3
347
348        USUB8   t1, p_1, p_0
349        USUB8   t2, q_1, q_0
350        BEQ     NoFilterFilt0
351
           ;// Instruction order matters from here on: tC0/pos/t5 reuse r5
           ;// (p_1), tC/tC1 reuse r12 (t2), t4 reuses r6 (pThresholds) and neg
           ;// reuses r9 (q_1), so each old value is consumed before its
           ;// register is overwritten.
           ;// Two threshold bytes are read and the pointer advances by 4.
351        LDRB    tC0, [pThresholds], #1
352        SSUB8   t1, t1, t2
353        LDRB    tC1, [pThresholds], #3
354        M_STR   pThresholds, ppThresholds
           ;// t4 = halved lane-wise p0-q0, t5 = low bit of lane-wise p0-q0
355        UHSUB8  t4, p_0, q_0
           ;// tC = [tC0 tC0 tC1 tC1] after the two ORRs, then +1 per lane
           ;// (saturating)
356        ORR     tC, tC1, tC0, LSL #16
357        USUB8   t5, p_0, q_0
358        AND     t5, t5, m01
359        SHSUB8  t1, t1, t5
360        ORR     tC, tC, LSL #8
361        SSUB8   t1, t1, t5
362        SHSUB8  t1, t1, t4
363        UQADD8  tC, tC, m01
364        SADD8   t1, t1, m01
           ;// GE set in lanes where filt >= 1; the SEL below keeps tC only in
           ;// those lanes and zeroes it elsewhere (t5 itself is not used again)
365        USUB8   t5, filt, m01
366        SHSUB8  t1, t1, t4
367        SEL     tC, tC, m00
368
369        ;// Split into positive and negative part and clip
370
           ;// pos = t1 where t1 >= 0 (signed) else 0; neg = pos - t1, i.e. the
           ;// magnitude of the negative lanes. Each part is then limited to tC
           ;// with a USUB8/SEL pair (the t3 results are discarded).
371        SSUB8   t1, t1, m00
372        SEL     pos, t1, m00
373        USUB8   neg, pos, t1
374        USUB8   t3, pos, tC
375        SEL     pos, tC, pos
376        USUB8   t3, neg, tC
377        SEL     neg, tC, neg
           ;// P_0 = p_0 + pos - neg and Q_0 = q_0 - pos + neg, saturating
378        UQADD8  P_0, p_0, pos
379        UQSUB8  Q_0, q_0, pos
380        UQSUB8  P_0, P_0, neg
381        UQADD8  Q_0, Q_0, neg
382
383        ;// Choose to store the filtered
384        ;// value or the original pixel
385        USUB8   t1, filt, m01
386        SEL     P_0, P_0, p_0
387        SEL     Q_0, Q_0, q_0
388
389StoreResultAndExit
390
391        ;//---------Store result---------------
392
393        ;// P_0 = [r0p0 r1p0 r2p0 r3p0]
394        ;// Q_0 = [r0q0 r1q0 r2q0 r3q0]
395
           ;// pQ0 is two rows down and two bytes left of its start here; move
           ;// it to the p0 column of the first row. q0 is then at offset +1.
396        SUB     pQ0, pQ0, srcdstStep, LSL #1
397        ADD        pQ0, pQ0, #1
398
           ;// First row (top byte lane)
399        MOV     t1, Q_0, LSR #24
400        STRB    t1, [pQ0, #1]
401        MOV     t1, P_0, LSR #24
402        M_STRB  t1, [pQ0], srcdstStep
403
           ;// Second row
404        MOV     t1, Q_0, LSR #16
405        STRB    t1, [pQ0, #1]
406        MOV     t1, P_0, LSR #16
407        M_STRB  t1, [pQ0], srcdstStep
408
           ;// Third and fourth rows; the write-back form ([pQ0, #1]!) moves
           ;// pQ0 onto the q0 column
409        MOV     t1, P_0, LSR #8
410        STRB    t1, [pQ0]
411        STRB    P_0, [pQ0, srcdstStep]
412        MOV     t1, Q_0, LSR #8
413        STRB    t1, [pQ0, #1]!
414        STRB    Q_0, [pQ0, srcdstStep]
415
           ;// Reload the loop state and pBS, and select the second
           ;// alpha/beta pair for the next pass
416        M_LDRD  XY, pBS, pXYBS
417        M_LDRD  alpha, beta, pAlphaBeta1
418
           ;// Back to the first row and on to the next edge (+4)
419        SUB     pQ0, pQ0, srcdstStep, LSL #1
420        ADD     pQ0, pQ0, #4
421
           ;// Same loop-state update as in the no-filter exit above
422        ADDS    XY, XY, XY
423        M_STR   XY, pXYBS
424        BCC     LoopX
425
426;//-------- Common Exit of LoopY -----------------
427        ;// Align the pointers
428
           ;// Rewind for the next group of four rows. None of the instructions
           ;// before BNE sets flags, so BNE tests the Z flag of the last ADDS:
           ;// the loop ends once XY has been shifted out to zero.
           ;// pBS is updated in its register only; the next pass stores it.
429ExitLoopY
430
431        M_LDR   pThresholds, ppThresholds
432        SUB     pQ0, pQ0, #8
433        ADD     pQ0, pQ0, srcdstStep, LSL #2
434        SUB     pBS, pBS, #14
435        SUB     pThresholds, pThresholds, #6
436        M_STR   pThresholds, ppThresholds
437
           ;// The first pass of the next outer iteration uses the first
           ;// alpha/beta pair
438        M_LDRD  alpha, beta, pAlphaBeta0
439
440        BNE     LoopY
441        MOV     r0, #OMX_Sts_NoErr
442;//-----------------End Filter--------------------
443
444        M_END
446
447        ENDIF
448
449        END
450
451
452