omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s revision 0c1bc742181ded4930842b46e9507372f0b1b963
1;//
2;//
3;// File Name:  omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s
4;// OpenMAX DL: v1.0.2
5;// Revision:   12290
6;// Date:       Wednesday, April 9, 2008
7;//
8;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
9;//
10;//
11;//
12
13        INCLUDE omxtypes_s.h
14        INCLUDE armCOMM_s.h
15
16        M_VARIANTS CortexA8
17
18        IF CortexA8
19
20        IMPORT  armVCM4P10_DeblockingChromabSGE4_unsafe
21        IMPORT  armVCM4P10_DeblockingChromabSLT4_unsafe
22
23LOOP_COUNT  EQU 0x40000000      ;// Seed for XY: doubled by ADDS once per pass, becomes 0 after the 2nd pass
24MASK_3      EQU 0x03030303      ;// Bits 0-1 of each of the four bS bytes; TST'ed to gate the bSLT4 helper call
25MASK_4      EQU 0x04040404      ;// Bit 2 of each of the four bS bytes; TST'ed to gate the bSGE4 helper call
26
27;// Function arguments
;// The first four arguments are used in place in r0-r3.
28
29pSrcDst     RN 0
30srcdstStep  RN 1
31pAlpha      RN 2
32pBeta       RN 3
33
;// pThresholds and pBS are fetched from the stack arguments with M_LDR.
;// bS3210 holds the 32-bit word loaded from pBS for the current pass.
;// r10 and r12 each carry two names (pSrcDst_P/pTmp and pSrcDst_Q/pTmp2).
;// step is r14 (the link register): every BL overwrites it, so the function
;// recomputes it after the helper calls before using it for the stores.
34pThresholds RN 5
35pBS         RN 4
36bS3210      RN 6
37pSrcDst_P   RN 10
38pSrcDst_Q   RN 12
39
40pTmp        RN 10
41pTmp2       RN 12
42step        RN 14
43
44;// Loop
;// XY is the pass counter, seeded with LOOP_COUNT.
45
46XY          RN 7
47
48;// Rows input
;// The register order is deliberately non-sequential: after the
;// VZIP.8 / VZIP.16 / VZIP.32 transpose in LoopY these eight registers hold
;// byte columns 0..7 of the loaded 8x8 tile in d7,d6,d5,d4,d8,d9,d10,d11,
;// i.e. exactly the registers named by the "Pixels" aliases below.
49dRow0       DN D7.U8
50dRow1       DN D8.U8
51dRow2       DN D5.U8
52dRow3       DN D10.U8
53dRow4       DN D6.U8
54dRow5       DN D9.U8
55dRow6       DN D4.U8
56dRow7       DN D11.U8
57
58
59;// Pixels
;// One column per register, one row per byte lane (valid after the transpose).
60dP_0        DN D4.U8
61dP_1        DN D5.U8
62dP_2        DN D6.U8
63dQ_0        DN D8.U8
64dQ_1        DN D9.U8
65dQ_2        DN D10.U8
66
67;// Filtering Decision
;// Several names share a physical register:
;//   d12: dAqflg / dAp1p0      d13: dAp0q0 / dP_0t
;//   d17: dApflg / dAq2q0      d19: dAp2p0 / dTemp
;//   d26: dBS3210 / dFilt_bs
;// so the instruction order in the function depends on which value is live.
;// dAqflg and dApflg are written but never read in this file.
;// NOTE(review): presumably they are inputs to the *_unsafe helpers -- confirm.
68dAlpha      DN D0.U8
69dBeta       DN D2.U8
70
71dFilt       DN D16.U8
72dAqflg      DN D12.U8
73dApflg      DN D17.U8
74
75dAp0q0      DN D13.U8
76dAp1p0      DN D12.U8
77dAq1q0      DN D18.U8
78dAp2p0      DN D19.U8
79dAq2q0      DN D17.U8
80
;// qBS3210 is q13 = d26:d27, so dBS3210 is its low half and dMask_bs its high half.
81qBS3210     QN Q13.U16
82dBS3210     DN D26
83dMask_bs    DN D27
84dFilt_bs    DN D26.U16
85
86;// bSLT4
;// dMask_0 and dMask_1 are initialised by the function but not read in this file.
;// NOTE(review): presumably constants for the bSLT4 helper -- confirm.
;// dMask_4 is the 16-bit-lane constant 4 used with VTST on the widened bS values.
87dMask_0     DN D14.U8
88dMask_1     DN D15.U8
89dMask_4     DN D1.U16
90
;// Core-register copies of MASK_4 / MASK_3 for the TST instructions.
91Mask_4      RN 8
92Mask_3      RN 9
93
;// dTemp is not referenced by any instruction in this file.
94dTemp       DN D19.U8
95
96;// Result
;// dP_0t/dQ_0t are only read here (as VBIT sources); dP_0n/dQ_0n are only
;// merged into (VBIT/VBIF) and then stored.
;// NOTE(review): their initial contents presumably come from the bSGE4 (t) and
;// bSLT4 (n) helpers respectively -- confirm against the helper source.
97dP_0t       DN D13.U8
98dQ_0t       DN D31.U8
99
100dP_0n       DN D29.U8
101dQ_0n       DN D24.U8
102
103
104        ;// Function header
        ;// omxVCM4P10_FilterDeblockingChroma_VerEdge_I  (assembled only when CortexA8 is set)
        ;//
        ;// Runs the LoopY body twice. Each pass handles one vertical edge, 8 rows tall:
        ;//   1. load 8 rows of 8 bytes starting 4 bytes left of the edge and
        ;//      transpose them so each D register holds one pixel column;
        ;//   2. build the per-row filter mask dFilt from alpha, beta and the bS word;
        ;//   3. call the bS<4 and/or bS==4 helper, as selected by the bS word;
        ;//   4. write the new p0 and q0 columns back, one byte per row.
        ;// A pass whose bS word is zero skips steps 2-4 via NoFilterBS0.
        ;//
        ;// In:   r0 = pSrcDst, r1 = srcdstStep, r2 = pAlpha, r3 = pBeta
        ;//       stack: ppThresholds, ppBS (declared with M_ARG, 4 bytes each)
        ;// Out:  r0 = OMX_Sts_NoErr on both exits; no argument checking is done here.
        ;//
        ;// NOTE(review): M_START / M_ARG / M_LDR / M_EXIT / M_END are macros from
        ;//       armCOMM_s.h. "r12, d15" presumably names the highest core and NEON
        ;//       registers the prologue must preserve -- confirm in the macro source.
        ;// NOTE(review): the alpha/beta reload and the pThresholds advance appear
        ;//       only on the NoFilterBS0 path in this file. The filtered path
        ;//       presumably relies on the *_unsafe helpers to do both -- confirm.
105        M_START omxVCM4P10_FilterDeblockingChroma_VerEdge_I, r12, d15
106
107        ;//Arguments on the stack
108        M_ARG   ppThresholds, 4
109        M_ARG   ppBS, 4
110
111        ;// d0-dAlpha_0
112        ;// d2-dBeta_0
113
114        ;load alpha1,beta1 somewhere to avoid more loads
        ;// Each VLD1 below reads one byte, replicates it to all lanes, and
        ;// post-increments the pointer, so pAlpha/pBeta then address the next byte
        ;// (that next byte is what NoFilterBS0 reloads).
115        VLD1        {dAlpha[]}, [pAlpha]!
        ;// Bias pSrcDst 4 bytes to the left so each 8-byte row load straddles the edge.
116        SUB         pSrcDst, pSrcDst, #4
117        VLD1        {dBeta[]}, [pBeta]!
118
119        M_LDR       pBS, ppBS
120        M_LDR       pThresholds, ppThresholds
121
122        LDR         Mask_4, =MASK_4
123        LDR         Mask_3, =MASK_3
124
125        ;dMask_0-14
126        ;dMask_1-15
127        ;dMask_4-1 (d1, per the dMask_4 alias)
128
129        VMOV        dMask_0, #0
130        VMOV        dMask_1, #1
131        VMOV        dMask_4, #4
132
        ;// XY = 0x40000000; doubled once per pass, it reaches 0 on the second doubling.
133        LDR         XY, =LOOP_COUNT
134
135        ;// p0-p3 - d4-d7
136        ;// q0-q3 - d8-d11
137
138
139LoopY
        ;// Load the 4-byte bS word for this pass and advance pBS by 8 bytes.
140        LDR         bS3210, [pBS], #8
        ;// Two interleaved row pointers: pSrcDst walks rows 0,2,4,6 and pTmp rows
        ;// 1,3,5,7, each stepping by 2*srcdstStep.
141        ADD         pTmp, pSrcDst, srcdstStep
142        ADD         step, srcdstStep, srcdstStep
143
144        ;1
145        VLD1        dRow0, [pSrcDst], step
146        ;1
147        VLD1        dRow1, [pTmp], step
148        VLD1        dRow2, [pSrcDst], step
149        VLD1        dRow3, [pTmp], step
150        VLD1        dRow4, [pSrcDst], step
151        VLD1        dRow5, [pTmp], step
152        VLD1        dRow6, [pSrcDst], step
153        VLD1        dRow7, [pTmp], step
        ;// pSrcDst is now 8 rows below where the pass started; it is rewound later.
154
155
156        ;// dRow0 = [q3r0 q2r0 q1r0 q0r0 p0r0 p1r0 p2r0 p3r0]
157        ;// dRow1 = [q3r1 q2r1 q1r1 q0r1 p0r1 p1r1 p2r1 p3r1]
158        ;// dRow2 = [q3r2 q2r2 q1r2 q0r2 p0r2 p1r2 p2r2 p3r2]
159        ;// dRow3 = [q3r3 q2r3 q1r3 q0r3 p0r3 p1r3 p2r3 p3r3]
160        ;// dRow4 = [q3r4 q2r4 q1r4 q0r4 p0r4 p1r4 p2r4 p3r4]
161        ;// dRow5 = [q3r5 q2r5 q1r5 q0r5 p0r5 p1r5 p2r5 p3r5]
162        ;// dRow6 = [q3r6 q2r6 q1r6 q0r6 p0r6 p1r6 p2r6 p3r6]
163        ;// dRow7 = [q3r7 q2r7 q1r7 q0r7 p0r7 p1r7 p2r7 p3r7]
164
165        ;// 8x8 Transpose
        ;// Three rounds of zips (bytes, halfwords, words). Afterwards each register
        ;// holds one column across rows 0-7:
        ;//   d7=p3 d6=p2 d5=p1 d4=p0 d8=q0 d9=q1 d10=q2 d11=q3
166        VZIP.8      dRow0, dRow1
167        VZIP.8      dRow2, dRow3
168        VZIP.8      dRow4, dRow5
169        VZIP.8      dRow6, dRow7
170
171        VZIP.16     dRow0, dRow2
172        VZIP.16     dRow1, dRow3
173        VZIP.16     dRow4, dRow6
174        VZIP.16     dRow5, dRow7
175
176        VZIP.32     dRow0, dRow4
177        VZIP.32     dRow2, dRow6
178        VZIP.32     dRow3, dRow7
179        VZIP.32     dRow1, dRow5
180
181
182        ;Realign the pointers
        ;// (The actual pointer rewind is the "SUB pSrcDst, ..., LSL #3" further
        ;// down, or the one at NoFilterBS0.)
183
        ;// CMP sets Z for the BEQ below; the two VABDs in between are NEON
        ;// operations and leave the core flags untouched. Their results are not
        ;// used on the NoFilterBS0 path.
184        CMP         bS3210, #0
185        VABD        dAp2p0, dP_2, dP_0
186        VABD        dAp0q0, dP_0, dQ_0
187        BEQ         NoFilterBS0
188
189        VABD        dAp1p0, dP_1, dP_0
190        VABD        dAq1q0, dQ_1, dQ_0
191
        ;// Move the four bS bytes into lane 0 of d26.
192        VMOV.U32    dBS3210[0], bS3210
        ;// dFilt = (alpha > |p0-q0|)
193        VCGT        dFilt, dAlpha, dAp0q0
        ;// d12 = max(|q1-q0|, |p1-p0|), so a single compare against beta covers both.
194        VMAX        dAp1p0, dAq1q0, dAp1p0
        ;// Widen the bS bytes to 16 bits: each bS value now spans two byte lanes,
        ;// i.e. two rows of the column registers.
195        VMOVL       qBS3210, dBS3210.U8
196        VABD        dAq2q0, dQ_2, dQ_0
        ;// d27 = all-ones in every 16-bit lane whose bS is > 0.
        ;// This overwrites the upper half of q13 left by the VMOVL.
197        VCGT        dMask_bs.S16, dBS3210.S16, #0
198
199        VCGT        dAp1p0, dBeta, dAp1p0
200        VCGT        dAp2p0, dBeta, dAp2p0
201        VAND        dFilt, dMask_bs.U8
202
        ;// Flags for the first BLNE: NE if any bS byte has bit 0 or bit 1 set.
        ;// The NEON instructions in between do not change them.
203        TST         bS3210, Mask_3
204
205        VCGT        dAq2q0, dBeta, dAq2q0
        ;// dFilt = (bS > 0) & (alpha > |p0-q0|) & (beta > max(|p1-p0|, |q1-q0|))
206        VAND        dFilt, dFilt, dAp1p0
207
        ;// Order matters: dAq2q0 and dApflg are both d17, so dAq2q0 must be consumed
        ;// by the first VAND before the second one overwrites d17.
208        VAND        dAqflg, dFilt, dAq2q0
209        VAND        dApflg, dFilt, dAp2p0
210
211        ;// bS < 4 Filtering
        ;// Called only when the TST against Mask_3 was non-zero. BL overwrites
        ;// r14 (= step).
212        BLNE        armVCM4P10_DeblockingChromabSLT4_unsafe
213
        ;// Flags for the second BLNE: NE if any bS byte has bit 2 set.
        ;// This needs bS3210 (r6) and Mask_4 (r8) intact after the call above.
        ;// NOTE(review): presumably the helper preserves them -- confirm.
214        TST         bS3210, Mask_4
215
        ;// Rewind pSrcDst by the 8 rows consumed by the loads (no flag update).
216        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
        ;// dFilt_bs = all-ones in every 16-bit lane whose widened bS has bit 2 set.
217        VTST        dFilt_bs, dFilt_bs, dMask_4
218
219        ;// bS == 4 Filtering
220        BLNE        armVCM4P10_DeblockingChromabSGE4_unsafe
221
        ;// Where dFilt_bs is set, take the "t" result; elsewhere keep the "n" result.
222        VBIT        dP_0n, dP_0t, dFilt_bs
223        VBIT        dQ_0n, dQ_0t, dFilt_bs
224
225        ;// Result Storage
        ;// pSrcDst is biased by -4, so +3 is the p0 column and +4 is the q0 column.
226        ADD         pSrcDst_P, pSrcDst, #3
        ;// Where dFilt is clear, restore the original, unfiltered pixel.
227        VBIF        dP_0n, dP_0, dFilt
228
229        ADD         pTmp2, pSrcDst_P, srcdstStep
        ;// Rebuild step: r14 may have been overwritten by the BLNE calls above.
230        ADD         step, srcdstStep, srcdstStep
231        VBIF        dQ_0n, dQ_0, dFilt
232
        ;// Advance the pass counter now. The resulting Z flag is consumed by the
        ;// BNE after the stores; VST1 and the plain ADDs below do not alter flags.
233        ADDS        XY, XY, XY
234
        ;// Scatter the p0 column back: lane k goes to row k, using two alternating
        ;// row pointers that each step by 2*srcdstStep.
235        VST1        {dP_0n[0]}, [pSrcDst_P], step
236        VST1        {dP_0n[1]}, [pTmp2], step
237        VST1        {dP_0n[2]}, [pSrcDst_P], step
238        VST1        {dP_0n[3]}, [pTmp2], step
239        VST1        {dP_0n[4]}, [pSrcDst_P], step
240        VST1        {dP_0n[5]}, [pTmp2], step
241        VST1        {dP_0n[6]}, [pSrcDst_P], step
242        VST1        {dP_0n[7]}, [pTmp2], step
243
        ;// Same for the q0 column; r12 and r10 swap roles here
        ;// (pSrcDst_Q = r12, pTmp = r10).
244        ADD         pSrcDst_Q, pSrcDst, #4
245        ADD         pTmp, pSrcDst_Q, srcdstStep
246
247        VST1        {dQ_0n[0]}, [pSrcDst_Q], step
248        VST1        {dQ_0n[1]}, [pTmp], step
249        VST1        {dQ_0n[2]}, [pSrcDst_Q], step
250        VST1        {dQ_0n[3]}, [pTmp], step
251        VST1        {dQ_0n[4]}, [pSrcDst_Q], step
252        VST1        {dQ_0n[5]}, [pTmp], step
253        VST1        {dQ_0n[6]}, [pSrcDst_Q], step
254        VST1        {dQ_0n[7]}, [pTmp], step
255
        ;// Move 4 bytes to the right for the next pass.
256        ADD         pSrcDst, pSrcDst, #4
257
        ;// Z comes from the ADDS above: loop once more after pass 1, fall through after pass 2.
258        BNE         LoopY
259
260        MOV         r0, #OMX_Sts_NoErr
261
262        M_EXIT
263
        ;// Reached when the whole bS word for this pass is zero: nothing is
        ;// filtered or stored, only the per-pass state is advanced.
264NoFilterBS0
        ;// Reload alpha from the pointer post-incremented at function entry (no writeback).
265        VLD1        {dAlpha[]}, [pAlpha]
        ;// Step 4 bytes right and rewind the 8 rows consumed by the loads.
266        ADD         pSrcDst, pSrcDst, #4
267        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
        ;// Advance the pass counter; Z is consumed by the BNE below
        ;// (the VLD1 and plain ADD in between leave the flags alone).
268        ADDS        XY, XY, XY
269        VLD1        {dBeta[]}, [pBeta]
        ;// Skip 4 bytes of thresholds for this unfiltered pass.
270        ADD         pThresholds, pThresholds, #4
271        BNE         LoopY
272
273        MOV         r0, #OMX_Sts_NoErr
274
275        M_END
276
277        ENDIF
278
279
280        END
281
282
283