omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   12290
21;// Date:       Wednesday, April 9, 2008
22;//
23;//
24;//
25;//
26
27        INCLUDE omxtypes_s.h
28        INCLUDE armCOMM_s.h
29
30        M_VARIANTS CortexA8
31
32        IMPORT  armVCM4P10_DeblockingLumabSLT4_unsafe
33        IMPORT  armVCM4P10_DeblockingLumabSGE4_unsafe
34
35        IF CortexA8
36
37LOOP_COUNT  EQU 0x55000000  ;// seed for XY, the combined LoopX/LoopY counter
        ;// XY is doubled (ADDS XY, XY, XY) once per 8-byte segment. With the 0x55
        ;// bit pattern in the top byte, the carry-out is set on every second
        ;// doubling and the value becomes zero on the eighth doubling.
38
39
40;// Function arguments passed in r0-r3
41
42pSrcDst     RN 0        ;// pixel pointer; moved as the loops advance
43srcdstStep  RN 1        ;// byte offset between consecutive rows
44pAlpha      RN 2        ;// pointer to the alpha bytes
45pBeta       RN 3        ;// pointer to the beta bytes
46
47pThresholds RN 5        ;// loaded from the ppThresholds stack argument
48pBS         RN 4        ;// loaded from the ppBS stack argument; read one halfword at a time
49bS10        RN 12       ;// halfword read from pBS: low byte gates lanes 0-3, high byte lanes 4-7
50
51pAlpha_0    RN 2        ;// same register as pAlpha
52pBeta_0     RN 3        ;// same register as pBeta
53
54pAlpha_1    RN 7        ;// pAlpha_0 + 1
55pBeta_1     RN 8        ;// pBeta_0 + 1
56
57
58
59;// Loop control and addressing
60
61XY          RN 9        ;// loop counter, seeded with LOOP_COUNT
62
63pTmp        RN 6        ;// second row pointer; also holds the segment base while results are stored
64step        RN 10       ;// 2 * srcdstStep
65
66;// Pixels: one D register (8 bytes) per row; memory order is P_3,P_2,P_1,P_0,Q_0,Q_1,Q_2,Q_3
67dP_0        DN D4.U8
68dP_1        DN D5.U8
69dP_2        DN D6.U8
70dP_3        DN D7.U8
71dQ_0        DN D8.U8
72dQ_1        DN D9.U8
73dQ_2        DN D10.U8
74dQ_3        DN D11.U8
75
76
77;// Filtering Decision
78dAlpha      DN D0.U8    ;// one alpha byte replicated to all lanes
79dBeta       DN D2.U8    ;// one beta byte replicated to all lanes
80
81dFilt       DN D16.U8   ;// per-lane filter-enable mask
82dAqflg      DN D12.U8   ;// dFilt AND (|q2-q0| < beta); shares D12 with dAp1p0
83dApflg      DN D17.U8   ;// dFilt AND (|p2-p0| < beta); shares D17 with dAq2q0
84
85dAp0q0      DN D13.U8   ;// |q0 - p0|
86dAp1p0      DN D12.U8   ;// |p0 - p1|, then max with |q1 - q0|, then its beta compare mask
87dAq1q0      DN D18.U8   ;// |q1 - q0|
88dAp2p0      DN D19.U8   ;// |p2 - p0|, then its beta compare mask
89dAq2q0      DN D17.U8   ;// |q2 - q0|, then its beta compare mask
90
91;// bSLT4
        ;// NOTE(review): from here to the end of the alias list, only dMask_0,
        ;// dMask_1, Mask_0 and the d*_n result names are referenced in this file.
        ;// The remaining names presumably mirror registers used by the imported
        ;// armVCM4P10_DeblockingLumabSLT4_unsafe / ...bSGE4_unsafe helpers -- confirm
        ;// against those files.
92dTC0        DN D18.U8
93dTC1        DN D19.U8
94dTC01       DN D18.U8   ;// same register as dTC0
95
96dTCs        DN D31.S8
97dTC         DN D31.U8   ;// same register as dTCs, unsigned view
98
99dMask_0     DN D14.U8   ;// set to 0 in every lane before the loop
100dMask_1     DN D15.U8   ;// set to 1 in every byte lane before the loop
101
102Mask_0      RN 11       ;// zero; written into dFilt halves whose bS byte is zero
103
104dTemp       DN D19.U8
105
106;// Computing P0,Q0
107qDq0p0      QN Q10.S16
108qDp1q1      QN Q11.S16
109qDelta      QN Q10.S16  ;// same register as qDq0p0
110dDelta      DN D20.S8
111
112
113;// Computing P1,Q1
114dRp0q0      DN D24.U8
115
116dMaxP       DN D23.U8
117dMinP       DN D22.U8
118
119dMaxQ       DN D19.U8
120dMinQ       DN D21.U8
121
122dDeltaP     DN D26.U8
123dDeltaQ     DN D27.U8
124
125qP_0n       QN Q14.S16
126qQ_0n       QN Q12.S16
127
        ;// Result rows stored by this file after the bS < 4 helper returns
128dQ_0n       DN D24.U8
129dQ_1n       DN D25.U8
130dP_0n       DN D29.U8
131dP_1n       DN D30.U8
132
133;// bSGE4
134
135qSp0q0      QN Q10.U16
136
137qSp2q1      QN Q11.U16
138qSp0q0p1    QN Q12.U16
139qSp3p2      QN Q13.U16
140dHSp0q1     DN D28.U8
141
142qSq2p1      QN Q11.U16  ;// same register as qSp2q1
143qSp0q0q1    QN Q12.U16  ;// same register as qSp0q0p1
144qSq3q2      QN Q13.U16  ;// same register as qSp3p2
145dHSq0p1     DN D28.U8   ;// same register as dHSp0q1
146
147qTemp1      QN Q11.U16  ;// same register as qSp2q1
148qTemp2      QN Q12.U16  ;// same register as qSp0q0p1
149
150dP_0t       DN D28.U8   ;// same register as dHSp0q1
151dQ_0t       DN D22.U8   ;// D22 is also bound to dMinP
152
        ;// Result rows stored by this file after the bS >= 4 helper returns.
        ;// dP_0n, dP_1n, dQ_0n and dQ_1n repeat the definitions above with the
        ;// same registers.
153dP_0n       DN D29.U8
154dP_1n       DN D30.U8
155dP_2n       DN D31.U8   ;// D31 is also bound to dTC/dTCs
156
157dQ_0n       DN D24.U8   ;// D24 is also bound to dRp0q0
158dQ_1n       DN D25.U8
159dQ_2n       DN D28.U8   ;// same register as dP_0t/dHSp0q1 (the earlier note named dQ_0t, which is D22)
160
161
162        ;// Function header
        ;//
        ;// omxVCM4P10_FilterDeblockingLuma_HorEdge_I
        ;//
        ;// In:    r0 = pSrcDst, r1 = srcdstStep, r2 = pAlpha, r3 = pBeta,
        ;//        stack = ppThresholds, ppBS (loaded into pThresholds / pBS)
        ;// Out:   r0 = OMX_Sts_NoErr
        ;//
        ;// Processes four row positions (LoopY), 4*srcdstStep apart, and two
        ;// 8-byte segments per position (LoopX). For each segment one halfword is
        ;// read from pBS:
        ;//   - zero: the segment is skipped;
        ;//   - otherwise eight rows (4 above pSrcDst as passed in, through 3 below,
        ;//     relative to the current position) are loaded, the per-lane filter
        ;//     masks are built from alpha/beta, and one of the two imported helpers
        ;//     is called; bit 2 of the halfword selects the bS >= 4 helper.
        ;// The first LoopY pass uses the byte at pAlpha/pBeta; later passes use
        ;// the byte at pAlpha+1/pBeta+1.
        ;//
        ;// NOTE(review): r11 and d15 are presumably the highest core / NEON
        ;// registers M_START has to preserve; the macro is defined outside this
        ;// file -- confirm.
163        M_START omxVCM4P10_FilterDeblockingLuma_HorEdge_I, r11, d15
164
165        ;// Arguments on the stack (second operand presumably the size in bytes -- confirm in the macro)
166        M_ARG   ppThresholds, 4
167        M_ARG   ppBS, 4
168
169        ;// d0 = dAlpha
170        ;// d2 = dBeta
171
172        ADD         pAlpha_1, pAlpha_0, #1          ;// address of the second alpha byte
173        ADD         pBeta_1, pBeta_0, #1            ;// address of the second beta byte
174
175        VLD1        {dAlpha[]}, [pAlpha_0]          ;// first alpha byte to all lanes
176        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #2    ;// back up 4 rows to the P_3 row
177        VLD1        {dBeta[]}, [pBeta_0]            ;// first beta byte to all lanes
178
179        M_LDR       pBS, ppBS
180        M_LDR       pThresholds, ppThresholds
181
182        MOV         Mask_0,#0
183
184        ;// dMask_0 is D14
185        ;// dMask_1 is D15
        ;// Neither mask is read again in this file; presumably the helpers use
        ;// them -- confirm.
186
187        VMOV        dMask_0, #0
188        VMOV        dMask_1, #1
189
190        ADD         step, srcdstStep, srcdstStep    ;// step = 2 rows, for the interleaved row loads
191
192        LDR         XY,=LOOP_COUNT
193
194        ;// p0-p3 - d4-d7
195        ;// q0-q3 - d8-d11
196LoopY
197LoopX
198        LDRH        bS10, [pBS], #2                 ;// two bS bytes for this 8-byte segment
199        ADD         pTmp, pSrcDst, srcdstStep       ;// pTmp walks the odd rows, pSrcDst the even rows
200        CMP         bS10, #0
201        BEQ         NoFilterBS0                     ;// both bS bytes zero: nothing to filter
202
203        VLD1        dP_3, [pSrcDst], step
204        VLD1        dP_2, [pTmp], step
205        VLD1        dP_1, [pSrcDst], step
206        VLD1        dP_0, [pTmp], step
207        VLD1        dQ_0, [pSrcDst], step
208        VABD        dAp1p0, dP_0, dP_1              ;// |p0 - p1|
209        VLD1        dQ_1, [pTmp]
210        VABD        dAp0q0, dQ_0, dP_0              ;// |q0 - p0|
211        VLD1        dQ_2, [pSrcDst], srcdstStep
212
213        VABD        dAq1q0, dQ_1, dQ_0              ;// |q1 - q0|
214        VABD        dAp2p0, dP_2, dP_0              ;// |p2 - p0|
215        VCGT        dFilt, dAlpha, dAp0q0           ;// lane mask: |q0 - p0| < alpha
216
217        TST         bS10, #0xff                     ;// Z set when the low bS byte is zero
218        VMAX        dAp1p0, dAq1q0, dAp1p0          ;// max(|q1 - q0|, |p0 - p1|)
219        VABD        dAq2q0, dQ_2, dQ_0              ;// |q2 - q0|
220
221        VMOVEQ.U32  dFilt[0], Mask_0                ;// low bS byte zero: disable lanes 0-3
222        TST         bS10, #0xff00                   ;// Z set when the high bS byte is zero
223
224        VCGT        dAp2p0, dBeta, dAp2p0           ;// lane mask: |p2 - p0| < beta
225        VCGT        dAp1p0, dBeta, dAp1p0           ;// lane mask: max(|q1 - q0|, |p0 - p1|) < beta
226
227        VMOVEQ.U32  dFilt[1], Mask_0                ;// high bS byte zero: disable lanes 4-7
228
229        VCGT        dAq2q0, dBeta, dAq2q0           ;// lane mask: |q2 - q0| < beta
230        VLD1        dQ_3, [pSrcDst]                 ;// no writeback: pSrcDst stays on the Q_3 row
231        VAND        dFilt, dFilt, dAp1p0            ;// combine the alpha and beta conditions
232        TST         bS10, #4                        ;// bit 2 of bS10 selects the bS >= 4 path below
233
234        VAND        dAqflg, dFilt, dAq2q0
235        VAND        dApflg, dFilt, dAp2p0
236
237        BNE         bSGE4                           ;// uses Z from the TST #4 above
238bSLT4
239        ;// bS < 4 Filtering
        ;// Rewind 5 rows from the Q_3 row to the P_1 row, the first row stored below.
        ;// NOTE(review): pThresholds is not advanced on this path, unlike the
        ;// other two; presumably the helper advances it -- confirm.
240        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #2
241        SUB         pSrcDst, pSrcDst, srcdstStep
242
243        BL          armVCM4P10_DeblockingLumabSLT4_unsafe
244
245        ;// Result Storage: rows P_1, P_0, Q_0, Q_1
246        VST1        dP_1n, [pSrcDst], srcdstStep
247        VST1        dP_0n, [pSrcDst], srcdstStep
248        SUB         pTmp, pSrcDst, srcdstStep, LSL #2   ;// pTmp = P_3 row of this segment
249        VST1        dQ_0n, [pSrcDst], srcdstStep
250        ADDS        XY, XY, XY                      ;// advance the loop counter; C and Z are consumed below
251        VST1        dQ_1n, [pSrcDst]
252        ADD         pSrcDst, pTmp, #8               ;// next 8-byte segment, back on the P_3 row
253
254        BCC         LoopX                           ;// carry clear: second segment still to do
255        B           ExitLoopY
256
257NoFilterBS0
        ;// Segment skipped: advance the pixel pointer, loop counter and thresholds only.
258        ADD         pSrcDst, pSrcDst, #8
259        ADDS        XY, XY, XY
260        ADD         pThresholds, pThresholds, #2
261        BCC         LoopX
262        B           ExitLoopY
263bSGE4
264        ;// bS >= 4 Filtering
        ;// Rewind 6 rows from the Q_3 row to the P_2 row, the first row stored below.
265        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #2
266        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #1
267        BL          armVCM4P10_DeblockingLumabSGE4_unsafe
268
269        ;// Result Storage: rows P_2, P_1, P_0, Q_0, Q_1, Q_2
270        VST1        dP_2n, [pSrcDst], srcdstStep
271        VST1        dP_1n, [pSrcDst], srcdstStep
272        VST1        dP_0n, [pSrcDst], srcdstStep
273        SUB         pTmp, pSrcDst, srcdstStep, LSL #2   ;// pTmp = P_3 row of this segment
274        VST1        dQ_0n, [pSrcDst], srcdstStep
275        ADDS        XY,XY,XY                        ;// advance the loop counter; C and Z are consumed below
276        VST1        dQ_1n, [pSrcDst], srcdstStep
277        ADD         pThresholds, pThresholds, #2
278        VST1        dQ_2n, [pSrcDst]
279
280        ADD         pSrcDst, pTmp, #8               ;// next 8-byte segment, back on the P_3 row
281        BCC         LoopX
282
283ExitLoopY
284
        ;// Undo the two 8-byte segment advances and move down 4 rows. Alpha and
        ;// beta are reloaded from their second byte for the remaining passes.
        ;// None of these instructions sets flags, so BNE still sees Z from the
        ;// last ADDS on XY.
285        SUB         pSrcDst, pSrcDst, #16
286        VLD1        {dAlpha[]}, [pAlpha_1]
287        ADD         pSrcDst, pSrcDst, srcdstStep, LSL #2
288        VLD1        {dBeta[]}, [pBeta_1]
289        BNE         LoopY                           ;// XY not yet zero: more row positions to process
290
291        MOV         r0, #OMX_Sts_NoErr
292
293        M_END
294
295    ENDIF
296
297
298
299
300        END
301
302
303