1;//
2;//
3;// File Name:  omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
4;// OpenMAX DL: v1.0.2
5;// Revision:   12290
6;// Date:       Wednesday, April 9, 2008
7;//
8;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
9;//
10;//
11;//
12
13        INCLUDE omxtypes_s.h
14        INCLUDE armCOMM_s.h
15
16        M_VARIANTS CortexA8
17
18        IMPORT  armVCM4P10_DeblockingLumabSLT4_unsafe
19        IMPORT  armVCM4P10_DeblockingLumabSGE4_unsafe
20
21        IF CortexA8
22
23LOOP_COUNT  EQU 0x11000000      ;// Seed for the XY edge counter (bits 28 and 24 set)
;// XY is doubled (ADDS XY, XY, XY) once per edge.  The 4th doubling shifts
;// bit 28 out into the carry flag, which ends one LoopX pass; the 8th
;// doubling shifts the remaining bit out and leaves XY == 0, which ends LoopY.
24
25
26;// Function arguments
27
28pSrcDst     RN 0                ;// r0: pixel pointer, updated in place as edges are walked
29srcdstStep  RN 1                ;// r1: byte offset from one row to the next
30pAlpha      RN 2                ;// r2: pointer to the alpha bytes
31pBeta       RN 3                ;// r3: pointer to the beta bytes
32
33pThresholds RN 5                ;// loaded from the stack argument ppThresholds
34pBS         RN 4                ;// loaded from the stack argument ppBS
35bS10        RN 12               ;// halfword read from pBS: low byte gates rows 0-3, high byte rows 4-7
36
37pAlpha_0    RN 2                ;// same register as pAlpha: address of alpha[0]
38pBeta_0     RN 3                ;// same register as pBeta:  address of beta[0]
39
40pAlpha_1    RN 7                ;// pAlpha_0 + 1: address of alpha[1]
41pBeta_1     RN 8                ;// pBeta_0 + 1:  address of beta[1]
42
43pTmp        RN 10               ;// pSrcDst + srcdstStep: walks the odd rows (pSrcDst walks the even ones)
44pTmpStep    RN 11               ;// 2 * srcdstStep: post-increment used by both row pointers
45
46;// Loop
47
48XY          RN 9                ;// edge counter, seeded with LOOP_COUNT
49
50;// Rows input
;// Destinations of the eight 8-byte row loads.  The VZIP sequence in LoopX
;// transposes them in place, so afterwards each register holds one pixel
;// column; that is why every dRowN shares its D register with one of the
;// dP_x/dQ_x aliases of the "Pixels" section (mapping listed below).
51dRow0       DN D7.U8
52dRow1       DN D8.U8
53dRow2       DN D5.U8
54dRow3       DN D10.U8
55dRow4       DN D6.U8
56dRow5       DN D9.U8
57dRow6       DN D4.U8
58dRow7       DN D11.U8
59
60;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2
61;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3
62
63;// Rows output
;// Store sources of the bS < 4 path.  P3, P2, Q2 and Q3 are taken from the
;// input registers; P1, P0, Q0 and Q1 are taken from the dP_1n/dP_0n/dQ_0n/
;// dQ_1n registers.
64dRown0      DN D7.U8
65dRown1      DN D24.U8
66dRown2      DN D30.U8
67dRown3      DN D10.U8
68dRown4      DN D6.U8
69dRown5      DN D25.U8
70dRown6      DN D29.U8
71dRown7      DN D11.U8
72
73;// dP_0n       DN D29.U8
74;// dP_1n       DN D30.U8
75;// dP_2n       DN D31.U8
76;//
77;// dQ_0n       DN D24.U8   ;!!;Temp2
78;// dQ_1n       DN D25.U8   ;!!;Temp2
79;// dQ_2n       DN D28.U8   ;!!;dQ_0t
80;//
81;// dRown0 - dP_3,  dRown1 - dQ_0n
82;// dRown2 - dP_1n, dRown3 - dQ_2
83;// dRown4 - dP_2,  dRown5 - dQ_1n
84;// dRown6 - dP_0n, dRown7 - dQ_3
85
;// Store sources of the bS >= 4 path.  Here P2 and Q2 are also taken from
;// new registers (dP_2n = D31, dQ_2n = D28); only P3 and Q3 are the inputs.
86dRow0n      DN D7.U8
87dRow1n      DN D24.U8
88dRow2n      DN D30.U8
89dRow3n      DN D28.U8
90dRow4n      DN D31.U8
91dRow5n      DN D25.U8
92dRow6n      DN D29.U8
93dRow7n      DN D11.U8
94
95;// dRow0n - dP_3,  dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n
96;// dRow4n - dP_2n, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3
97
98;// Pixels
;// One register per pixel column after the input transpose; lane k holds
;// the pixel of row k.
99dP_0        DN D4.U8
100dP_1        DN D5.U8
101dP_2        DN D6.U8
102dP_3        DN D7.U8
103dQ_0        DN D8.U8
104dQ_1        DN D9.U8
105dQ_2        DN D10.U8
106dQ_3        DN D11.U8
107
108
109;// Filtering Decision
110dAlpha      DN D0.U8            ;// alpha byte replicated into all 8 lanes
111dBeta       DN D2.U8            ;// beta byte replicated into all 8 lanes
112
113dFilt       DN D16.U8           ;// per-row mask: alpha/beta tests passed and bS byte non-zero
114dAqflg      DN D12.U8           ;// dFilt AND (beta > |q2-q0|); overwrites dAp1p0 (same D12)
115dApflg      DN D17.U8           ;// dFilt AND (beta > |p2-p0|); overwrites dAq2q0 (same D17)
116
;// Absolute differences; the VCGT instructions later replace several of
;// them in place with the result of the comparison against alpha or beta.
117dAp0q0      DN D13.U8           ;// |p0-q0|
118dAp1p0      DN D12.U8           ;// |p1-p0|, then max(|q1-q0|,|p1-p0|), then beta > max
119dAq1q0      DN D18.U8           ;// |q1-q0|
120dAp2p0      DN D19.U8           ;// |p2-p0|, then beta > |p2-p0|
121dAq2q0      DN D17.U8           ;// |q2-q0|, then beta > |q2-q0|
122
;// NOTE(review): apart from dMask_0, dMask_1, Mask_0 and the dP_xn/dQ_xn
;// outputs, none of the aliases from here to the end of the definitions is
;// referenced by the code in this file.  Presumably they mirror the register
;// allocation used inside the two _unsafe helpers - confirm against them.
123;// bSLT4
124dTC0        DN D18.U8
125dTC1        DN D19.U8
126dTC01       DN D18.U8
127
128dTCs        DN D31.S8
129dTC         DN D31.U8
130
131dMask_0     DN D14.U8           ;// set to 0 in every lane by the function prologue
132dMask_1     DN D15.U8           ;// set to 1 in every lane by the function prologue
133
134Mask_0      RN 6                ;// r6 = 0, moved into dFilt halves whose bS byte is zero
135
136dTemp       DN D19.U8
137
138;// Computing P0,Q0
139qDq0p0      QN Q10.S16
140qDp1q1      QN Q11.S16
141qDelta      QN Q10.S16  ; reuse qDq0p0
142dDelta      DN D20.S8
143
144
145;// Computing P1,Q1
146dRp0q0      DN D24.U8
147
148dMaxP       DN D23.U8
149dMinP       DN D22.U8
150
151dMaxQ       DN D19.U8
152dMinQ       DN D21.U8
153
154dDeltaP     DN D26.U8
155dDeltaQ     DN D27.U8
156
157qP_0n       QN Q14.S16
158qQ_0n       QN Q12.S16
159
;// Registers read back after the bS < 4 helper returns.
160dQ_0n       DN D24.U8
161dQ_1n       DN D25.U8
162dP_0n       DN D29.U8
163dP_1n       DN D30.U8
164
165;// bSGE4
166
167qSp0q0      QN Q10.U16
168
169qSp2q1      QN Q11.U16
170qSp0q0p1    QN Q12.U16
171qSp3p2      QN Q13.U16
172dHSp0q1     DN D28.U8
173
174qSq2p1      QN Q11.U16
175qSp0q0q1    QN Q12.U16
176qSq3q2      QN Q13.U16  ;!!
177dHSq0p1     DN D28.U8   ;!!
178
179qTemp1      QN Q11.U16  ;!!;qSp2q1
180qTemp2      QN Q12.U16  ;!!;qSp0q0p1
181
182dP_0t       DN D28.U8   ;!!;dHSp0q1
183dQ_0t       DN D22.U8   ;!!;Temp1
184
;// Registers read back after the bS >= 4 helper returns.  dP_0n, dP_1n,
;// dQ_0n and dQ_1n repeat the definitions given above with the same
;// registers; dP_2n and dQ_2n are new here.
185dP_0n       DN D29.U8
186dP_1n       DN D30.U8
187dP_2n       DN D31.U8
188
189dQ_0n       DN D24.U8   ;!!;Temp2
190dQ_1n       DN D25.U8   ;!!;Temp2
191dQ_2n       DN D28.U8   ;!!;dQ_0t
192
193
194        ;// Function header
        ;//
        ;// omxVCM4P10_FilterDeblockingLuma_VerEdge_I
        ;//
        ;// Filters, in place, the pixels on both sides of four vertical edges
        ;// spaced 4 bytes apart, over 16 rows (two passes of 8 rows each).
        ;// For every edge and 8-row group whose two bS bytes are not both
        ;// zero, eight 8-byte rows starting 4 bytes to the left of the edge
        ;// are loaded, transposed into the pixel columns P3..P0 / Q0..Q3,
        ;// handed to one of the two _unsafe helpers, transposed back and
        ;// stored.
        ;//
        ;// In:   r0     pSrcDst     - first pixel to the right of the first
        ;//                            edge (q0), row 0
        ;//       r1     srcdstStep  - byte offset from one row to the next
        ;//       r2     pAlpha      - [0] is used for the first edge of each
        ;//                            8-row group, [1] for the other three
        ;//       r3     pBeta       - indexed like pAlpha
        ;//       stack  pThresholds - kept in r5 and stepped per edge; never
        ;//                            dereferenced in this file (presumably
        ;//                            read by the bS < 4 helper - confirm)
        ;//       stack  pBS         - 16 bS bytes; each edge / 8-row group
        ;//                            reads two of them
        ;// Out:  r0 = OMX_Sts_NoErr (no argument checks are made here)
        ;//
        ;// NOTE(review): M_START/M_END/M_ARG/M_LDR come from armCOMM_s.h.
        ;// The "r11, d15" operands presumably make the prologue/epilogue
        ;// save and restore r4-r11 and d8-d15 - confirm against the macros.
195        M_START omxVCM4P10_FilterDeblockingLuma_VerEdge_I, r11, d15
196
197        ;//Arguments on the stack
198        M_ARG   ppThresholds, 4
199        M_ARG   ppBS, 4
200
201        ;// d0-dAlpha_0
202        ;// d2-dBeta_0
203
204        ADD         pAlpha_1, pAlpha_0, #1      ;// address of alpha[1]
205        ADD         pBeta_1, pBeta_0, #1        ;// address of beta[1]
206
207        VLD1        {dAlpha[]}, [pAlpha_0]      ;// alpha[0] into all 8 lanes
208        SUB         pSrcDst, pSrcDst, #4        ;// 8-byte rows start at p3, 4 bytes left of the edge
209        VLD1        {dBeta[]}, [pBeta_0]        ;// beta[0] into all 8 lanes
210
211        M_LDR       pBS, ppBS
212        M_LDR       pThresholds, ppThresholds
213
214        MOV         Mask_0,#0                   ;// zero source for the VMOVEQ that clears dFilt halves
215
216        ;dMask_0-14
217        ;dMask_1-15
218
        ;// dMask_0/dMask_1 are not read again in this file; presumably the
        ;// _unsafe helpers expect them in d14/d15 - confirm.
219        VMOV        dMask_0, #0
220        VMOV        dMask_1, #1
221
222        LDR         XY,=LOOP_COUNT              ;// two marker bits: 4 edges per pass, 2 passes
223
224        ADD         pTmpStep, srcdstStep, srcdstStep    ;// each row pointer skips every other row
225
226        ;// p0-p3 - d4-d7
227        ;// q0-q3 - d8-d11
        ;// LoopY and LoopX label the same instruction: one iteration handles
        ;// one edge of one 8-row group.
228LoopY
229LoopX
230        LDRH        bS10, [pBS], #4             ;// two bS bytes; pBS then moves on by 4
231
232        CMP         bS10, #0
233        BEQ         NoFilterBS0                 ;// both bS bytes zero: leave the pixels untouched
234
235        ;// Load 8 rows of data
        ;// pSrcDst fetches rows 0,2,4,6 and pTmp rows 1,3,5,7; the first
        ;// VZIPs of the transpose are interleaved with the loads.
236        ADD         pTmp, pSrcDst, srcdstStep
237        VLD1        dRow0, [pSrcDst], pTmpStep
238        VLD1        dRow1, [pTmp], pTmpStep
239        VLD1        dRow2, [pSrcDst], pTmpStep
240        VZIP.8      dRow0, dRow1
241        VLD1        dRow3, [pTmp], pTmpStep
242        VLD1        dRow4, [pSrcDst], pTmpStep
243        VZIP.8      dRow2, dRow3
244        VLD1        dRow5, [pTmp], pTmpStep
245        VLD1        dRow6, [pSrcDst], pTmpStep
246        VLD1        dRow7, [pTmp], pTmpStep
247        VZIP.8      dRow4, dRow5
248        VZIP.16     dRow1, dRow3
249
250
251        ;// dRow0 = [q3r0 q2r0 q1r0 q0r0 p0r0 p1r0 p2r0 p3r0]
252        ;// dRow1 = [q3r1 q2r1 q1r1 q0r1 p0r1 p1r1 p2r1 p3r1]
253        ;// dRow2 = [q3r2 q2r2 q1r2 q0r2 p0r2 p1r2 p2r2 p3r2]
254        ;// dRow3 = [q3r3 q2r3 q1r3 q0r3 p0r3 p1r3 p2r3 p3r3]
255        ;// dRow4 = [q3r4 q2r4 q1r4 q0r4 p0r4 p1r4 p2r4 p3r4]
256        ;// dRow5 = [q3r5 q2r5 q1r5 q0r5 p0r5 p1r5 p2r5 p3r5]
257        ;// dRow6 = [q3r6 q2r6 q1r6 q0r6 p0r6 p1r6 p2r6 p3r6]
258        ;// dRow7 = [q3r7 q2r7 q1r7 q0r7 p0r7 p1r7 p2r7 p3r7]
259
260        ;// 8x8 Transpose
261
262        VZIP.8      dRow6, dRow7
263
264        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3    ;// the 4 loads advanced it by 8 rows: rewind
265        VZIP.16     dRow0, dRow2
266        VZIP.16     dRow5, dRow7
267
268
269        VZIP.16     dRow4, dRow6
270        VZIP.32     dRow1, dRow5
271        VZIP.32     dRow2, dRow6
272        VZIP.32     dRow3, dRow7
273        VZIP.32     dRow0, dRow4
274
275
276        ;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2
277        ;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3
278
279        ;// dQ_0 = [q0r7 q0r6 q0r5 q0r4 q0r3 q0r2 q0r1 q0r0]
280        ;// dQ_1 = [q1r7 q1r6 q1r5 q1r4 q1r3 q1r2 q1r1 q1r0]
281        ;// dQ_2 = [q2r7 q2r6 q2r5 q2r4 q2r3 q2r2 q2r1 q2r0]
282        ;// dQ_3 = [q3r7 q3r6 q3r5 q3r4 q3r3 q3r2 q3r1 q3r0]
283
284        ;// dP_0 = [p0r7 p0r6 p0r5 p0r4 p0r3 p0r2 p0r1 p0r0]
285        ;// dP_1 = [p1r7 p1r6 p1r5 p1r4 p1r3 p1r2 p1r1 p1r0]
286        ;// dP_2 = [p2r7 p2r6 p2r5 p2r4 p2r3 p2r2 p2r1 p2r0]
287        ;// dP_3 = [p3r7 p3r6 p3r5 p3r4 p3r3 p3r2 p3r1 p3r0]
288
        ;// Filtering decision.  The TST results are consumed by the VMOVEQ
        ;// that follows each of them; the NEON instructions in between do
        ;// not change the ARM flags, so the interleaving must keep every
        ;// TST ahead of its consumer with no other flag-setting instruction
        ;// in between.
289        VABD        dAp0q0, dP_0, dQ_0          ;// |p0-q0|
290        VABD        dAp1p0, dP_1, dP_0          ;// |p1-p0|
291
292        VABD        dAq1q0, dQ_1, dQ_0          ;// |q1-q0|
293        VABD        dAp2p0, dP_2, dP_0          ;// |p2-p0|
294
295        TST         bS10, #0xff                 ;// Z set when the low bS byte is zero
296        VCGT        dFilt, dAlpha, dAp0q0       ;// alpha > |p0-q0|
297
298        VMAX        dAp1p0, dAq1q0, dAp1p0      ;// max(|q1-q0|, |p1-p0|)
299        VABD        dAq2q0, dQ_2, dQ_0          ;// |q2-q0|
300
301        VMOVEQ.U32  dFilt[0], Mask_0            ;// low bS byte zero: disable rows 0-3
302        TST         bS10, #0xff00               ;// Z set when the high bS byte is zero
303
304        VCGT        dAp2p0, dBeta, dAp2p0       ;// beta > |p2-p0|
305        VCGT        dAp1p0, dBeta, dAp1p0       ;// beta > max(|q1-q0|, |p1-p0|)
306
307        VMOVEQ.U32  dFilt[1], Mask_0            ;// high bS byte zero: disable rows 4-7
308
309        VCGT        dAq2q0, dBeta, dAq2q0       ;// beta > |q2-q0|
310        VAND        dFilt, dFilt, dAp1p0        ;// last read of D12 as dAp1p0
311        TST         bS10, #4                    ;// bit 2 of bS10 selects the bS >= 4 path below
312
        ;// dAqflg reuses D12 and dApflg reuses D17 (= dAq2q0), so dAqflg has
        ;// to be formed first, while D17 still holds the q2 comparison.
313        VAND        dAqflg, dFilt, dAq2q0
314        VAND        dApflg, dFilt, dAp2p0
315
316        BNE         bSGE4
317bSLT4
318        ;// bS < 4 Filtering
319
        ;// Helper is defined in another file; the code below reads its
        ;// results from dP_1n, dP_0n, dQ_0n and dQ_1n.
320        BL          armVCM4P10_DeblockingLumabSLT4_unsafe
321
322        ;// Transpose
        ;// Columns back to rows; P3, P2, Q2, Q3 are the unmodified inputs.
323
324        VZIP.8      dP_3,  dP_2
325        VZIP.8      dP_1n, dP_0n
326        VZIP.8      dQ_0n, dQ_1n
327        VZIP.8      dQ_2,  dQ_3
328
329
330        VZIP.16     dP_3,  dP_1n
331        ADD         pTmp, pSrcDst, srcdstStep   ;// odd-row pointer for the stores
332        VZIP.16     dQ_0n, dQ_2
333        VZIP.16     dQ_1n, dQ_3
334        VZIP.16     dP_2,  dP_0n
335
336        VZIP.32     dP_3,  dQ_0n
337        VZIP.32     dP_1n, dQ_2
338        VZIP.32     dP_2,  dQ_1n
339        VZIP.32     dP_0n, dQ_3
340
341        ;// dRown0 - dP_3,  dRown1 - dQ_0n
342        ;// dRown2 - dP_1n, dRown3 - dQ_2
343        ;// dRown4 - dP_2,  dRown5 - dQ_1n
344        ;// dRown6 - dP_0n, dRown7 - dQ_3
345
346        VST1        dRown0, [pSrcDst], pTmpStep
347        VST1        dRown1, [pTmp], pTmpStep
348        VST1        dRown2, [pSrcDst], pTmpStep
349        VST1        dRown3, [pTmp], pTmpStep
350        ;1
351        VST1        dRown4, [pSrcDst], pTmpStep
352        VST1        dRown5, [pTmp], pTmpStep
353        ADDS        XY, XY, XY                  ;// C and Z feed BCC below and BNE at ExitLoopY
354        VST1        dRown6, [pSrcDst], pTmpStep
        ;// NOTE(review): +2 here, +4 on the other two paths.  The -14 at
        ;// ExitLoopY nets to +2 per pass only if every edge moves
        ;// pThresholds by 4 in total, so the bS < 4 helper presumably
        ;// advances r5 by 2 itself - confirm against the helper.
355        ADD         pThresholds, pThresholds, #2
        ;// The post-increment differs from the other stores, but pTmp is
        ;// recomputed before its next use, so the final value is unused.
356        VST1        dRown7, [pTmp], srcdstStep
357
358        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3    ;// rewind the 8 stored rows
359        VLD1        {dAlpha[]}, [pAlpha_1]      ;// remaining edges of this pass use alpha[1]
360        ADD         pSrcDst, pSrcDst, #4        ;// next edge, 4 bytes to the right
361        VLD1        {dBeta[]}, [pBeta_1]        ;// ... and beta[1]
362
363        BCC         LoopX                       ;// no carry out of XY yet: more edges in this pass
364        B           ExitLoopY
365
366NoFilterBS0
        ;// Skipped edge: do the same bookkeeping as the filtering paths,
        ;// without touching any pixel.
367        ADD         pSrcDst, pSrcDst, #4
368        ADDS        XY, XY, XY
369        VLD1        {dAlpha[]}, [pAlpha_1]
370        ADD         pThresholds, pThresholds, #4
371        VLD1        {dBeta[]}, [pBeta_1]
372        BCC         LoopX
373        B           ExitLoopY
374bSGE4
375        ;// bS >= 4 Filtering
376
        ;// Helper is defined in another file; the code below reads its
        ;// results from dP_2n, dP_1n, dP_0n, dQ_0n, dQ_1n and dQ_2n.
377        BL          armVCM4P10_DeblockingLumabSGE4_unsafe
378
379        ;// Transpose
        ;// Columns back to rows; only P3 and Q3 are the unmodified inputs.
380
381        VZIP.8      dP_3,  dP_2n
382        VZIP.8      dP_1n, dP_0n
383        VZIP.8      dQ_0n, dQ_1n
384        VZIP.8      dQ_2n, dQ_3
385
386        VZIP.16     dP_3,  dP_1n
387        ADD         pTmp, pSrcDst, srcdstStep   ;// odd-row pointer for the stores
388        VZIP.16     dQ_0n, dQ_2n
389        VZIP.16     dQ_1n, dQ_3
390        VZIP.16     dP_2n, dP_0n
391
392        VZIP.32     dP_3,  dQ_0n
393        VZIP.32     dP_1n, dQ_2n
394        VZIP.32     dP_2n, dQ_1n
395        VZIP.32     dP_0n, dQ_3
396
397        ;// dRow0n - dP_3,  dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n
398        ;// dRow4n - dP_2n, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3
399
400        VST1        dRow0n, [pSrcDst], pTmpStep
401        VST1        dRow1n, [pTmp], pTmpStep
402        VST1        dRow2n, [pSrcDst], pTmpStep
403        VST1        dRow3n, [pTmp], pTmpStep
404        VST1        dRow4n, [pSrcDst], pTmpStep
405        VST1        dRow5n, [pTmp], pTmpStep
406        ADDS        XY,XY,XY                    ;// C and Z feed BCC below and BNE at ExitLoopY
407        VST1        dRow6n, [pSrcDst], pTmpStep
408        ADD         pThresholds, pThresholds, #4
409        VST1        dRow7n, [pTmp], pTmpStep
410
411        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3    ;// rewind the 8 stored rows
412        VLD1        {dAlpha[]}, [pAlpha_1]      ;// remaining edges of this pass use alpha[1]
413        ADD         pSrcDst, pSrcDst, #4        ;// next edge, 4 bytes to the right
414        VLD1        {dBeta[]}, [pBeta_1]        ;// ... and beta[1]
415
416        BCC         LoopX                       ;// carry set: fall through to ExitLoopY
417
418ExitLoopY
        ;// Reached after four edges.  None of the instructions below sets
        ;// the flags, so BNE still tests Z from the last ADDS XY: XY is
        ;// non-zero after the first pass and zero after the second.
419        SUB         pBS, pBS, #14               ;// 4 reads advanced it by 16: net +2 per pass
420        SUB         pThresholds, pThresholds, #14   ;// intended to mirror pBS (net +2 per pass)
421        SUB         pSrcDst, pSrcDst, #16       ;// undo the four 4-byte edge steps
422        VLD1        {dAlpha[]}, [pAlpha_0]      ;// first edge of the next pass uses alpha[0]
423        ADD         pSrcDst, pSrcDst, srcdstStep, LSL #3    ;// move down to the next 8 rows
424        VLD1        {dBeta[]}, [pBeta_0]        ;// ... and beta[0]
425        BNE         LoopY
426
427        MOV         r0, #OMX_Sts_NoErr
428
429        M_END
430
431    ENDIF
432
433
434        END
435
436
437