1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   12290
21;// Date:       Wednesday, April 9, 2008
22;//
23;//
24;//
25;//
26
27        INCLUDE omxtypes_s.h
28        INCLUDE armCOMM_s.h
29
30        M_VARIANTS CortexA8
31
32        IMPORT  armVCM4P10_DeblockingLumabSLT4_unsafe
33        IMPORT  armVCM4P10_DeblockingLumabSGE4_unsafe
34
35        IF CortexA8
36
37LOOP_COUNT  EQU 0x11000000
;// LOOP_COUNT is the initial value of the XY loop register. The function
;// doubles XY with ADDS once per inner-loop pass: bit 28 is shifted out into
;// the carry flag on the 4th doubling and bit 24 on the 8th doubling, at
;// which point the result is also zero. Carry set ends the inner loop
;// (LoopX); a zero result ends the outer loop (LoopY).
38
39
40;// Function arguments
;// r0-r3 carry the four register arguments. pThresholds and pBS are loaded
;// by the function body from the stack arguments ppThresholds and ppBS.
41
42pSrcDst     RN 0
43srcdstStep  RN 1
44pAlpha      RN 2
45pBeta       RN 3
46
47pThresholds RN 5
48pBS         RN 4
49bS10        RN 12           ;// halfword read from pBS on each inner-loop pass
50
;// The _0 aliases are the incoming pAlpha / pBeta pointers themselves; the
;// _1 aliases are set by the function to those pointers plus one byte.
51pAlpha_0    RN 2
52pBeta_0     RN 3
53
54pAlpha_1    RN 7
55pBeta_1     RN 8
56
57pTmp        RN 10           ;// second row pointer, set to pSrcDst + srcdstStep
58pTmpStep    RN 11           ;// set to 2 * srcdstStep
59
60;// Loop
61
62XY          RN 9            ;// combined inner/outer loop counter, see LOOP_COUNT
63
64;// Rows input
;// Each dRowN is loaded with 8 bytes from one image row. The register numbers
;// are chosen so that, after the in-register 8x8 transpose, each D register
;// already holds the pixel column named in the mapping below (compare with
;// the dP_x / dQ_x aliases under "Pixels").
65dRow0       DN D7.U8
66dRow1       DN D8.U8
67dRow2       DN D5.U8
68dRow3       DN D10.U8
69dRow4       DN D6.U8
70dRow5       DN D9.U8
71dRow6       DN D4.U8
72dRow7       DN D11.U8
73
74;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2
75;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3
76
77;// Rows output
;// dRown0..dRown7 are the registers stored back on the bSLT4 path
;// (dP_3, dP_2, dQ_2 and dQ_3 keep their input registers on that path).
78dRown0      DN D7.U8
79dRown1      DN D24.U8
80dRown2      DN D30.U8
81dRown3      DN D10.U8
82dRown4      DN D6.U8
83dRown5      DN D25.U8
84dRown6      DN D29.U8
85dRown7      DN D11.U8
86
87;// dP_0n       DN D29.U8
88;// dP_1n       DN D30.U8
89;// dP_2n       DN D31.U8
90;//
91;// dQ_0n       DN D24.U8   ;!!;Temp2
92;// dQ_1n       DN D25.U8   ;!!;Temp2
93;// dQ_2n       DN D28.U8   ;!!;dQ_0t
94;//
95;// dRown0 - dP_3,  dRown1 - dQ_0n
96;// dRown2 - dP_1n, dRown3 - dQ_2
97;// dRown4 - dP_2,  dRown5 - dQ_1n
98;// dRown6 - dP_0n, dRown7 - dQ_3
99
;// dRow0n..dRow7n are the registers stored back on the bSGE4 path; here
;// rows 3 and 4 come from D28 / D31 (dQ_2n / dP_2n) instead of D10 / D6.
100dRow0n      DN D7.U8
101dRow1n      DN D24.U8
102dRow2n      DN D30.U8
103dRow3n      DN D28.U8
104dRow4n      DN D31.U8
105dRow5n      DN D25.U8
106dRow6n      DN D29.U8
107dRow7n      DN D11.U8
108
109;// dRow0n - dP_3, dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n
110;// dRow4n - dP_2n, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3
111
112;// Pixels
;// One D register per pixel column after the input transpose: P columns in
;// D4-D7, Q columns in D8-D11.
113dP_0        DN D4.U8
114dP_1        DN D5.U8
115dP_2        DN D6.U8
116dP_3        DN D7.U8
117dQ_0        DN D8.U8
118dQ_1        DN D9.U8
119dQ_2        DN D10.U8
120dQ_3        DN D11.U8
121
122
123;// Filtering Decision
124dAlpha      DN D0.U8
125dBeta       DN D2.U8
126
;// NOTE: dAqflg shares D12 with dAp1p0, and dApflg shares D17 with dAq2q0.
;// The function writes each flag only after the aliased difference has been
;// consumed, so the instruction order there matters.
127dFilt       DN D16.U8
128dAqflg      DN D12.U8
129dApflg      DN D17.U8
130
131dAp0q0      DN D13.U8
132dAp1p0      DN D12.U8
133dAq1q0      DN D18.U8
134dAp2p0      DN D19.U8
135dAq2q0      DN D17.U8
136
137;// bSLT4
;// NOTE(review): apart from dMask_0, dMask_1 and Mask_0, the aliases from
;// here down to dDeltaQ / qQ_0n are not referenced by any instruction in this
;// file. Presumably they mirror the register usage of the imported
;// armVCM4P10_DeblockingLumabSLT4_unsafe routine - confirm against its source.
138dTC0        DN D18.U8
139dTC1        DN D19.U8
140dTC01       DN D18.U8
141
142dTCs        DN D31.S8
143dTC         DN D31.U8
144
145dMask_0     DN D14.U8       ;// initialised to 0 by the function
146dMask_1     DN D15.U8       ;// initialised to 1 by the function
147
148Mask_0      RN 6            ;// core register holding 0, used to clear halves of dFilt
149
150dTemp       DN D19.U8
151
152;// Computing P0,Q0
153qDq0p0      QN Q10.S16
154qDp1q1      QN Q11.S16
155qDelta      QN Q10.S16  ; reuse qDq0p0
156dDelta      DN D20.S8
157
158
159;// Computing P1,Q1
160dRp0q0      DN D24.U8
161
162dMaxP       DN D23.U8
163dMinP       DN D22.U8
164
165dMaxQ       DN D19.U8
166dMinQ       DN D21.U8
167
168dDeltaP     DN D26.U8
169dDeltaQ     DN D27.U8
170
171qP_0n       QN Q14.S16
172qQ_0n       QN Q12.S16
173
;// Filtered-output aliases. dQ_0n, dQ_1n, dP_0n and dP_1n are declared again,
;// with the same registers, in the bSGE4 section below.
174dQ_0n       DN D24.U8
175dQ_1n       DN D25.U8
176dP_0n       DN D29.U8
177dP_1n       DN D30.U8
178
179;// bSGE4
;// NOTE(review): the q* sums and dP_0t / dQ_0t below are not referenced by any
;// instruction in this file. Presumably they mirror the register usage of the
;// imported armVCM4P10_DeblockingLumabSGE4_unsafe routine - confirm against
;// its source. Several names deliberately alias the same register.
180
181qSp0q0      QN Q10.U16
182
183qSp2q1      QN Q11.U16
184qSp0q0p1    QN Q12.U16
185qSp3p2      QN Q13.U16
186dHSp0q1     DN D28.U8
187
188qSq2p1      QN Q11.U16
189qSp0q0q1    QN Q12.U16
190qSq3q2      QN Q13.U16  ;!!
191dHSq0p1     DN D28.U8   ;!!
192
193qTemp1      QN Q11.U16  ;!!;qSp2q1
194qTemp2      QN Q12.U16  ;!!;qSp0q0p1
195
196dP_0t       DN D28.U8   ;!!;dHSp0q1
197dQ_0t       DN D22.U8   ;!!;Temp1
198
;// Filtered-output aliases used by the write-back transposes in the function.
199dP_0n       DN D29.U8
200dP_1n       DN D30.U8
201dP_2n       DN D31.U8
202
203dQ_0n       DN D24.U8   ;!!;Temp2
204dQ_1n       DN D25.U8   ;!!;Temp2
205dQ_2n       DN D28.U8   ;!!;dQ_0t
206
207
208        ;// Function header
;//
;// omxVCM4P10_FilterDeblockingLuma_VerEdge_I
;//
;// In (registers): r0 = pSrcDst, r1 = srcdstStep, r2 = pAlpha, r3 = pBeta
;// In (stack):     ppThresholds, ppBS (4 bytes each)
;// Out:            r0 = OMX_Sts_NoErr
;//
;// Structure: XY starts at LOOP_COUNT and is doubled once per LoopX pass.
;// Carry is set on every 4th doubling (end of a LoopX run) and the result is
;// zero on the 8th (end of LoopY), giving 4 inner passes x 2 outer passes.
;//
;// Each LoopX pass:
;//   - reads one halfword from pBS (post-incremented by 4);
;//   - if it is zero, skips the tile (NoFilterBS0);
;//   - otherwise loads 8 rows x 8 bytes starting at pSrcDst (which was moved
;//     back 4 bytes at entry), transposes them into pixel columns, builds the
;//     filter masks, calls one of the two imported filter routines, transposes
;//     the result back and stores the 8 rows;
;//   - advances pSrcDst by 4 bytes and pThresholds by 4 bytes.
;// The first pass of each LoopY run uses the alpha/beta bytes at pAlpha_0 /
;// pBeta_0; the remaining passes use the bytes at pAlpha_1 / pBeta_1.
;//
;// At ExitLoopY, pBS and pThresholds are both rewound by 14 (net +2 per outer
;// pass) and pSrcDst moves back 16 bytes and down 8 rows. This requires that
;// every inner-loop path advances pThresholds by exactly 4.
;//
;// NOTE(review): the "r11, d15" operands of M_START presumably name the
;// highest core / NEON registers the macro must preserve - confirm in
;// armCOMM_s.h.
209        M_START omxVCM4P10_FilterDeblockingLuma_VerEdge_I, r11, d15
210
211        ;//Arguments on the stack
212        M_ARG   ppThresholds, 4
213        M_ARG   ppBS, 4
214
215        ;// d0-dAlpha_0
216        ;// d2-dBeta_0
217
        ;// Pointers to the second alpha / beta byte
218        ADD         pAlpha_1, pAlpha_0, #1
219        ADD         pBeta_1, pBeta_0, #1
220
        ;// Replicate the first alpha / beta byte into every lane, and move
        ;// pSrcDst back 4 bytes so that each 8-byte row load straddles the edge
221        VLD1        {dAlpha[]}, [pAlpha_0]
222        SUB         pSrcDst, pSrcDst, #4
223        VLD1        {dBeta[]}, [pBeta_0]
224
225        M_LDR       pBS, ppBS
226        M_LDR       pThresholds, ppThresholds
227
        ;// Zero source used to clear one 32-bit half of dFilt when a bS byte is 0
228        MOV         Mask_0,#0
229
230        ;dMask_0-14
231        ;dMask_1-15
232
233        VMOV        dMask_0, #0
234        VMOV        dMask_1, #1
235
236        LDR         XY,=LOOP_COUNT
237
        ;// Row loads/stores alternate between pSrcDst and pTmp, each stepping two rows
238        ADD         pTmpStep, srcdstStep, srcdstStep
239
240        ;// p0-p3 - d4-d7
241        ;// q0-q3 - d8-d11
242LoopY
243LoopX
        ;// bS10 = halfword at pBS; pBS advances 4 bytes per pass
244        LDRH        bS10, [pBS], #4
245
246        CMP         bS10, #0
247        BEQ         NoFilterBS0
248
249        ;// Load 8 rows of data
        ;// pSrcDst walks rows 0,2,4,6 and pTmp rows 1,3,5,7; the first
        ;// transpose steps are interleaved with the loads
250        ADD         pTmp, pSrcDst, srcdstStep
251        VLD1        dRow0, [pSrcDst], pTmpStep
252        VLD1        dRow1, [pTmp], pTmpStep
253        VLD1        dRow2, [pSrcDst], pTmpStep
254        VZIP.8      dRow0, dRow1
255        VLD1        dRow3, [pTmp], pTmpStep
256        VLD1        dRow4, [pSrcDst], pTmpStep
257        VZIP.8      dRow2, dRow3
258        VLD1        dRow5, [pTmp], pTmpStep
259        VLD1        dRow6, [pSrcDst], pTmpStep
260        VLD1        dRow7, [pTmp], pTmpStep
261        VZIP.8      dRow4, dRow5
262        VZIP.16     dRow1, dRow3
263
264
        ;// Row contents as loaded (before any VZIP):
265        ;// dRow0 = [q3r0 q2r0 q1r0 q0r0 p0r0 p1r0 p2r0 p3r0]
266        ;// dRow1 = [q3r1 q2r1 q1r1 q0r1 p0r1 p1r1 p2r1 p3r1]
267        ;// dRow2 = [q3r2 q2r2 q1r2 q0r2 p0r2 p1r2 p2r2 p3r2]
268        ;// dRow3 = [q3r3 q2r3 q1r3 q0r3 p0r3 p1r3 p2r3 p3r3]
269        ;// dRow4 = [q3r4 q2r4 q1r4 q0r4 p0r4 p1r4 p2r4 p3r4]
270        ;// dRow5 = [q3r5 q2r5 q1r5 q0r5 p0r5 p1r5 p2r5 p3r5]
271        ;// dRow6 = [q3r6 q2r6 q1r6 q0r6 p0r6 p1r6 p2r6 p3r6]
272        ;// dRow7 = [q3r7 q2r7 q1r7 q0r7 p0r7 p1r7 p2r7 p3r7]
273
274        ;// 8x8 Transpose
275
276        VZIP.8      dRow6, dRow7
277
        ;// The four loads moved pSrcDst down 8 rows; restore it for the write-back
278        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
279        VZIP.16     dRow0, dRow2
280        VZIP.16     dRow5, dRow7
281
282
283        VZIP.16     dRow4, dRow6
284        VZIP.32     dRow1, dRow5
285        VZIP.32     dRow2, dRow6
286        VZIP.32     dRow3, dRow7
287        VZIP.32     dRow0, dRow4
288
289
290        ;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2
291        ;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3
292
293        ;// dQ_0 = [q0r7 q0r6 q0r5 q0r4 q0r3 q0r2 q0r1 q0r0]
294        ;// dQ_1 = [q1r7 q1r6 q1r5 q1r4 q1r3 q1r2 q1r1 q1r0]
295        ;// dQ_2 = [q2r7 q2r6 q2r5 q2r4 q2r3 q2r2 q2r1 q2r0]
296        ;// dQ_3 = [q3r7 q3r6 q3r5 q3r4 q3r3 q3r2 q3r1 q3r0]
297
298        ;// dP_0 = [p0r7 p0r6 p0r5 p0r4 p0r3 p0r2 p0r1 p0r0]
299        ;// dP_1 = [p1r7 p1r6 p1r5 p1r4 p1r3 p1r2 p1r1 p1r0]
300        ;// dP_2 = [p2r7 p2r6 p2r5 p2r4 p2r3 p2r2 p2r1 p2r0]
301        ;// dP_3 = [p3r7 p3r6 p3r5 p3r4 p3r3 p3r2 p3r1 p3r0]
302
        ;// Filter decision. The TST results are consumed by the conditional
        ;// VMOVEQ / BNE a few instructions later; the NEON arithmetic in
        ;// between does not modify the core condition flags.
303        VABD        dAp0q0, dP_0, dQ_0
304        VABD        dAp1p0, dP_1, dP_0
305
306        VABD        dAq1q0, dQ_1, dQ_0
307        VABD        dAp2p0, dP_2, dP_0
308
309        TST         bS10, #0xff
310        VCGT        dFilt, dAlpha, dAp0q0
311
        ;// dAp1p0 = max(|q1-q0|, |p1-p0|), so one compare against beta covers both
312        VMAX        dAp1p0, dAq1q0, dAp1p0
313        VABD        dAq2q0, dQ_2, dQ_0
314
        ;// Low byte of bS10 is zero: clear the low 32 bits of dFilt (rows 0-3)
315        VMOVEQ.U32  dFilt[0], Mask_0
316        TST         bS10, #0xff00
317
318        VCGT        dAp2p0, dBeta, dAp2p0
319        VCGT        dAp1p0, dBeta, dAp1p0
320
        ;// High byte of bS10 is zero: clear the high 32 bits of dFilt (rows 4-7)
321        VMOVEQ.U32  dFilt[1], Mask_0
322
323        VCGT        dAq2q0, dBeta, dAq2q0
324        VAND        dFilt, dFilt, dAp1p0
325        TST         bS10, #4
326
        ;// dAqflg aliases dAp1p0 (D12) and dApflg aliases dAq2q0 (D17); both
        ;// sources have been consumed by this point, so this order must be kept
327        VAND        dAqflg, dFilt, dAq2q0
328        VAND        dApflg, dFilt, dAp2p0
329
        ;// Bit 2 of bS10 set selects the bS >= 4 routine
330        BNE         bSGE4
331bSLT4
332        ;// bS < 4 Filtering
333
334        BL          armVCM4P10_DeblockingLumabSLT4_unsafe
335
336        ;// Transpose
        ;// Inverse of the load transpose. dP_3, dP_2, dQ_2 and dQ_3 are taken
        ;// from their input registers on this path.
337
338        VZIP.8      dP_3,  dP_2
339        VZIP.8      dP_1n, dP_0n
340        VZIP.8      dQ_0n, dQ_1n
341        VZIP.8      dQ_2,  dQ_3
342
343
344        VZIP.16     dP_3,  dP_1n
345        ADD         pTmp, pSrcDst, srcdstStep
346        VZIP.16     dQ_0n, dQ_2
347        VZIP.16     dQ_1n, dQ_3
348        VZIP.16     dP_2,  dP_0n
349
350        VZIP.32     dP_3,  dQ_0n
351        VZIP.32     dP_1n, dQ_2
352        VZIP.32     dP_2,  dQ_1n
353        VZIP.32     dP_0n, dQ_3
354
355        ;// dRown0 - dP_3,  dRown1 - dQ_0n
356        ;// dRown2 - dP_1n, dRown3 - dQ_2
357        ;// dRown4 - dP_2,  dRown5 - dQ_1n
358        ;// dRown6 - dP_0n, dRown7 - dQ_3
359
360        VST1        dRown0, [pSrcDst], pTmpStep
361        VST1        dRown1, [pTmp], pTmpStep
362        VST1        dRown2, [pSrcDst], pTmpStep
363        VST1        dRown3, [pTmp], pTmpStep
364        ;1
365        VST1        dRown4, [pSrcDst], pTmpStep
366        VST1        dRown5, [pTmp], pTmpStep
        ;// Advance the loop counter; C and Z are tested by BCC below and by BNE at ExitLoopY
367        ADDS        XY, XY, XY
368        VST1        dRown6, [pSrcDst], pTmpStep
        ;// 4 bytes per pass, in step with pBS and with the NoFilterBS0 and
        ;// bSGE4 paths; the fixed #14 rewind at ExitLoopY depends on this
369        ADD         pThresholds, pThresholds, #4
        ;// The post-increment amount is irrelevant here: pTmp is recomputed before its next use
370        VST1        dRown7, [pTmp], srcdstStep
371
        ;// Back to the top row, then 4 bytes right to the next tile; the
        ;// remaining passes of this run use the second alpha / beta byte
372        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
373        VLD1        {dAlpha[]}, [pAlpha_1]
374        ADD         pSrcDst, pSrcDst, #4
375        VLD1        {dBeta[]}, [pBeta_1]
376
377        BCC         LoopX
378        B           ExitLoopY
379
380NoFilterBS0
        ;// Both bS bytes are zero: skip the tile but keep every pointer and
        ;// the loop counter in step with the filtering paths
381        ADD         pSrcDst, pSrcDst, #4
382        ADDS        XY, XY, XY
383        VLD1        {dAlpha[]}, [pAlpha_1]
384        ADD         pThresholds, pThresholds, #4
385        VLD1        {dBeta[]}, [pBeta_1]
386        BCC         LoopX
387        B           ExitLoopY
388bSGE4
389        ;// bS >= 4 Filtering
390
391        BL          armVCM4P10_DeblockingLumabSGE4_unsafe
392
393        ;// Transpose
        ;// Same inverse transpose as the bSLT4 path, but P2 and Q2 are taken
        ;// from dP_2n / dQ_2n instead of the input registers
394
395        VZIP.8      dP_3,  dP_2n
396        VZIP.8      dP_1n, dP_0n
397        VZIP.8      dQ_0n, dQ_1n
398        VZIP.8      dQ_2n, dQ_3
399
400        VZIP.16     dP_3,  dP_1n
401        ADD         pTmp, pSrcDst, srcdstStep
402        VZIP.16     dQ_0n, dQ_2n
403        VZIP.16     dQ_1n, dQ_3
404        VZIP.16     dP_2n, dP_0n
405
406        VZIP.32     dP_3,  dQ_0n
407        VZIP.32     dP_1n, dQ_2n
408        VZIP.32     dP_2n, dQ_1n
409        VZIP.32     dP_0n, dQ_3
410
411        ;// dRow0n - dP_3, dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n
412        ;// dRow4n - dP_2n, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3
413
414        VST1        dRow0n, [pSrcDst], pTmpStep
415        VST1        dRow1n, [pTmp], pTmpStep
416        VST1        dRow2n, [pSrcDst], pTmpStep
417        VST1        dRow3n, [pTmp], pTmpStep
418        VST1        dRow4n, [pSrcDst], pTmpStep
419        VST1        dRow5n, [pTmp], pTmpStep
420        ADDS        XY,XY,XY
421        VST1        dRow6n, [pSrcDst], pTmpStep
422        ADD         pThresholds, pThresholds, #4
423        VST1        dRow7n, [pTmp], pTmpStep
424
425        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
426        VLD1        {dAlpha[]}, [pAlpha_1]
427        ADD         pSrcDst, pSrcDst, #4
428        VLD1        {dBeta[]}, [pBeta_1]
429
430        BCC         LoopX
431
432ExitLoopY
        ;// End of one run of four tiles. pBS and pThresholds each advanced
        ;// 16 bytes; rewinding by 14 leaves them 2 bytes further on. pSrcDst
        ;// returns to the left tile and moves down 8 rows. None of these
        ;// instructions sets flags, so BNE still tests Z from the last ADDS on XY.
433        SUB         pBS, pBS, #14
434        SUB         pThresholds, pThresholds, #14
435        SUB         pSrcDst, pSrcDst, #16
436        VLD1        {dAlpha[]}, [pAlpha_0]
437        ADD         pSrcDst, pSrcDst, srcdstStep, LSL #3
438        VLD1        {dBeta[]}, [pBeta_0]
439        BNE         LoopY
440
441        MOV         r0, #OMX_Sts_NoErr
442
443        M_END
444
445    ENDIF
446
447
448        END
449
450
451