omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s revision 0c1bc742181ded4930842b46e9507372f0b1b963
1;//
2;//
3;// File Name:  omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
4;// OpenMAX DL: v1.0.2
5;// Revision:   9641
6;// Date:       Thursday, February 7, 2008
7;//
8;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
9;//
10;//
11;//
12
13        INCLUDE omxtypes_s.h
14        INCLUDE armCOMM_s.h
15
16        M_VARIANTS ARM1136JS
17
18        IMPORT  armVCM4P10_DeblockingLumabSLT4_unsafe
19        IMPORT  armVCM4P10_DeblockingLumabSGE4_unsafe
20
21
22    IF ARM1136JS
23
;// Packed-byte constants.
;//   MASK_0     - loaded into m00; it is the value SEL picks for byte lanes in
;//                which a filter test failed.
;//   MASK_1     - 0x01 in every byte lane. Loaded into m01 as the initial
;//                per-row filter flag, and used as a multiplier to copy an
;//                alpha/beta byte into all four lanes.
;//   MASK_2     - keeps bytes 3 and 1 of a word; used by every 4x4 transpose.
;//   LOOP_COUNT - loop counter that is doubled (ADDS XY, XY, XY) once per edge:
;//                the carry flag is set on every 4th doubling and the value
;//                becomes zero on the 16th.
24MASK_0      EQU 0x00000000
25MASK_1      EQU 0x01010101
26MASK_2      EQU 0xff00ff00
27LOOP_COUNT  EQU 0x11110000
28
29;// Declare input registers
;// NOTE: every RN name in this file is only an alias, and many names share one
;// physical register (for example pSrcDst, pQ0, tunpk4, apflg and a are all
;// r0). A name is therefore valid only until another alias of the same
;// register is written; the code relies on instruction order and on stack
;// spills to keep values alive.
30
31pSrcDst     RN 0
32srcdstStep  RN 1
33pAlphaArg   RN 2
34pBetaArg    RN 3
35
36pThresholds RN 14
37pBS         RN 9
38pQ0         RN 0
39bS          RN 2
40
41alpha       RN 6
42alpha0      RN 6
43alpha1      RN 8
44
45beta        RN 7
46beta0       RN 7
47beta1       RN 9
48
49;// Declare Local/Temporary variables
50
51;// Pixels
;// One register per pixel column: each holds that column's byte for four
;// consecutive rows (see the p_0 / q_0 layout comments in LoopY and LoopX).
52p_0         RN 3
53p_1         RN 5
54p_2         RN 4
55p_3         RN 2
56q_0         RN 8
57q_1         RN 9
58q_2         RN 10
59q_3         RN 12
60
61;// Unpacking
;// rowN names hold one image row of four pixels; tunpkN names hold the
;// half-transposed intermediates.
62mask        RN 11
63
64row0        RN 2
65row1        RN 4
66row2        RN 5
67row3        RN 3
68
69row4        RN 8
70row5        RN 9
71row6        RN 10
72row7        RN 12
73row8        RN 14
74row9        RN 7
75
76tunpk0      RN 8
77tunpk1      RN 9
78tunpk2      RN 10
79tunpk3      RN 12
80tunpk4      RN 0
81
82tunpk5      RN 1
83tunpk6      RN 14
84tunpk7      RN 2
85tunpk8      RN 5
86tunpk9      RN 6
87
88
89;// Filtering
90
;// The five difference names all map to r12: each difference is consumed by
;// the SEL that follows it before the next one is formed.
91dp0q0       RN 12
92dp1p0       RN 12
93dq1q0       RN 12
94dp2p0       RN 12
95dq2q0       RN 12
96
97ap0q0       RN 1
98filt        RN 2
99
100m00         RN 14
101m01         RN 11
102
103apflg       RN 0
104aqflg       RN 6
105apqflg      RN 0
106
107
108;//Declarations for bSLT4 kernel
109
110tC0         RN 7
111ptC0        RN 1
112
113pQ0a        RN 0
114Stepa       RN 1
115maska       RN 14
116
117P0a         RN 1
118P1a         RN 8
119Q0a         RN 7
120Q1a         RN 11
121
122;//Declarations for bSGE4 kernel
123
124pQ0b        RN 0
125Stepb       RN 1
126maskb       RN 14
127
128P0b         RN 6
129P1b         RN 7
130P2b         RN 1
131P3b         RN 3
132
133Q0b         RN 9
134Q1b         RN 0
135Q2b         RN 2
136Q3b         RN 3
137
138;// Miscellaneous
139XY          RN 8
140t0          RN 3
141t1          RN 12
142t2          RN 14
143t7          RN 7
144t4          RN 4
145t5          RN 1
146t8          RN 6
147a           RN 0
148
149
150
151        ;// Allocate stack memory
        ;// NOTE(review): pXYBS and ppQ0Step are declared with a size of 4 but are
        ;// accessed below with the two-register M_STRD/M_LDRD forms. This appears
        ;// to rely on the slot declared next (ppBS, pStep) supplying the second
        ;// word - confirm against the M_ALLOC* definitions in armCOMM_s.h.
152        M_ALLOC4 ppThresholds,4
153        M_ALLOC4 pQ_3,4
154        M_ALLOC4 pP_3,4
155        M_ALLOC8 pAlphaBeta0,8
156        M_ALLOC8 pAlphaBeta1,8
157        M_ALLOC8 pXYBS,4
158        M_ALLOC4 ppBS,4
159        M_ALLOC8 ppQ0Step,4
160        M_ALLOC4 pStep,4
161
162        ;// Function header
        ;// omxVCM4P10_FilterDeblockingLuma_VerEdge_I
        ;//   In : r0 = pSrcDst, r1 = srcdstStep, r2 = pAlphaArg, r3 = pBetaArg,
        ;//        plus the two word arguments declared with M_ARG below
        ;//        (ppThresholdsArg, ppBSArg).
        ;//   Out: r0 = OMX_Sts_NoErr. The pixels addressed through pSrcDst are
        ;//        read, filtered and written back in place.
        ;//   This routine performs no argument checking.
163        M_START omxVCM4P10_FilterDeblockingLuma_VerEdge_I, r11
164
165        ;//Input arguments on the stack
166        M_ARG   ppThresholdsArg, 4
167        M_ARG   ppBSArg, 4
168
        ;// Read two alpha bytes and two beta bytes and copy each one into all
        ;// four byte lanes of a word (byte * MASK_1), so that a single USUB8 can
        ;// compare four rows against the threshold at once.
169        LDR     t4,=MASK_1
170
171        LDRB    alpha0, [pAlphaArg]
172        LDRB    beta0,  [pBetaArg]
173        LDRB    alpha1, [pAlphaArg,#1]
174        LDRB    beta1,  [pBetaArg,#1]
175
176        MUL     alpha0, alpha0, t4
177        MUL     beta0, beta0, t4
178        MUL     alpha1, alpha1, t4
179        MUL     beta1, beta1, t4
180
        ;// Pair 0 is live in alpha/beta for the first edge and is reloaded at
        ;// ExitLoopY for the first edge of each later group of four rows; pair 1
        ;// is reloaded after every edge for the one that follows.
181        M_STRD  alpha0, beta0, pAlphaBeta0
182        M_STRD  alpha1, beta1, pAlphaBeta1
183
        ;// Spill the loop counter, the bS pointer, the step and the thresholds
        ;// pointer: their registers are reused as scratch inside the loops.
184        LDR     XY,=LOOP_COUNT
185        M_LDR   pBS, ppBSArg
186        M_LDR   pThresholds, ppThresholdsArg
187        M_STR   srcdstStep, pStep
188        M_STRD  XY, pBS, pXYBS
189        M_STR   pThresholds, ppThresholds
190
        ;// Step back four bytes so that pQ0 addresses the four pixels on the left
        ;// of the first edge; LoopX reads the right-hand pixels from pQ0 + 4.
191        SUB     pQ0, pQ0, #4
192LoopY
        ;// Entered once per group of four rows, and again from NoFilterBS0 after
        ;// an edge was skipped. Loads the four pixels left of the edge for four
        ;// rows and transposes them into p_0..p_3, one pixel column per register.
        ;// Byte lists in the comments are written most significant byte first.
193;//---------------Load Pixels-------------------
194
195;//----------------Pack p0-p3-----------------------
196        LDR     mask, =MASK_2
197
        ;// The two post-indexed loads move pQ0 down two rows; rows 2 and 3 are
        ;// read relative to that position and the SUB puts pQ0 back on row 0.
198        M_LDR   row0, [pQ0], srcdstStep
199        M_LDR   row1, [pQ0], srcdstStep
200        LDR     row2, [pQ0]
201        LDR     row3, [pQ0, srcdstStep]
202        SUB     pQ0, pQ0, srcdstStep, LSL #1
203
204        ;// row0 = [r0p0 r0p1 r0p2 r0p3]
205        ;// row1 = [r1p0 r1p1 r1p2 r1p3]
206        ;// row2 = [r2p0 r2p1 r2p2 r2p3]
207        ;// row3 = [r3p0 r3p1 r3p2 r3p3]
208
        ;// Stage 1: interleave row pairs. AND with MASK_2 keeps bytes 3 and 1 of
        ;// the first row (or of the row shifted left by 8, i.e. its bytes 2 and
        ;// 0); UXTAB16 adds the matching bytes of the second row into the empty
        ;// low byte of each halfword.
209        AND     tunpk0, mask, row0
210        AND     tunpk6, mask, row0, LSL#8
211        UXTAB16 tunpk0, tunpk0, row1, ROR#8
212        UXTAB16 tunpk6, tunpk6, row1
213        AND     tunpk2, mask, row2
214        AND     tunpk3, mask, row2, LSL#8
215        UXTAB16 tunpk2, tunpk2, row3, ROR#8
216        UXTAB16 tunpk3, tunpk3, row3
217
218        ;// tunpk0 = [r0p0 r1p0 r0p2 r1p2]
219        ;// tunpk6 = [r0p1 r1p1 r0p3 r1p3]
220        ;// tunpk2 = [r2p0 r3p0 r2p2 r3p2]
221        ;// tunpk3 = [r2p1 r3p1 r2p3 r3p3]
222
        ;// Stage 2: combine halfwords of the row-0/1 and row-2/3 intermediates.
223        PKHTB   p_0, tunpk0, tunpk2, ASR#16
224        PKHTB   p_1, tunpk6, tunpk3, ASR#16
225        PKHBT   p_2, tunpk2, tunpk0, LSL#16
226        PKHBT   p_3, tunpk3, tunpk6, LSL#16
227
228
229        ;// p_0 = [r0p0 r1p0 r2p0 r3p0]
230        ;// p_1 = [r0p1 r1p1 r2p1 r3p1]
231        ;// p_2 = [r0p2 r1p2 r2p2 r3p2]
232        ;// p_3 = [r0p3 r1p3 r2p3 r3p3]
233
        ;// p_3 lives in r2, which LoopX immediately reuses for bS, so it is kept
        ;// in a stack slot instead.
234        M_STR   p_3, pP_3
235
236;//----------------Pack q0-q3-----------------------
237LoopX
        ;// One pass per vertical edge. On entry p_0, p_1, p_2 and the pP_3 stack
        ;// slot hold the transposed pixels on the left of the edge. This block
        ;// loads and transposes the four pixels on the right (q_0..q_3) and builds
        ;// the per-row filter flags.
        ;// Byte lists in the comments are written most significant byte first.
        ;// bS for this edge; pBS moves on by 4 bytes.
238        LDRB    bS, [pBS], #4
        ;// r0 is reused below (tunpk4, a, apflg), so pQ0 is kept on the stack.
239        M_STR   pQ0, ppQ0Step
240        LDR     mask, =MASK_2
241        CMP     bS, #0
242        M_STR   pBS, ppBS
243
        ;// Writeback form: pQ0 now addresses the pixels right of the edge. The
        ;// load is issued before the branch, so NoFilterBS0 is entered with pQ0
        ;// already advanced by 4.
244        LDR     row4, [pQ0, #4]!
245        BEQ.W   NoFilterBS0
246        M_LDR   row5, [pQ0, srcdstStep]!
247        M_LDR   row6, [pQ0, srcdstStep]!
248        M_LDR   row7, [pQ0, srcdstStep]
249
250        ;// row4 = [r0q3 r0q2 r0q1 r0q0]
251        ;// row5 = [r1q3 r1q2 r1q1 r1q0]
252        ;// row6 = [r2q3 r2q2 r2q1 r2q0]
253        ;// row7 = [r3q3 r3q2 r3q1 r3q0]
254
        ;// Same two-stage transpose as for the p pixels. tunpk4 and tunpk5 are r0
        ;// and r1, so pQ0 and srcdstStep are lost from here on and are reloaded
        ;// from ppQ0Step later.
255        AND     tunpk4, mask, row4
        ;// The result of this compare is consumed by the BLT at the end of the
        ;// block: no instruction in between is a flag-setting (S) form, and
        ;// USUB8 writes only the GE flags.
256        CMP     bS, #4
257        AND     tunpk5, mask, row4, LSL#8
258        UXTAB16 tunpk4, tunpk4, row5, ROR#8
259        UXTAB16 tunpk5, tunpk5, row5
260        AND     tunpk6, mask, row6
261        AND     tunpk7, mask, row6, LSL#8
262        UXTAB16 tunpk6, tunpk6, row7, ROR#8
263        UXTAB16 tunpk7, tunpk7, row7
264
265        ;// tunpk4 = [r0q3 r1q3 r0q1 r1q1]
266        ;// tunpk5 = [r0q2 r1q2 r0q0 r1q0]
267        ;// tunpk6 = [r2q3 r3q3 r2q1 r3q1]
268        ;// tunpk7 = [r2q2 r3q2 r2q0 r3q0]
269
270        PKHTB   q_3, tunpk4, tunpk6, ASR#16
271        PKHTB   q_2, tunpk5, tunpk7, ASR#16
272        PKHBT   q_1, tunpk6, tunpk4, LSL#16
        ;// q_3 is r12, which the filter decision reuses for the dXXX differences,
        ;// so it is saved to the stack.
273        M_STR   q_3, pQ_3
274        PKHBT   q_0, tunpk7, tunpk5, LSL#16
275
276
277        ;// q_0 = [r0q0 r1q0 r2q0 r3q0]
278        ;// q_1 = [r0q1 r1q1 r2q1 r3q1]
279        ;// q_2 = [r0q2 r1q2 r2q2 r3q2]
280        ;// q_3 = [r0q3 r1q3 r2q3 r3q3]
281
282
283;//--------------Filtering Decision -------------------
        ;// Every test below works on four rows at once. |x - y| is formed per
        ;// byte from two USUB8 and a SEL (the GE flags of the second USUB8 select
        ;// the non-negative difference). The threshold is then subtracted: GE is
        ;// set in lanes where the difference is >= threshold, and the following
        ;// SEL writes 0x00 into those lanes of the flag word.
284        LDR     m01, =MASK_1                ;//  01010101 mask
285        MOV     m00, #MASK_0                ;//  00000000 mask
286
287        ;// Check |p0-q0|<Alpha
288        USUB8   dp0q0, p_0, q_0
289        USUB8   a, q_0, p_0
290        SEL     ap0q0, a, dp0q0
291        USUB8   a, ap0q0, alpha
292        SEL     filt, m00, m01
293
294        ;// Check |p1-p0|<Beta
295        USUB8   dp1p0, p_1, p_0
296        USUB8   a, p_0, p_1
297        SEL     a, a, dp1p0
298        USUB8   a, a, beta
299        SEL     filt, m00, filt
300
301        ;// Check |q1-q0|<Beta
302        USUB8   dq1q0, q_1, q_0
303        USUB8   a, q_0, q_1
304        SEL     a, a, dq1q0
305        USUB8   a, a, beta
306        SEL     filt, m00, filt
307
        ;// filt now holds 0x01 in the lanes of rows that passed all three tests.
308        ;// Check ap<Beta
309        USUB8   dp2p0, p_2, p_0
310        USUB8   a, p_0, p_2
311        SEL     a, a, dp2p0
312        USUB8   a, a, beta
313        SEL     apflg, m00, filt            ;// apflg = filt && (ap<beta)
314
315        ;// Check aq<Beta
        ;// Only the GE flags of the last USUB8 are needed; they are consumed by
        ;// the SEL that opens bSGE4 / bSLT4.
316        USUB8   dq2q0, q_2, q_0
317        USUB8   t2, q_0, q_2
318        SEL     t2, t2, dq2q0
319        USUB8   t2, t2, beta
        ;// t7 is r7 (beta), which is not needed again for this edge; it is zeroed
        ;// to act as the "test failed" operand of that SEL.
320        MOV     t7,#0
321
322
        ;// Uses the flags of CMP bS, #4 above.
323        BLT     bSLT4
324;//-------------------Filter--------------------
325bSGE4
326        ;//---------bSGE4 Execution---------------
        ;// Reached by fall-through when bS >= 4. The GE flags still come from the
        ;// aq test, so t1 = filt in lanes where aq < beta and 0 elsewhere.
327        SEL     t1, t7, filt            ;// aqflg = filt && (aq<beta)
328        CMP     filt, #0
        ;// apqflg = apflg | (aqflg << 1). ORR without S keeps the CMP result.
329        ORR     apqflg, apflg, t1, LSL #1
        ;// If no row passed the tests, restore pQ0 / srcdstStep and skip the edge.
330        M_LDRD  pQ0, srcdstStep, ppQ0Step, EQ
331        BEQ     NoFilterFilt0
332
        ;// NOTE(review): the kernel's register interface is not visible in this
        ;// file. The code below takes the filtered columns from P0b, P1b, P2b,
        ;// Q0b, Q1b and Q2b - confirm against the kernel source. BL overwrites
        ;// r14, hence the mask reload that follows.
333        BL      armVCM4P10_DeblockingLumabSGE4_unsafe
334
335        ;//---------Store result---------------
336
337        LDR     maskb,=MASK_2
338
339        ;// P0b = [r0p0 r1p0 r2p0 r3p0]
340        ;// P1b = [r0p1 r1p1 r2p1 r3p1]
341        ;// P2b = [r0p2 r1p2 r2p2 r3p2]
342        ;// P3b = [r0p3 r1p3 r2p3 r3p3]
343
        ;// P3b is the p3 column saved by LoopY or by the previous edge. The slot
        ;// is then reused for Q0b, which is the p3 column of the next edge.
344        M_LDR   P3b, pP_3
345        M_STR   Q0b, pP_3
346
347        ;//------Pack p0-p3------
        ;// Transpose back from one column per register to one row per register.
348        AND     tunpk0, maskb, P0b
349        AND     tunpk2, maskb, P0b, LSL#8
350        UXTAB16 tunpk0, tunpk0, P1b, ROR#8
351        UXTAB16 tunpk2, tunpk2, P1b
352
353        AND     tunpk3, maskb, P2b
354        AND     tunpk8, maskb, P2b, LSL#8
355        UXTAB16 tunpk3, tunpk3, P3b, ROR#8
356        UXTAB16 tunpk8, tunpk8, P3b
357
358        ;// tunpk0 = [r0p0 r0p1 r2p0 r2p1]
359        ;// tunpk2 = [r1p0 r1p1 r3p0 r3p1]
360        ;// tunpk3 = [r0p2 r0p3 r2p2 r2p3]
361        ;// tunpk8 = [r1p2 r1p3 r3p2 r3p3]
362
        ;// Q1b is r0, which the pointer reload below overwrites. r4 keeps it for
        ;// the q transpose and is also p_2 of the next edge.
363        MOV     p_2, Q1b
364        M_LDRD  pQ0b, Stepb, ppQ0Step
365
366        PKHTB   row9, tunpk0, tunpk3, ASR#16
367        PKHBT   row7, tunpk3, tunpk0, LSL#16
368        PKHTB   row3, tunpk2, tunpk8, ASR#16
369        PKHBT   row6, tunpk8, tunpk2, LSL#16
370
371        ;// row9 = [r0p0 r0p1 r0p2 r0p3]
372        ;// row3 = [r1p0 r1p1 r1p2 r1p3]
373        ;// row7 = [r2p0 r2p1 r2p2 r2p3]
374        ;// row6 = [r3p0 r3p1 r3p2 r3p3]
375
        ;// The rows are written out of order so that the pointer finishes on
        ;// row 1 of the q side:
        ;//   row 0 at pQ0b, then pQ0b += Stepb
        ;//   row 2 at pQ0b + Stepb, row 3 at pQ0b + 2*Stepb
        ;//   row 1 at pQ0b, then pQ0b += 4
376        M_STR   row9, [pQ0b], Stepb
377        STR     row7, [pQ0b, Stepb]
378        STR     row6, [pQ0b, Stepb, LSL #1]
379        STR     row3, [pQ0b], #4
380
        ;// q3 column saved by LoopX. Q3b is r3, which is also p_0, so this load
        ;// also puts the next edge's p_0 in place.
381        M_LDR   Q3b, pQ_3
382
383        ;// Q0b = [r0q0 r1q0 r2q0 r3q0]
384        ;// Q1b = [r0q1 r1q1 r2q1 r3q1]
385        ;// Q2b = [r0q2 r1q2 r2q2 r3q2]
386        ;// Q3b = [r0q3 r1q3 r2q3 r3q3]
387
388        ;//------Pack q0-q3------
        ;// p_2 still holds the Q1b column copied above.
389        AND     tunpk0, maskb, p_2
390        AND     tunpk2, maskb, p_2, LSL#8
391        UXTAB16 tunpk0, tunpk0, Q0b, ROR#8
392        UXTAB16 tunpk2, tunpk2, Q0b
393
394        AND     tunpk3, maskb, Q3b
395        AND     tunpk8, maskb, Q3b, LSL#8
396        UXTAB16 tunpk3, tunpk3, Q2b, ROR#8
397        UXTAB16 tunpk8, tunpk8, Q2b
398
399        ;// tunpk0 = [r0q1 r0q0 r2q1 r2q0]
400        ;// tunpk2 = [r1q1 r1q0 r3q1 r3q0]
401        ;// tunpk3 = [r0q3 r0q2 r2q3 r2q2]
402        ;// tunpk8 = [r1q3 r1q2 r3q3 r3q2]
403
404        PKHTB   row8, tunpk3, tunpk0, ASR#16
405        PKHBT   row7, tunpk0, tunpk3, LSL#16
406        PKHTB   row4, tunpk8, tunpk2, ASR#16
407        PKHBT   row6, tunpk2, tunpk8, LSL#16
408
409        ;// row8 = [r0q3 r0q2 r0q1 r0q0]
410        ;// row4 = [r1q3 r1q2 r1q1 r1q0]
411        ;// row7 = [r2q3 r2q2 r2q1 r2q0]
412        ;// row6 = [r3q3 r3q2 r3q1 r3q0]
413
        ;// pQ0b is on row 1 of the q side: write rows 1, 2 and 3 from there.
414        STR     row4, [pQ0b]
415        STR     row7, [pQ0b, Stepb]
416        STR     row6, [pQ0b, Stepb, LSL #1]
417
        ;// Back up one row: pQ0 is now 4 bytes right of where this edge started,
        ;// which is the position LoopX expects for the next edge.
418        SUB     pQ0, pQ0b, Stepb
        ;// Q2b becomes p_1 of the next edge.
419        MOV     p_1, Q2b
420
421        STR     row8, [pQ0]
422
        ;// Reload the spilled loop state and the alpha/beta pair used for every
        ;// edge other than the first of a row group.
423        M_LDRD  XY, pBS, pXYBS
424        M_LDR   pThresholds, ppThresholds
425        M_LDRD  alpha, beta, pAlphaBeta1
426
        ;// Carry is set on every fourth doubling of XY, i.e. after the fourth edge
        ;// of this row group.
427        ADDS    XY, XY, XY
        ;// This path reads no threshold byte, so the pointer is advanced here.
428        ADD     pThresholds, #4
429        M_STR   pThresholds, ppThresholds
430        M_STR   XY, pXYBS
431        BCC     LoopX
432        B       ExitLoopY
433
434;//---------- Exit of LoopX --------------
435;//---- for the case of no filtering -----
436
437NoFilterFilt0
        ;// Reached from bSGE4 / bSLT4 when filt == 0. The conditional M_LDRD at
        ;// the branch site has restored pQ0 and srcdstStep from ppQ0Step, so pQ0
        ;// is moved to the next edge here.
438        ADD     pQ0, pQ0, #4
439NoFilterBS0
        ;// Also reached directly from LoopX when bS == 0; in that case pQ0 was
        ;// already advanced by the writeback load of row4.
440        ;// Load counter for LoopX
441        M_LDRD  XY, pBS, pXYBS
442        M_LDR   pThresholds, ppThresholds
443        M_LDRD  alpha, beta, pAlphaBeta1
444
445        ;// Align the pointer
446        ADDS    XY, XY, XY
447        ADD     pThresholds, pThresholds, #4
448        M_STR   pThresholds, ppThresholds
449        M_STR   XY, pXYBS
        ;// This path does not set up p_0..p_3 for the next edge, so it re-enters
        ;// at LoopY, which reloads and transposes them from memory at the new
        ;// pQ0, rather than at LoopX.
450        BCC     LoopY
451        B       ExitLoopY
452
453bSLT4
454        ;//---------bSLT4 Execution---------------
        ;// Reached when bS is non-zero and below 4. The GE flags still come from
        ;// the aq test in LoopX.
455        SEL     aqflg, t7, filt            ;// aqflg = filt && (aq<beta)
456        M_LDR   ptC0, ppThresholds
457        CMP     filt, #0
        ;// If no row passed the tests, restore pQ0 / srcdstStep (this also
        ;// discards ptC0, which shares r1) and skip the edge.
458        M_LDRD  pQ0, srcdstStep, ppQ0Step, EQ
459        BEQ     NoFilterFilt0
460
        ;// Threshold byte for this edge. The advanced pointer is written back, so
        ;// this path needs no separate ADD on pThresholds.
461        LDRB    tC0, [ptC0], #4
462        M_STR   ptC0, ppThresholds
463
        ;// NOTE(review): the kernel's register interface is not visible in this
        ;// file. The code below takes the filtered columns from P0a, P1a, Q0a
        ;// and Q1a and assumes q_2 (r10) is left intact - confirm against the
        ;// kernel source. BL overwrites r14, hence the mask reload that follows.
464        BL      armVCM4P10_DeblockingLumabSLT4_unsafe
465
466        ;//---------Store result---------------
467        ;//--------Pack p1,p0,q1,q0------------
468
469        ;// Reload the transpose mask and start the hand-over to the next edge
        ;// (the destination pointer itself is reloaded by the M_LDRD further
        ;// down): Q0a becomes the next p3 (stack slot) and q_2 becomes p_1. q_2
        ;// is copied now because r10 is overwritten as tunpk2 just below.
470        LDR     maska,=MASK_2
471        M_STR   Q0a, pP_3
472        MOV     p_1, q_2
473
474        ;// P1a = [r0p1 r1p1 r2p1 r3p1]
475        ;// P0a = [r0p0 r1p0 r2p0 r3p0]
476        ;// Q0a = [r0q0 r1q0 r2q0 r3q0]
477        ;// Q1a = [r0q1 r1q1 r2q1 r3q1]
478
        ;// Only p1, p0, q0 and q1 are written back on this path, so a single
        ;// interleave stage is enough: each halfword below is one row's pair.
479        AND     tunpk1, maska, P0a
480        AND     tunpk2, maska, P0a, LSL#8
481        UXTAB16 tunpk1, tunpk1, P1a, ROR#8
482        UXTAB16 tunpk2, tunpk2, P1a
483
        ;// P0a (r1) has been consumed; r0 / r1 become the pointer and step again.
484        M_LDRD  pQ0a, Stepa, ppQ0Step
485
486        AND     tunpk9, maska, Q1a
487        AND     tunpk3, maska, Q1a, LSL#8
488        UXTAB16 tunpk9, tunpk9, Q0a, ROR#8
489        UXTAB16 tunpk3, tunpk3, Q0a
490
491        ;// tunpk1 = [r0p0 r0p1 r2p0 r2p1]
492        ;// tunpk2 = [r1p0 r1p1 r3p0 r3p1]
493        ;// tunpk9 = [r0q1 r0q0 r2q1 r2q0]
494        ;// tunpk3 = [r1q1 r1q0 r3q1 r3q0]
495
        ;// STRH writes the low halfword, so the row 0 / row 1 pairs held in the
        ;// high halfwords are shifted down first. t4 (r4) and t0 (r3) are free
        ;// here: p_2 and p_0 are set up again after the stores.
496        MOV     t4, tunpk1, LSR #16
497        MOV     t0, tunpk9, LSR #16
498
        ;// Pointer walk (base = start of this edge's p3 byte on row 0):
        ;//   base+2 (row 0 p1/p0), then +Stepa (row 1), then +Stepa (row 2),
        ;//   then +2 (row 2 q0/q1). Row 3 is written at +Stepa without
        ;//   writeback.
499        STRH    t4,[pQ0a, #2]!          ;//Stores [r0p0 r0p1]
500        STRH    t0,[pQ0a, #2]           ;//Stores [r0q1 r0q0]
501
502        MOV     t4, tunpk2, LSR #16
503        MOV     t0, tunpk3, LSR #16
504
505        M_STRH  t4,[pQ0a, Stepa]!       ;//Stores [r1p0 r1p1]
506        STRH    t0,[pQ0a, #2]           ;//Stores [r1q1 r1q0]
507
508        M_STRH  tunpk1,[pQ0a, Stepa]!   ;//Stores [r2p0 r2p1]
509        STRH    tunpk2,[pQ0a, Stepa]    ;//Stores [r3p0 r3p1]
510        STRH    tunpk9,[pQ0a, #2]!        ;//Stores [r2q1 r2q0]
511        STRH    tunpk3,[pQ0a, Stepa]    ;//Stores [r3q1 r3q0]
512
        ;// pQ0a is at base + 4 + 2*Stepa; go back two rows so pQ0 = base + 4,
        ;// the position LoopX expects for the next edge.
513        SUB     pQ0, pQ0a, Stepa, LSL #1
514
515        ;// Load counter
516        M_LDRD  XY, pBS, pXYBS
517
518        ;// Reload Pixels
        ;// Finish the hand-over: the saved q3 column becomes p_0 and Q1a becomes
        ;// p_2.
519        M_LDR   p_0, pQ_3
520        MOV     p_2, Q1a
521
522        M_LDRD  alpha, beta, pAlphaBeta1
523
        ;// Carry is set on every fourth doubling of XY, i.e. after the fourth edge
        ;// of this row group; otherwise continue with the next edge.
524        ADDS    XY, XY, XY
525        M_STR   XY, pXYBS
526        BCC     LoopX
527
528;//-------- Common Exit of LoopY -----------------
529        ;// Align the pointers
        ;// Only the fall-through from bSLT4 executes this load; the branches to
        ;// ExitLoopY arrive with pThresholds already reloaded into its register.
530        M_LDR   pThresholds, ppThresholds
531ExitLoopY
        ;// Four edges have been processed, so pQ0 is 16 bytes to the right of
        ;// where this group of rows started. Move back and then down four rows.
532        SUB     pQ0, pQ0, #16
533        ADD     pQ0, pQ0, srcdstStep, LSL #2
        ;// pBS and pThresholds were each advanced by 4 for each of the four
        ;// edges; subtracting 15 leaves them one byte past their starting point
        ;// for this group.
534        SUB     pBS, pBS, #15
535        SUB     pThresholds, pThresholds, #15
536        M_STR   pThresholds, ppThresholds
537
        ;// The first edge of the next group uses alpha/beta pair 0 again.
538        M_LDRD  alpha, beta, pAlphaBeta0
539
        ;// Z still reflects the last ADDS XY, XY, XY: none of the instructions
        ;// since then is a flag-setting form (the M_* memory macros are assumed
        ;// not to alter the flags). XY becomes zero after 16 doublings, which
        ;// ends the outer loop.
540        BNE     LoopY
541        MOV     r0, #OMX_Sts_NoErr
542
543        M_END
544;//-----------------End Filter--------------------
545
546    ENDIF
547
548        END
549
550