omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   9641
21;// Date:       Thursday, February 7, 2008
22;//
23;//
24;//
25;//
26
27        INCLUDE omxtypes_s.h
28        INCLUDE armCOMM_s.h
29
30        M_VARIANTS ARM1136JS
31
32        IMPORT  armVCM4P10_DeblockingLumabSLT4_unsafe
33        IMPORT  armVCM4P10_DeblockingLumabSGE4_unsafe
34
35
36    IF ARM1136JS
37
;// Constants used by omxVCM4P10_FilterDeblockingLuma_VerEdge_I.
;//
;// MASK_0     - all-zero word; the "flag cleared" operand of SEL in the
;//              filtering decision.
;// MASK_1     - 0x01 in every byte lane. Multiplying a byte value (0..255)
;//              by it replicates that value into all four lanes; it is also
;//              the initial per-lane value of the filter flag word.
;// MASK_2     - keeps bytes 3 and 1 of a word; used together with UXTAB16
;//              in the 4x4 byte transposes.
;// LOOP_COUNT - combined counter for both loops. It is doubled
;//              (ADDS XY, XY, XY) once per processed edge: the carry flag
;//              is set on every 4th doubling (end of the inner loop) and
;//              the value becomes zero on the 16th (end of the outer loop).
38MASK_0      EQU 0x00000000
39MASK_1      EQU 0x01010101
40MASK_2      EQU 0xff00ff00
41LOOP_COUNT  EQU 0x11110000
42
;// Register aliases (RN) for omxVCM4P10_FilterDeblockingLuma_VerEdge_I.
;//
;// NOTE: many aliases deliberately name the SAME physical register; which
;// alias is "live" depends on the phase of the routine. Writing through one
;// alias destroys every other alias of that register. Examples that matter
;// in the code below:
;//   r0  : pSrcDst / pQ0 / tunpk4 / a / apflg / apqflg / Q1b / pQ0a / pQ0b
;//   r1  : srcdstStep / tunpk5 / ap0q0 / ptC0 / P0a / P2b / Stepa / Stepb
;//   r7  : beta / beta0 / t7 / tC0 / Q0a / P1b / row9
;//   r9  : pBS / beta1 / q_1 / row5 / tunpk1 / Q0b
;//   r14 : pThresholds / tunpk6 / m00 / t2 / row8 / maska / maskb
;// This is why the routine spills pQ0, srcdstStep, pBS, pThresholds, XY and
;// the alpha/beta words to the stack and reloads them.
43;// Declare input registers
44
;// First four arguments arrive in r0-r3.
45pSrcDst     RN 0
46srcdstStep  RN 1
47pAlphaArg   RN 2
48pBetaArg    RN 3
49
;// Working copies of the remaining arguments and the current pixel pointer.
;// pQ0 is the same register as pSrcDst (the argument is used directly as the
;// working pointer; pSrcDst itself is not referenced by name in the code).
50pThresholds RN 14
51pBS         RN 9
52pQ0         RN 0
53bS          RN 2
54
;// alpha/beta: values used by the filter decision (byte replicated in all
;// four lanes). alpha0/beta0 and alpha1/beta1 are the two pairs built at
;// function entry from bytes 0 and 1 of the alpha/beta arrays.
55alpha       RN 6
56alpha0      RN 6
57alpha1      RN 8
58
59beta        RN 7
60beta0       RN 7
61beta1       RN 9
62
63;// Declare Local/Temporary variables
64
65;// Pixels
;// Each word holds one pixel column (p3..p0 left of the edge, q0..q3 right
;// of it) for four consecutive rows, one row per byte lane.
66p_0         RN 3
67p_1         RN 5
68p_2         RN 4
69p_3         RN 2
70q_0         RN 8
71q_1         RN 9
72q_2         RN 10
73q_3         RN 12
74
75;// Unpacking
76mask        RN 11
77
;// rowN: one word of four horizontally adjacent pixels as loaded from /
;// stored to memory.
78row0        RN 2
79row1        RN 4
80row2        RN 5
81row3        RN 3
82
83row4        RN 8
84row5        RN 9
85row6        RN 10
86row7        RN 12
87row8        RN 14
88row9        RN 7
89
;// tunpkN: intermediate words of the 4x4 byte transposes.
90tunpk0      RN 8
91tunpk1      RN 9
92tunpk2      RN 10
93tunpk3      RN 12
94tunpk4      RN 0
95
96tunpk5      RN 1
97tunpk6      RN 14
98tunpk7      RN 2
99tunpk8      RN 5
100tunpk9      RN 6
101
102
103;// Filtering
104
;// All pixel-difference temporaries share r12.
105dp0q0       RN 12
106dp1p0       RN 12
107dq1q0       RN 12
108dp2p0       RN 12
109dq2q0       RN 12
110
111ap0q0       RN 1
112filt        RN 2
113
;// m00 / m01 hold MASK_0 / MASK_1 during the filter decision.
114m00         RN 14
115m01         RN 11
116
117apflg       RN 0
118aqflg       RN 6
119apqflg      RN 0
120
121
122;//Declarations for bSLT4 kernel
;// Names used around the call to armVCM4P10_DeblockingLumabSLT4_unsafe and
;// for storing its results.
123
124tC0         RN 7
125ptC0        RN 1
126
127pQ0a        RN 0
128Stepa       RN 1
129maska       RN 14
130
131P0a         RN 1
132P1a         RN 8
133Q0a         RN 7
134Q1a         RN 11
135
136;//Declarations for bSGE4 kernel
;// Names used around the call to armVCM4P10_DeblockingLumabSGE4_unsafe and
;// for storing its results.
137
138pQ0b        RN 0
139Stepb       RN 1
140maskb       RN 14
141
142P0b         RN 6
143P1b         RN 7
144P2b         RN 1
145P3b         RN 3
146
147Q0b         RN 9
148Q1b         RN 0
149Q2b         RN 2
150Q3b         RN 3
151
152;// Miscellaneous
;// XY is the combined loop counter (see LOOP_COUNT). t5 and t8 appear to be
;// unused by the code in this file.
153XY          RN 8
154t0          RN 3
155t1          RN 12
156t2          RN 14
157t7          RN 7
158t4          RN 4
159t5          RN 1
160t8          RN 6
161a           RN 0
162
163
164
165        ;// Allocate stack memory
        ;//
        ;// Local spill slots for the function declared by the following
        ;// M_START. M_ALLOC4 / M_ALLOC8 are macros from armCOMM_s.h;
        ;// presumably the digit is the slot alignment and the second
        ;// operand the size in bytes (confirm against armCOMM_s.h).
        ;//
        ;// NOTE(review): the order of these declarations matters. The code
        ;// accesses two pairs of slots as one doubleword via M_STRD/M_LDRD
        ;// while also updating the second word through its own name:
        ;//   pXYBS    + ppBS  : M_LDRD XY, pBS, pXYBS   / M_STR pBS, ppBS
        ;//   ppQ0Step + pStep : M_LDRD pQ0, srcdstStep, ppQ0Step
        ;//                      / M_STR srcdstStep, pStep
        ;// so each 4-byte M_ALLOC8 slot appears to rely on the following
        ;// M_ALLOC4 slot being laid out immediately after it.
        ;//
        ;// ppThresholds : current pThresholds pointer
        ;// pQ_3         : packed q3 column of the current edge
        ;// pP_3         : packed p3 column for the current/next edge
        ;// pAlphaBeta0  : replicated alpha[0], beta[0]
        ;// pAlphaBeta1  : replicated alpha[1], beta[1]
        ;// pXYBS, ppBS  : loop counter XY and current pBS pointer
        ;// ppQ0Step, pStep : current pixel pointer and srcdstStep
166        M_ALLOC4 ppThresholds,4
167        M_ALLOC4 pQ_3,4
168        M_ALLOC4 pP_3,4
169        M_ALLOC8 pAlphaBeta0,8
170        M_ALLOC8 pAlphaBeta1,8
171        M_ALLOC8 pXYBS,4
172        M_ALLOC4 ppBS,4
173        M_ALLOC8 ppQ0Step,4
174        M_ALLOC4 pStep,4
175
176        ;// Function header
;//----------------------------------------------------------------------
;// omxVCM4P10_FilterDeblockingLuma_VerEdge_I
;//
;// Filters luma pixels in place across four vertical edges of a block
;// that is 16 rows high. The edges are 4 bytes apart, starting at pSrcDst;
;// for each edge the 4 pixels to its left (p3..p0) and the 4 pixels to
;// its right (q0..q3) are read.
;//
;// Arguments (per the RN / M_ARG declarations):
;//   r0      pSrcDst      pointer to the first q0 pixel of the first edge
;//   r1      srcdstStep   distance in bytes between pixel rows
;//   r2      pAlpha       two alpha bytes: [0] for the first edge,
;//                        [1] for the remaining three edges
;//   r3      pBeta        two beta bytes, same indexing as pAlpha
;//   stack   pThresholds  one byte per (edge, strip); read only when bS < 4
;//   stack   pBS          one boundary-strength byte per (edge, strip)
;// Returns r0 = OMX_Sts_NoErr. No argument validation is done here.
;//
;// Structure:
;//   LoopY  runs once per strip of 4 rows (4 strips). It is also re-entered
;//          after an unfiltered edge, to reload the p columns from memory.
;//   LoopX  runs once per edge inside a strip (4 edges).
;//   For strip s and edge e the code reads pBS[4*e + s] and
;//   pThresholds[4*e + s]: the pointers advance by 4 per edge and are
;//   moved back by 15 at the end of each strip.
;//
;// Each edge is handled on four rows at once: a 4x4 byte transpose packs
;// every pixel column (p3..q3) into one word, one row per byte lane, and
;// the ARMv6 SIMD instructions USUB8/SEL evaluate the filter conditions
;// per lane. The actual filtering is delegated to
;// armVCM4P10_DeblockingLumabSLT4_unsafe (bS < 4) or
;// armVCM4P10_DeblockingLumabSGE4_unsafe (bS >= 4); their register
;// interfaces are defined in their own files (not visible here).
;//
;// Lane notation in the comments: [a b c d] lists a word from its most
;// significant byte (a) to its least significant byte (d).
;//
;// NOTE(review): the second M_START operand (r11) presumably tells the
;// macro which callee-saved registers to preserve, and the macro must also
;// preserve lr, since r14 is used as scratch and BL is used - confirm
;// against armCOMM_s.h.
;//----------------------------------------------------------------------
177        M_START omxVCM4P10_FilterDeblockingLuma_VerEdge_I, r11
178
179        ;//Input arguments on the stack
180        M_ARG   ppThresholdsArg, 4
181        M_ARG   ppBSArg, 4
182
        ;// Replicate alpha[0], beta[0], alpha[1], beta[1] into all four
        ;// byte lanes (byte * 0x01010101) so they can be compared against
        ;// four pixels at once with USUB8.
183        LDR     t4,=MASK_1
184
185        LDRB    alpha0, [pAlphaArg]
186        LDRB    beta0,  [pBetaArg]
187        LDRB    alpha1, [pAlphaArg,#1]
188        LDRB    beta1,  [pBetaArg,#1]
189
190        MUL     alpha0, alpha0, t4
191        MUL     beta0, beta0, t4
192        MUL     alpha1, alpha1, t4
193        MUL     beta1, beta1, t4
194
        ;// Spill both pairs. alpha0/beta0 stay live in r6/r7 for the first
        ;// edge; alpha1/beta1 share r8/r9 with XY/pBS, which are loaded next.
195        M_STRD  alpha0, beta0, pAlphaBeta0
196        M_STRD  alpha1, beta1, pAlphaBeta1
197
        ;// Initialise the loop counter and spill the stack arguments and
        ;// the step, because their registers are reused as scratch below.
198        LDR     XY,=LOOP_COUNT
199        M_LDR   pBS, ppBSArg
200        M_LDR   pThresholds, ppThresholdsArg
201        M_STR   srcdstStep, pStep
202        M_STRD  XY, pBS, pXYBS
203        M_STR   pThresholds, ppThresholds
204
        ;// Point at the four pixels to the left of the first edge (p3..p0).
205        SUB     pQ0, pQ0, #4
        ;// LoopY: (re)load and transpose the p side for four rows. Entered
        ;// at the start of every strip and again after an unfiltered edge.
206LoopY
207;//---------------Load Pixels-------------------
208
209;//----------------Pack p0-p3-----------------------
210        LDR     mask, =MASK_2
211
        ;// Load four rows of p pixels; pQ0 ends up back on the first row.
212        M_LDR   row0, [pQ0], srcdstStep
213        M_LDR   row1, [pQ0], srcdstStep
214        LDR     row2, [pQ0]
215        LDR     row3, [pQ0, srcdstStep]
216        SUB     pQ0, pQ0, srcdstStep, LSL #1
217
218        ;// row0 = [r0p0 r0p1 r0p2 r0p3]
219        ;// row1 = [r1p0 r1p1 r1p2 r1p3]
220        ;// row2 = [r2p0 r2p1 r2p2 r2p3]
221        ;// row3 = [r3p0 r3p1 r3p2 r3p3]
222
        ;// 4x4 byte transpose, step 1: interleave bytes of row pairs.
        ;// AND with MASK_2 keeps bytes 3 and 1 of the first row; UXTAB16
        ;// adds the matching bytes of the second row into the low byte of
        ;// each halfword.
223        AND     tunpk0, mask, row0
224        AND     tunpk6, mask, row0, LSL#8
225        UXTAB16 tunpk0, tunpk0, row1, ROR#8
226        UXTAB16 tunpk6, tunpk6, row1
227        AND     tunpk2, mask, row2
228        AND     tunpk3, mask, row2, LSL#8
229        UXTAB16 tunpk2, tunpk2, row3, ROR#8
230        UXTAB16 tunpk3, tunpk3, row3
231
232        ;// tunpk0 = [r0p0 r1p0 r0p2 r1p2]
233        ;// tunpk6 = [r0p1 r1p1 r0p3 r1p3]
234        ;// tunpk2 = [r2p0 r3p0 r2p2 r3p2]
235        ;// tunpk3 = [r2p1 r3p1 r2p3 r3p3]
236
        ;// Transpose step 2: combine halfwords so that each word holds one
        ;// pixel column for rows 0..3.
237        PKHTB   p_0, tunpk0, tunpk2, ASR#16
238        PKHTB   p_1, tunpk6, tunpk3, ASR#16
239        PKHBT   p_2, tunpk2, tunpk0, LSL#16
240        PKHBT   p_3, tunpk3, tunpk6, LSL#16
241
242
243        ;// p_0 = [r0p0 r1p0 r2p0 r3p0]
244        ;// p_1 = [r0p1 r1p1 r2p1 r3p1]
245        ;// p_2 = [r0p2 r1p2 r2p2 r3p2]
246        ;// p_3 = [r0p3 r1p3 r2p3 r3p3]
247
        ;// p_3 shares r2 with bS/filt, so keep it on the stack.
248        M_STR   p_3, pP_3
249
250;//----------------Pack q0-q3-----------------------
        ;// LoopX: process one edge. On entry p_0..p_2 are in registers,
        ;// p_3 is in pP_3, pQ0 points at p3 of row 0, and alpha/beta hold
        ;// the values for this edge.
251LoopX
        ;// One bS byte governs all four rows of this edge; pBS advances by
        ;// 4 to the same strip of the next edge. pQ0 and pBS are spilled
        ;// because r0 and r9 are reused below.
252        LDRB    bS, [pBS], #4
253        M_STR   pQ0, ppQ0Step
254        LDR     mask, =MASK_2
255        CMP     bS, #0
256        M_STR   pBS, ppBS
257
        ;// The write-back moves pQ0 onto the q pixels even when the edge is
        ;// skipped; the skip path relies on this 4-byte advance. BEQ.W
        ;// consumes the flags of CMP bS, #0 above (.W requests the wide
        ;// branch encoding).
258        LDR     row4, [pQ0, #4]!
259        BEQ.W   NoFilterBS0
260        M_LDR   row5, [pQ0, srcdstStep]!
261        M_LDR   row6, [pQ0, srcdstStep]!
262        M_LDR   row7, [pQ0, srcdstStep]
263
264        ;// row4 = [r0q3 r0q2 r0q1 r0q0]
265        ;// row5 = [r1q3 r1q2 r1q1 r1q0]
266        ;// row6 = [r2q3 r2q2 r2q1 r2q0]
267        ;// row7 = [r3q3 r3q2 r3q1 r3q0]
268
        ;// Same transpose as for the p side. From here on r0 (pQ0) and r1
        ;// (srcdstStep) are scratch; both are reloaded from ppQ0Step later.
        ;// CMP bS, #4 sets the flags consumed by BLT bSLT4 far below: no
        ;// instruction in between updates N/Z/C/V (USUB8 only writes the GE
        ;// bits), and bS must be tested here because r2 is overwritten as
        ;// tunpk7 a few lines down.
269        AND     tunpk4, mask, row4
270        CMP     bS, #4
271        AND     tunpk5, mask, row4, LSL#8
272        UXTAB16 tunpk4, tunpk4, row5, ROR#8
273        UXTAB16 tunpk5, tunpk5, row5
274        AND     tunpk6, mask, row6
275        AND     tunpk7, mask, row6, LSL#8
276        UXTAB16 tunpk6, tunpk6, row7, ROR#8
277        UXTAB16 tunpk7, tunpk7, row7
278
279        ;// tunpk4 = [r0q3 r1q3 r0q1 r1q1]
280        ;// tunpk5 = [r0q2 r1q2 r0q0 r1q0]
281        ;// tunpk6 = [r2q3 r3q3 r2q1 r3q1]
282        ;// tunpk7 = [r2q2 r3q2 r2q0 r3q0]
283
        ;// q_3 is also spilled: it becomes p_0 of the next edge.
284        PKHTB   q_3, tunpk4, tunpk6, ASR#16
285        PKHTB   q_2, tunpk5, tunpk7, ASR#16
286        PKHBT   q_1, tunpk6, tunpk4, LSL#16
287        M_STR   q_3, pQ_3
288        PKHBT   q_0, tunpk7, tunpk5, LSL#16
289
290
291        ;// q_0 = [r0q0 r1q0 r2q0 r3q0]
292        ;// q_1 = [r0q1 r1q1 r2q1 r3q1]
293        ;// q_2 = [r0q2 r1q2 r2q2 r3q2]
294        ;// q_3 = [r0q3 r1q3 r2q3 r3q3]
295
296
297;//--------------Filtering Decision -------------------
        ;// Per-lane absolute differences use the pattern
        ;//   USUB8 d, x, y   ; d = x - y, GE lane set where x >= y
        ;//   USUB8 a, y, x   ; a = y - x, GE lane set where y >= x
        ;//   SEL   r, a, d   ; r = |x - y| (takes a where y >= x, else d)
        ;// followed by USUB8 against the threshold and a SEL that clears the
        ;// lane's flag (picks m00) where |x - y| >= threshold.
        ;// filt ends up as 0x01 in each lane (row) that must be filtered.
298        LDR     m01, =MASK_1                ;//  01010101 mask
299        MOV     m00, #MASK_0                ;//  00000000 mask
300
301        ;// Check |p0-q0|<Alpha
302        USUB8   dp0q0, p_0, q_0
303        USUB8   a, q_0, p_0
304        SEL     ap0q0, a, dp0q0
305        USUB8   a, ap0q0, alpha
306        SEL     filt, m00, m01
307
308        ;// Check |p1-p0|<Beta
309        USUB8   dp1p0, p_1, p_0
310        USUB8   a, p_0, p_1
311        SEL     a, a, dp1p0
312        USUB8   a, a, beta
313        SEL     filt, m00, filt
314
315        ;// Check |q1-q0|<Beta
316        USUB8   dq1q0, q_1, q_0
317        USUB8   a, q_0, q_1
318        SEL     a, a, dq1q0
319        USUB8   a, a, beta
320        SEL     filt, m00, filt
321
322        ;// Check ap<Beta
323        USUB8   dp2p0, p_2, p_0
324        USUB8   a, p_0, p_2
325        SEL     a, a, dp2p0
326        USUB8   a, a, beta
327        SEL     apflg, m00, filt            ;// apflg = filt && (ap<beta)
328
329        ;// Check aq<Beta
        ;// t2 reuses r14 (m00) and t7 reuses r7 (beta), both after their
        ;// last use. The GE bits from the final USUB8 are consumed by the
        ;// SEL at the top of bSGE4 / bSLT4, which needs t7 = 0 as the new
        ;// zero operand.
330        USUB8   dq2q0, q_2, q_0
331        USUB8   t2, q_0, q_2
332        SEL     t2, t2, dq2q0
333        USUB8   t2, t2, beta
334        MOV     t7,#0
335
336
        ;// Flags are still those of CMP bS, #4: bS < 4 takes the weaker
        ;// filter path, otherwise fall through to the bS >= 4 path.
337        BLT     bSLT4
338;//-------------------Filter--------------------
339bSGE4
340        ;//---------bSGE4 Execution---------------
        ;// Combine the ap flag (bit 0 of each lane) with the aq flag
        ;// (shifted to bit 1). If no lane needs filtering, restore pQ0 and
        ;// srcdstStep (the EQ operand presumably makes the M_LDRD
        ;// conditional - confirm in armCOMM_s.h) and skip the edge.
341        SEL     t1, t7, filt            ;// aqflg = filt && (aq<beta)
342        CMP     filt, #0
343        ORR     apqflg, apflg, t1, LSL #1
344        M_LDRD  pQ0, srcdstStep, ppQ0Step, EQ
345        BEQ     NoFilterFilt0
346
        ;// NOTE(review): the helper's inputs/outputs are not visible here;
        ;// the code below expects the results in P0b..P2b and Q0b..Q2b.
347        BL      armVCM4P10_DeblockingLumabSGE4_unsafe
348
349        ;//---------Store result---------------
350
351        LDR     maskb,=MASK_2
352
353        ;// P0b = [r0p0 r1p0 r2p0 r3p0]
354        ;// P1b = [r0p1 r1p1 r2p1 r3p1]
355        ;// P2b = [r0p2 r1p2 r2p2 r3p2]
356        ;// P3b = [r0p3 r1p3 r2p3 r3p3]
357
        ;// p3 is not modified by the filter: fetch the saved column, then
        ;// reuse the slot for Q0b, which is p_3 of the next edge.
358        M_LDR   P3b, pP_3
359        M_STR   Q0b, pP_3
360
361        ;//------Pack p0-p3------
        ;// Transpose the columns back into rows.
362        AND     tunpk0, maskb, P0b
363        AND     tunpk2, maskb, P0b, LSL#8
364        UXTAB16 tunpk0, tunpk0, P1b, ROR#8
365        UXTAB16 tunpk2, tunpk2, P1b
366
367        AND     tunpk3, maskb, P2b
368        AND     tunpk8, maskb, P2b, LSL#8
369        UXTAB16 tunpk3, tunpk3, P3b, ROR#8
370        UXTAB16 tunpk8, tunpk8, P3b
371
372        ;// tunpk0 = [r0p0 r0p1 r2p0 r2p1]
373        ;// tunpk2 = [r1p0 r1p1 r3p0 r3p1]
374        ;// tunpk3 = [r0p2 r0p3 r2p2 r2p3]
375        ;// tunpk8 = [r1p2 r1p3 r3p2 r3p3]
376
        ;// Q1b lives in r0: move it to r4 (p_2 of the next edge) before r0
        ;// and r1 are reloaded with the pixel pointer and step.
377        MOV     p_2, Q1b
378        M_LDRD  pQ0b, Stepb, ppQ0Step
379
380        PKHTB   row9, tunpk0, tunpk3, ASR#16
381        PKHBT   row7, tunpk3, tunpk0, LSL#16
382        PKHTB   row3, tunpk2, tunpk8, ASR#16
383        PKHBT   row6, tunpk8, tunpk2, LSL#16
384
385        ;// row9 = [r0p0 r0p1 r0p2 r0p3]
386        ;// row3 = [r1p0 r1p1 r1p2 r1p3]
387        ;// row7 = [r2p0 r2p1 r2p2 r2p3]
388        ;// row6 = [r3p0 r3p1 r3p2 r3p3]
389
        ;// Store row 0 and step to row 1; rows 2 and 3 are stored relative
        ;// to row 1; storing row 1 then advances pQ0b by 4 onto the q side.
390        M_STR   row9, [pQ0b], Stepb
391        STR     row7, [pQ0b, Stepb]
392        STR     row6, [pQ0b, Stepb, LSL #1]
393        STR     row3, [pQ0b], #4
394
        ;// q3 is not modified by the filter; r3 now also serves as p_0 of
        ;// the next edge.
395        M_LDR   Q3b, pQ_3
396
397        ;// Q0b = [r0q0 r1q0 r2q0 r3q0]
398        ;// Q1b = [r0q1 r1q1 r2q1 r3q1]
399        ;// Q2b = [r0q2 r1q2 r2q2 r3q2]
400        ;// Q3b = [r0q3 r1q3 r2q3 r3q3]
401
402        ;//------Pack q0-q3------
        ;// p_2 holds the copy of Q1b made above.
403        AND     tunpk0, maskb, p_2
404        AND     tunpk2, maskb, p_2, LSL#8
405        UXTAB16 tunpk0, tunpk0, Q0b, ROR#8
406        UXTAB16 tunpk2, tunpk2, Q0b
407
408        AND     tunpk3, maskb, Q3b
409        AND     tunpk8, maskb, Q3b, LSL#8
410        UXTAB16 tunpk3, tunpk3, Q2b, ROR#8
411        UXTAB16 tunpk8, tunpk8, Q2b
412
413        ;// tunpk0 = [r0q1 r0q0 r2q1 r2q0]
414        ;// tunpk2 = [r1q1 r1q0 r3q1 r3q0]
415        ;// tunpk3 = [r0q3 r0q2 r2q3 r2q2]
416        ;// tunpk8 = [r1q3 r1q2 r3q3 r3q2]
417
418        PKHTB   row8, tunpk3, tunpk0, ASR#16
419        PKHBT   row7, tunpk0, tunpk3, LSL#16
420        PKHTB   row4, tunpk8, tunpk2, ASR#16
421        PKHBT   row6, tunpk2, tunpk8, LSL#16
422
        ;// (most significant byte first, as in the load comments)
423        ;// row8 = [r0q3 r0q2 r0q1 r0q0]
424        ;// row4 = [r1q3 r1q2 r1q1 r1q0]
425        ;// row7 = [r2q3 r2q2 r2q1 r2q0]
426        ;// row6 = [r3q3 r3q2 r3q1 r3q0]
427
        ;// pQ0b points at the q word of row 1: store rows 1..3, then step
        ;// back one row and store row 0. pQ0 is left on the q word of
        ;// row 0, which is the p word of the next edge.
428        STR     row4, [pQ0b]
429        STR     row7, [pQ0b, Stepb]
430        STR     row6, [pQ0b, Stepb, LSL #1]
431
432        SUB     pQ0, pQ0b, Stepb
        ;// Q2b becomes p_1 of the next edge.
433        MOV     p_1, Q2b
434
435        STR     row8, [pQ0]
436
        ;// Reload loop state; every edge after the first of a strip uses
        ;// the alpha[1]/beta[1] pair.
437        M_LDRD  XY, pBS, pXYBS
438        M_LDR   pThresholds, ppThresholds
439        M_LDRD  alpha, beta, pAlphaBeta1
440
        ;// Advance the counter (carry set after the 4th edge of a strip)
        ;// and step pThresholds to the next edge. The flags from ADDS stay
        ;// valid through the following ADD/M_STR for BCC here and for BNE
        ;// at ExitLoopY.
441        ADDS    XY, XY, XY
442        ADD     pThresholds, #4
443        M_STR   pThresholds, ppThresholds
444        M_STR   XY, pXYBS
445        BCC     LoopX
446        B       ExitLoopY
447
448;//---------- Exit of LoopX --------------
449;//---- for the case of no filtering -----
450
        ;// NoFilterFilt0: bS != 0 but no lane passed the alpha/beta tests.
        ;// pQ0 was just reloaded from ppQ0Step, so redo the 4-byte advance
        ;// that the bS == 0 path already got from the write-back load.
451NoFilterFilt0
452        ADD     pQ0, pQ0, #4
453NoFilterBS0
454        ;// Load counter for LoopX
455        M_LDRD  XY, pBS, pXYBS
456        M_LDR   pThresholds, ppThresholds
457        M_LDRD  alpha, beta, pAlphaBeta1
458
459        ;// Align the pointer
        ;// The q columns were not (fully) packed into the p registers on
        ;// this path, so the next edge re-enters at LoopY to reload its
        ;// p side from memory instead of jumping to LoopX.
460        ADDS    XY, XY, XY
461        ADD     pThresholds, pThresholds, #4
462        M_STR   pThresholds, ppThresholds
463        M_STR   XY, pXYBS
464        BCC     LoopY
465        B       ExitLoopY
466
467bSLT4
468        ;//---------bSLT4 Execution---------------
        ;// Uses the GE bits left by the aq<beta USUB8. If no lane needs
        ;// filtering, restore pQ0/srcdstStep and skip the edge.
469        SEL     aqflg, t7, filt            ;// aqflg = filt && (aq<beta)
470        M_LDR   ptC0, ppThresholds
471        CMP     filt, #0
472        M_LDRD  pQ0, srcdstStep, ppQ0Step, EQ
473        BEQ     NoFilterFilt0
474
        ;// Read the threshold byte for this edge/strip and advance the
        ;// saved pThresholds by 4 (next edge).
475        LDRB    tC0, [ptC0], #4
476        M_STR   ptC0, ppThresholds
477
        ;// NOTE(review): the helper's inputs/outputs are not visible here;
        ;// the code below expects the results in P0a, P1a, Q0a and Q1a.
478        BL      armVCM4P10_DeblockingLumabSLT4_unsafe
479
480        ;//---------Store result---------------
481        ;//--------Pack p1,p0,q1,q0------------
482
483        ;//Load destination pointer
        ;// Prepare the next edge: Q0a becomes p_3 (kept in pP_3) and q_2
        ;// becomes p_1. q_2 must be copied now because r10 is reused as
        ;// tunpk2 just below.
484        LDR     maska,=MASK_2
485        M_STR   Q0a, pP_3
486        MOV     p_1, q_2
487
488        ;// P1a = [r0p1 r1p1 r2p1 r3p1]
489        ;// P0a = [r0p0 r1p0 r2p0 r3p0]
490        ;// Q0a = [r0q0 r1q0 r2q0 r3q0]
491        ;// Q1a = [r0q1 r1q1 r2q1 r3q1]
492
493        AND     tunpk1, maska, P0a
494        AND     tunpk2, maska, P0a, LSL#8
495        UXTAB16 tunpk1, tunpk1, P1a, ROR#8
496        UXTAB16 tunpk2, tunpk2, P1a
497
        ;// P0a (r1) has been consumed, so r0/r1 can take the pixel pointer
        ;// and step again.
498        M_LDRD  pQ0a, Stepa, ppQ0Step
499
500        AND     tunpk9, maska, Q1a
501        AND     tunpk3, maska, Q1a, LSL#8
502        UXTAB16 tunpk9, tunpk9, Q0a, ROR#8
503        UXTAB16 tunpk3, tunpk3, Q0a
504
505        ;// tunpk1 = [r0p0 r0p1 r2p0 r2p1]
506        ;// tunpk2 = [r1p0 r1p1 r3p0 r3p1]
507        ;// tunpk9 = [r0q1 r0q0 r2q1 r2q0]
508        ;// tunpk3 = [r1q1 r1q0 r3q1 r3q0]
509
        ;// Only p1, p0, q0 and q1 are written back on this path, as two
        ;// halfwords per row. Rows 0 and 1 sit in the upper halfwords and
        ;// are shifted down first; rows 2 and 3 are stored directly (STRH
        ;// writes the low halfword).
510        MOV     t4, tunpk1, LSR #16
511        MOV     t0, tunpk9, LSR #16
512
513        STRH    t4,[pQ0a, #2]!          ;//Stores [r0p0 r0p1]
514        STRH    t0,[pQ0a, #2]           ;//Stores [r0q0 r0q1]
515
516        MOV     t4, tunpk2, LSR #16
517        MOV     t0, tunpk3, LSR #16
518
519        M_STRH  t4,[pQ0a, Stepa]!       ;//Stores [r1p0 r1p1]
520        STRH    t0,[pQ0a, #2]           ;//Stores [r1q0 r1q1]
521
522        M_STRH  tunpk1,[pQ0a, Stepa]!   ;//Stores [r2p0 r2p1]
523        STRH    tunpk2,[pQ0a, Stepa]    ;//Stores [r3p0 r3p1]
524        STRH    tunpk9,[pQ0a, #2]!        ;//Stores [r2q0 r2q1]
525        STRH    tunpk3,[pQ0a, Stepa]    ;//Stores [r3q0 r3q1]
526
        ;// pQ0a is on the q word of row 2: step back two rows so pQ0
        ;// points at the q word of row 0 (the p word of the next edge).
527        SUB     pQ0, pQ0a, Stepa, LSL #1
528
529        ;// Load counter
530        M_LDRD  XY, pBS, pXYBS
531
532        ;// Reload Pixels
        ;// Saved q_3 becomes p_0 and Q1a becomes p_2 of the next edge.
533        M_LDR   p_0, pQ_3
534        MOV     p_2, Q1a
535
536        M_LDRD  alpha, beta, pAlphaBeta1
537
538        ADDS    XY, XY, XY
539        M_STR   XY, pXYBS
540        BCC     LoopX
541
542;//-------- Common Exit of LoopY -----------------
543        ;// Align the pointers
        ;// Only the bSLT4 path falls through here; the other paths jump to
        ;// ExitLoopY with pThresholds already in its register.
544        M_LDR   pThresholds, ppThresholds
545ExitLoopY
        ;// End of a strip: move pQ0 back over the four edges (16 bytes)
        ;// and down four rows; move pBS and pThresholds from 16 bytes past
        ;// this strip's first entry to the next strip's first entry (+1).
        ;// None of these instructions changes the flags, so BNE still tests
        ;// the Z flag of the last ADDS XY, XY, XY: XY is zero only after
        ;// the fourth strip. The first edge of the next strip uses the
        ;// alpha[0]/beta[0] pair.
546        SUB     pQ0, pQ0, #16
547        ADD     pQ0, pQ0, srcdstStep, LSL #2
548        SUB     pBS, pBS, #15
549        SUB     pThresholds, pThresholds, #15
550        M_STR   pThresholds, ppThresholds
551
552        M_LDRD  alpha, beta, pAlphaBeta0
553
554        BNE     LoopY
555        MOV     r0, #OMX_Sts_NoErr
556
557        M_END
558;//-----------------End Filter--------------------
559
560    ENDIF
561
562        END
563
564
565