armVCM4P10_DeblockingChroma_unsafe_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  armVCM4P10_DeblockingChroma_unsafe_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   12290
21;// Date:       Wednesday, April 9, 2008
22;//
23;//
24;//
25;//
26
27        INCLUDE omxtypes_s.h
28        INCLUDE armCOMM_s.h
29
30        M_VARIANTS CortexA8
31
32
33    IF  CortexA8
34
35pAlpha      RN 2             ;// r2: address from which dAlpha is loaded (one byte, replicated)
36pBeta       RN 3             ;// r3: address from which dBeta is loaded (one byte, replicated)
37
38pThresholds RN 5             ;// r5: address of the threshold bytes; both routines below advance it
39pBS         RN 4             ;// not referenced by the routines visible in this file
40bS3210      RN 6             ;// not referenced by the routines visible in this file
41
42;// Pixels (only dP_0, dP_1, dQ_0 and dQ_1 are read by the chroma routines below)
43dP_0        DN D4.U8
44dP_1        DN D5.U8
45dP_2        DN D6.U8
46dP_3        DN D7.U8
47dQ_0        DN D8.U8
48dQ_1        DN D9.U8
49dQ_2        DN D10.U8
50dQ_3        DN D11.U8
51
52
53;// Filtering Decision
54dAlpha      DN D0.U8
55dBeta       DN D2.U8
56
57dFilt       DN D16.U8        ;// select mask for the VBIF in the bSLT4 routine
58dAqflg      DN D12.U8        ;// not referenced by the routines visible in this file
59dApflg      DN D17.U8        ;// not referenced by the routines visible in this file
60
61dAp0q0      DN D13.U8        ;// not referenced; D13 is used below as dHSp0q1 / dP_0t
62
63;// bSLT4
64dTC3210     DN D18.U8        ;// threshold bytes as loaded from pThresholds
65dTCs        DN D31.S8        ;// signed view of D31, used for the VMIN/VNEG/VMAX clamp
66dTC         DN D31.U8        ;// unsigned view of the same register as dTCs
67
68dMask_0     DN D14.U8        ;// read, never written, by the bSLT4 routine
69dMask_1     DN D15.U8        ;// read, never written, by the bSLT4 routine
70dMask_4     DN D26.U16       ;// not referenced by the routines visible in this file
71
72dTemp       DN D28.U8        ;// scratch copy of dTC3210; D28 is also the low half of qP_0n (Q14)
73dDummy      DN D17.U8        ;// not referenced by the routines visible in this file
74
75;// Computing P0,Q0
76qDq0p0      QN Q10.S16
77qDp1q1      QN Q11.S16
78qDelta      QN Q10.S16  ;// same register as qDq0p0 (reused once q0-p0 has been consumed)
79dDelta      DN D20.S8        ;// low half of Q10, i.e. overlaps qDelta / qDq0p0
80
81
82;// Widened P0,Q0 accumulators and their narrowed results (no P1/Q1 is produced in this file)
83qP_0n       QN Q14.S16       ;// D28:D29
84qQ_0n       QN Q12.S16       ;// D24:D25
85
86dQ_0n       DN D24.U8        ;// low half of qQ_0n
87dP_0n       DN D29.U8        ;// high half of qP_0n
88
89;// bSGE4
90
91dHSp0q1     DN D13.U8
92dHSq0p1     DN D31.U8        ;// same register as dTC / dTCs in the bSLT4 routine
93
94dBS3210     DN D28.U16       ;// not referenced by the routines visible in this file
95
96dP_0t       DN D13.U8   ;// same register as dHSp0q1
97dQ_0t       DN D31.U8   ;// same register as dHSq0p1
98
99dP_0n       DN D29.U8        ;// repeats the definition given earlier (same register, same type)
100dQ_0n       DN D24.U8   ;// repeats the definition given earlier (same register, same type)
101
102;// Register usage for - armVCM4P10_DeblockingChromabSLT4_unsafe
103;//
104;// Inputs - Pixels             - p0-p1: D4-D5, q0-q1: D8-D9 (p2-p3 / q2-q3 are not read)
105;//        - Filter masks       - filt: D16, dMask_0: D14, dMask_1: D15
106;//        - Additional Params  - pAlpha: r2, pBeta: r3, pThresholds: r5
107;//
108;// Outputs - Pixels            - P0: D29, Q0: D24
109;//         - Additional Params - alpha: D0, beta: D2, pThresholds: r5 (written back by the post-indexed load)
110
111;// Registers Corrupted         - D0, D2, D18-D31
112
113
114        M_START armVCM4P10_DeblockingChromabSLT4_unsafe
115        ;// bSLT4 chroma filter: builds delta from p1,p0,q0,q1, clamps it to +/-dTC and
116        ;// leaves the filtered rows in dP_0n (D29) and dQ_0n (D24). Nothing is stored to memory here.
117        ;dTC3210 -18
118        ;dTemp-28
119
120        VLD1        d18.U32[0], [pThresholds]! ;// 4 threshold bytes -> lane 0 of dTC3210 (D18); pThresholds is written back
121
122        ;// delta = (((q0-p0)<<2) + (p1-q1) + 4) >> 3;
123        ;// computed here as dDelta = (((qDp1q1 >> 2) + qDq0p0 + 1) >> 1), which yields the same value
124
125        ;// qDp1q1-11
126        ;// qDq0p0-10
127        VSUBL       qDp1q1, dP_1, dQ_1              ;// p1 - q1, widened to 16 bits
128        VMOV        dTemp, dTC3210                  ;// duplicate so VZIP can pair every byte with itself
129        VSUBL       qDq0p0, dQ_0, dP_0              ;// q0 - p0, widened to 16 bits
130        VSHR        qDp1q1, qDp1q1, #2              ;// (p1 - q1) >> 2
131        VZIP.8      dTC3210, dTemp                  ;// dTC3210 = t0,t0,t1,t1,t2,t2,t3,t3
132
133        ;// qDelta-qDq0p0-10
134
135        ;// dTC = dTC3210 + dMask_1 (dApflg / dAqflg are not read by this chroma routine)
136
137        ;// dTC3210-18
138        ;// dTemp-28
139        ;// dTC-31
140        VBIF        dTC3210, dMask_0, dFilt         ;// where dFilt bits are 0, take the bits of dMask_0 instead
141        VRHADD      qDelta, qDp1q1, qDq0p0          ;// (a + b + 1) >> 1 per 16-bit lane; overwrites qDq0p0
142        VADD        dTC, dTC3210, dMask_1           ;// NOTE(review): dMask_1 presumably holds 1 per lane - confirm in caller
143        VQMOVN      dDelta, qDelta                  ;// saturating narrow to signed 8 bits
144        ;// dDelta-d20
145
146        ;// dDelta = armClip(-dTC, dTC, dDelta): VMIN against dTCs, then VMAX against -dTCs
147        VLD1        {dAlpha[]}, [pAlpha]            ;// one byte replicated to all lanes; not consumed in this routine
148        VMIN        dDelta, dDelta, dTCs
149        VNEG        dTCs, dTCs
150        VLD1        {dBeta[]}, [pBeta]              ;// one byte replicated to all lanes; not consumed in this routine
151        ;1
152        VMAX        dDelta, dDelta, dTCs
153
154        ;// dP_0n - 29
155        ;// dQ_0n - 24
156
157        ;// pQ0[-1*Step] = (OMX_U8)armClip(0, 255, dP_0 + delta);
158        ;// pQ0[0*Step] = (OMX_U8)armClip(0, 255, dQ_0 - delta);
159
160        ;// dP_0n = (OMX_U8)armClip(0, 255, dP_0 + dDelta);
161        ;// dQ_0n = (OMX_U8)armClip(0, 255, dQ_0 - dDelta);
162
163        ;// qP_0n - 14
164        ;// qQ_0n - 12
165
166        VMOVL       qP_0n, dP_0                     ;// widen p0; Q14 covers D28, so dTemp is overwritten here
167        VMOVL       qQ_0n, dQ_0                     ;// widen q0
168
169        ;1
170        VADDW       qP_0n, qP_0n, dDelta            ;// p0 + delta
171        VSUBW       qQ_0n, qQ_0n, dDelta            ;// q0 - delta
172
173        VQMOVUN     dP_0n, qP_0n                    ;// saturate to 0..255 while narrowing
174        VQMOVUN     dQ_0n, qQ_0n                    ;// saturate to 0..255 while narrowing
175
176        M_END
177
178;// Register usage for - armVCM4P10_DeblockingChromabSGE4_unsafe()
179;//
180;// Inputs - Pixels             - p0-p1: D4-D5, q0-q1: D8-D9 (p2-p3 / q2-q3 are not read)
181;//        - Filter masks       - none are read by this routine
182;//        - Additional Params  - pAlpha: r2, pBeta: r3, pThresholds: r5
183;//
184;// Outputs - Pixels            - P0: D13, Q0: D31; alpha: D0, beta: D2; pThresholds: r5 advanced by 4
185
186;// Registers Corrupted         - D0, D2, D13, D31
187
188        M_START armVCM4P10_DeblockingChromabSGE4_unsafe
189        ;// bSGE4 chroma filter: P0' = (2*p1 + p0 + q1 + 2) >> 2 in dP_0t (D13), Q0' = (2*q1 + q0 + p1 + 2) >> 2 in dQ_0t (D31)
190        ;dHSq0p1 - 31
191        ;dHSp0q1 - 13
192        VHADD       dHSp0q1, dP_0, dQ_1             ;// (p0 + q1) >> 1
193        VHADD       dHSq0p1, dQ_0, dP_1             ;// (q0 + p1) >> 1
194
195        ;// Load alpha/beta and step pThresholds by 4 bytes (no bS mask is built in this routine)
196
197        ;// dHSp0q1-13
198        ;// dP_0t-dHSp0q1-13
199        ;// dHSq0p1-31
200        ;// dQ_0t-Temp1-31
201        VLD1        {dAlpha[]}, [pAlpha]            ;// one byte replicated to all lanes; not consumed in this routine
202        ADD         pThresholds, pThresholds, #4    ;// same pointer advance as the post-indexed load in the bSLT4 routine
203        VLD1        {dBeta[]}, [pBeta]              ;// one byte replicated to all lanes; not consumed in this routine
204
205        VRHADD      dP_0t, dHSp0q1, dP_1            ;// (((p0 + q1) >> 1) + p1 + 1) >> 1, written over dHSp0q1
206        VRHADD      dQ_0t, dHSq0p1, dQ_1            ;// (((q0 + p1) >> 1) + q1 + 1) >> 1, written over dHSq0p1
207
208        M_END
209
210        ENDIF
211
212        END
213