omxVCM4P10_TransformDequantLumaDCFromPair_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   12290
21;// Date:       Wednesday, April 9, 2008
22;//
23;//
24;//
25;//
26;// Description:
27;// H.264 inverse quantize and transform module
28;//
29;//
30
31;// Include standard headers
32
33        INCLUDE omxtypes_s.h
34        INCLUDE armCOMM_s.h
35
36;// Import/Export symbols required from/to other files
37;// (For example tables)
38
39        IMPORT armVCM4P10_UnpackBlock4x4
40        IMPORT armVCM4P10_QPDivTable
41        IMPORT armVCM4P10_VMatrixQPModTable
42
43        M_VARIANTS CortexA8
44
45;// Set debugging level
46;//DEBUG_ON    SETL {TRUE}
47
48
49;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
50
51
52;// Guarding implementation by the processor name
53
54
55
56;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
57
58;// Guarding implementation by the processor name
59
60    IF  CortexA8
61
62;//Input Registers
63pData               RN  0                   ;// Address the 4x4 block is loaded from and stored back to
64QP                  RN  1                   ;// Byte index used with both look-up tables
65
66
67;//Local Scratch Registers
68
69;// ARM Registers
70
71pQPDivTable         RN  2                   ;// Base address of armVCM4P10_QPDivTable
72pQPModTable         RN  3                   ;// Base address of armVCM4P10_VMatrixQPModTable
73Shift               RN  4                   ;// Signed byte read from pQPDivTable[QP]
74Scale               RN  5                   ;// Signed byte read from pQPModTable[QP], then shifted left by Shift
75
76;// NEON Registers
;//
;// The aliases below intentionally name the same physical registers several
;// times, once per processing stage:
;//   D0-D3  : dIn*  -> dRowOp* -> dColOp* -> dOut*  (the block itself)
;//   D4-D7  : dRowSum*/dRowDiff* -> dColSum*/dColDiff* (butterfly temporaries)
;//   Q0,Q1  : 32-bit views of D0:D1 and D2:D3, used only for the transpose
;//   D5     : reused as dScale once dColSum2 is dead
;//   Q3     : (D6:D7) reused as qRound0 once dColDiff1/dColDiff2 are dead
;//   Q4-Q6  : (D8-D13) qRound1..qRound3
77
78;// Packed Input values (16-bit signed lanes)
79dIn0                DN  D0.S16
80dIn1                DN  D1.S16
81dIn2                DN  D2.S16
82dIn3                DN  D3.S16
83
84;// Intermediate calculations (first butterfly pass)
85dRowSum1            DN  D4.S16
86dRowSum2            DN  D5.S16
87dRowDiff1           DN  D6.S16
88dRowDiff2           DN  D7.S16
89
90;// Row operated values (same registers as dIn0..dIn3)
91dRowOp0             DN  D0.S16
92dRowOp1                DN  D1.S16
93dRowOp2                DN  D2.S16
94dRowOp3                DN  D3.S16
95qRowOp01            QN  Q0.32               ;// D0:D1 viewed as 32-bit lanes
96qRowOp23            QN  Q1.32               ;// D2:D3 viewed as 32-bit lanes
97
98;// Intermediate calculations (second butterfly pass, same registers as dRowSum*/dRowDiff*)
99dColSum1            DN  D4.S16
100dColSum2            DN  D5.S16
101dColDiff1           DN  D6.S16
102dColDiff2           DN  D7.S16
103
104;// Column operated values (same registers as dIn0..dIn3)
105dColOp0             DN  D0.S16
106dColOp1                DN  D1.S16
107dColOp2                DN  D2.S16
108dColOp3                DN  D3.S16
109
110;// Temporary scratch variables
111
112dScale              DN  D5.S16              ;// Scale replicated into four 16-bit lanes (overlaps dColSum2)
113qRound0             QN  Q3.S32              ;// 32-bit accumulators, preset to the rounding value;
114qRound1             QN  Q4.S32              ;// Q3 overlaps D6:D7 (dColDiff1/dColDiff2)
115qRound2             QN  Q5.S32
116qRound3             QN  Q6.S32
117
118;// InvTransformed and Dequantized values (same registers as dIn0..dIn3)
119dOut0               DN  D0.S16
120dOut1                DN  D1.S16
121dOut2                DN  D2.S16
122dOut3                DN  D3.S16
123
124
125    ;// Allocate stack memory required by the function
126
127
128    ;// Write function header
    ;//
    ;// armVCM4P10_InvTransformDequantLumaDC4x4
    ;//   Reads sixteen 16-bit values from pData, applies two passes of
    ;//   add/subtract butterflies with a 4x4 transpose in between, then
    ;//   dequantizes every result as
    ;//       out = (value * (pQPModTable[QP] << pQPDivTable[QP]) + 2) >> 2
    ;//   and writes the sixteen 16-bit results back to pData (in place).
    ;//
    ;// In:    r0 = pData, r1 = QP
    ;// Out:   the 32 bytes at pData are overwritten; no instruction in this
    ;//        body writes r0, so no status code is produced here.
    ;// Uses:  r2-r5 and D0-D13 as scratch.
    ;// NOTE(review): M_START/M_END are macros from armCOMM_s.h (not visible
    ;//   here); the "r5,d13" arguments presumably request save/restore of the
    ;//   callee-saved registers up to r5 and d13 - confirm against the macro.
129    M_START armVCM4P10_InvTransformDequantLumaDC4x4,r5,d13
130
131    ;******************************************************************
132    ;// The strategy used in implementing the transform is as follows:*
133    ;// Load the 4x4 block into 4 D-registers                         *
134    ;// Transpose the 4x4 matrix                                      *
135    ;// Perform the row operations (on columns) using SIMD            *
136    ;// Transpose the 4x4 result matrix                               *
137    ;// Perform the column operations                                 *
138    ;******************************************************************
139
140        ;// Load all the 4x4 values in Transposed form:
        ;// VLD4 de-interleaves with a stride of four elements, so dInN
        ;// receives memory elements N, N+4, N+8 and N+12.
141
142        VLD4    {dIn0,dIn1,dIn2,dIn3},[pData]
143        LDR     pQPDivTable, =armVCM4P10_QPDivTable        ;// QP Division look-up-table base pointer
144        LDR     pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer
145
146        ;****************************************
147        ;// Row Operations (Performed on columns)
148        ;****************************************
149        ;// Scale factor calculation is done using ARM instructions
150        ;// Interleaved with NEON instructions in order to Dual issue
        ;//
        ;// Butterfly results, lane by lane:
        ;//   dRowOp0 = (dIn0 + dIn1) + (dIn2 + dIn3)
        ;//   dRowOp1 = (dIn0 + dIn1) - (dIn2 + dIn3)
        ;//   dRowOp2 = (dIn0 - dIn1) - (dIn2 - dIn3)
        ;//   dRowOp3 = (dIn0 - dIn1) + (dIn2 - dIn3)
        ;// dRowOp0..3 overwrite dIn0..3 (same registers), which is safe
        ;// because every dIn read happens before the first dRowOp write.
151
152        VADD    dRowSum1,dIn0,dIn1
153        VADD    dRowSum2,dIn2,dIn3
154        VSUB    dRowDiff1,dIn0,dIn1
155        LDRSB   Shift, [pQPDivTable, QP]               ;// ARM CODE: Shift = pQPDivTable[QP]
156        VSUB    dRowDiff2,dIn2,dIn3
157        LDRSB   Scale, [pQPModTable, QP]               ;// ARM CODE: Scale = pQPModTable[QP]
158        VADD    dRowOp0,dRowSum1,dRowSum2
159        VSUB    dRowOp1,dRowSum1,dRowSum2
160        VSUB    dRowOp2,dRowDiff1,dRowDiff2
161        LSL     Scale, Scale, Shift                    ;// ARM CODE: Scale = Scale << Shift
162        VADD    dRowOp3,dRowDiff1,dRowDiff2
163
164        ;****************************************
165        ;// Transpose the resultant matrix
166        ;****************************************
        ;// Two 16-bit VTRNs swap elements inside each D-register pair, then
        ;// one 32-bit VTRN across Q0/Q1 completes the 4x4 transpose.
167
168        VTRN    dRowOp0,dRowOp1
169        VTRN    dRowOp2,dRowOp3
170        VTRN    qRowOp01,qRowOp23
171
172        ;****************************************
173        ;// Column Operations
174        ;****************************************
        ;// Same butterfly as above, applied to the transposed data; the
        ;// results again land in D0-D3.
175
176        VADD    dColSum1,dRowOp0,dRowOp1
177        VADD    dColSum2,dRowOp2,dRowOp3
178        VSUB    dColDiff1,dRowOp0,dRowOp1
179        VSUB    dColDiff2,dRowOp2,dRowOp3
180        VADD    dColOp0,dColSum1,dColSum2
181        VSUB    dColOp1,dColSum1,dColSum2
182        VSUB    dColOp2,dColDiff1,dColDiff2
183        VADD    dColOp3,dColDiff1,dColDiff2
184
185        ;//----------------------------------------------------------------------
186        ;//
187        ;// <Dequantize> improves on the c-reference code
188        ;// Both the  cases i.e., Shift>=0 and Shift<0 cases are covered together
189        ;// We do not subtract 2 from Shift as in C reference, instead perform a
190        ;// Scale << Shift once in the beginning and do a right shift by a
191        ;// constant 2 after the Multiplication. The value of Round would be 2
192        ;//
193        ;// By doing this we avoid the Branches required and also
194        ;// reduce the code size substantially
195        ;//
196        ;//----------------------------------------------------------------------
197
198
        ;// dScale is D5, i.e. the register dColSum2 lived in; its last read
        ;// was the VSUB producing dColOp1 above, so it is free to overwrite.
199        VDUP    dScale, Scale                            ;// ARM -> NEON  copy 'scale' to vector
200
201
        ;// qRound0 is Q3 (D6:D7), the registers dColDiff1/dColDiff2 lived in;
        ;// both were last read by the VADD producing dColOp3 above.
202        VMOV    qRound0,#2                               ;// Set the Round Value
203        VMOV    qRound1,#2
204        VMOV    qRound2,#2
205        VMOV    qRound3,#2
206
        ;// Widening multiply-accumulate: 16-bit lanes times 16-bit scale,
        ;// added to the 32-bit lanes that already hold the rounding value.
207        VMLAL   qRound0,dColOp0,dScale                   ;// pDst[i] * Scale + Round
208        VMLAL   qRound1,dColOp1,dScale
209        VMLAL   qRound2,dColOp2,dScale
210        VMLAL   qRound3,dColOp3,dScale
211
        ;// Narrowing shift: each 32-bit lane is shifted right by 2 and its low
        ;// 16 bits are kept (no saturation is applied by VSHRN).
212        VSHRN   dOut0,qRound0,#2                          ;// Right shift by 2 & (OMX_S16)Value
213        VSHRN   dOut1,qRound1,#2
214        VSHRN   dOut2,qRound2,#2
215        VSHRN   dOut3,qRound3,#2
216
217        ;***************************
218        ;// Store all the 4x4 values
219        ;***************************
        ;// VST1 writes D0..D3 back-to-back (no interleave) to the same address
        ;// the block was loaded from.
220
221        VST1  {dOut0,dOut1,dOut2,dOut3}, [pData]
222
223
224        ;// Set return value
        ;// (nothing is set here; see the header note on r0)
225
226        ;// Write function tail
227        M_END
228
229    ENDIF                                                           ;//CORTEXA8
230
231
232
233;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
234
235;//Input Registers
236ppSrc               RN  0                   ;// First argument; still in r0 when armVCM4P10_UnpackBlock4x4 is called
237pDst                RN  1                   ;// Second argument; still in r1 when armVCM4P10_UnpackBlock4x4 is called
238QPR2                RN  2                   ;// Third argument (QP) as it arrives in r2
239
240;//Output Registers
241result              RN  0                   ;// Status code returned to the caller
242
243;//Local Scratch Registers
244pDstR4              RN  4                   ;// Copy of pDst kept in r4 across the first call
245pDstR0              RN  0                   ;// pDst moved into r0 as first argument of the second call
246QPR1                RN  1                   ;// QP moved into r1 as second argument of the second call
247QPR5                RN  5                   ;// Copy of QP kept in r5 across the first call
248
249;// Guarding implementation by the processor name
250
251    IF CortexA8
252
253    ;// Allocate stack memory required by the function
254
255
256    ;// Write function header
    ;//
    ;// omxVCM4P10_TransformDequantLumaDCFromPair
    ;//   1. Calls armVCM4P10_UnpackBlock4x4 with r0 (ppSrc) and r1 (pDst)
    ;//      exactly as received from the caller.
    ;//   2. Calls armVCM4P10_InvTransformDequantLumaDC4x4 with r0 = pDst and
    ;//      r1 = QP, which processes the block at pDst in place.
    ;//   3. Returns OMX_Sts_NoErr unconditionally.
    ;//
    ;// In:    r0 = ppSrc, r1 = pDst, r2 = QP
    ;// Out:   r0 = OMX_Sts_NoErr
    ;// No argument checks are performed in this body.
    ;// NOTE(review): M_START/M_END are macros from armCOMM_s.h (not visible
    ;//   here); the "r5" argument presumably requests save/restore of the
    ;//   callee-saved registers up to r5 (plus the return address, needed
    ;//   because of the BLs below) - confirm against the macro.
257        M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5
258
        ;// pDst and QP are needed again after the first call, so they are
        ;// parked in r4/r5 before r1/r2 are handed to the callee.
259        MOV     pDstR4,pDst                         ;// Saving register r1
260        MOV     QPR5,QPR2                           ;// Saving register r2
261        BL      armVCM4P10_UnpackBlock4x4
262
        ;// Rebuild the argument registers for the static transform routine.
263        MOV     pDstR0,pDstR4                       ;// Setting up register r0
264        MOV     QPR1,QPR5                           ;// Setting up register r1
265        BL      armVCM4P10_InvTransformDequantLumaDC4x4
266
267
268        ;// Set return value
269        MOV     result,#OMX_Sts_NoErr
270
271        ;// Write function tail
272        M_END
273
274
275    ENDIF                                                           ;//CortexA8
276
277
278    END
279