1;//
2;//
3;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
4;// OpenMAX DL: v1.0.2
5;// Revision:   12290
6;// Date:       Wednesday, April 9, 2008
7;//
8;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
9;//
10;//
11;//
12;// Description:
13;// H.264 inverse quantize and transform module
14;//
15;//
16
17;// Include standard headers
18
19        INCLUDE omxtypes_s.h
20        INCLUDE armCOMM_s.h
21
22;// Import/Export symbols required from/to other files
23;// (For example tables)
24
25        IMPORT armVCM4P10_UnpackBlock4x4
26        IMPORT armVCM4P10_QPDivTable
27        IMPORT armVCM4P10_VMatrixQPModTable
28
29        M_VARIANTS CortexA8                    ;// NOTE(review): presumably declares the CortexA8 build variant tested by the IF guards below (macro from armCOMM_s.h) - confirm
30
31;// Set debugging level
32;//DEBUG_ON    SETL {TRUE}
33
34
35;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
36;//
37;// Inverse-transforms and dequantizes, in place, the 4x4 block of
38;// 16-bit values at pData (r0) using the quantization parameter QP (r1).
39
40
41
42
43
44;// Guarding implementation by the processor name
45
46    IF  CortexA8
47
48;// Input registers
49pData               RN  0                       ;// r0: address of the 4x4 block of 16-bit values; loaded from and stored back to
50QP                  RN  1                       ;// r1: byte index into both lookup tables
51
52
53;// Local scratch registers
54
55;// ARM registers
56
57pQPDivTable         RN  2                       ;// r2: base address of armVCM4P10_QPDivTable
58pQPModTable         RN  3                       ;// r3: base address of armVCM4P10_VMatrixQPModTable
59Shift               RN  4                       ;// r4: signed byte read from pQPDivTable[QP]
60Scale               RN  5                       ;// r5: signed byte read from pQPModTable[QP], then shifted left by Shift
61
62;// NEON registers
63;// Several names below alias the same physical registers; each group is live in a different stage of the routine.
64;// Packed input values, one D register (four signed 16-bit lanes) each
65dIn0                DN  D0.S16
66dIn1                DN  D1.S16
67dIn2                DN  D2.S16
68dIn3                DN  D3.S16
69
70;// Intermediate sums/differences of the first (row) pass
71dRowSum1            DN  D4.S16
72dRowSum2            DN  D5.S16
73dRowDiff1           DN  D6.S16
74dRowDiff2           DN  D7.S16
75
76;// Results of the row pass; they overwrite the inputs in D0-D3
77dRowOp0             DN  D0.S16
78dRowOp1                DN  D1.S16
79dRowOp2                DN  D2.S16
80dRowOp3                DN  D3.S16
81qRowOp01            QN  Q0.32                   ;// D0:D1 viewed as 32-bit elements for the transpose
82qRowOp23            QN  Q1.32                   ;// D2:D3 viewed as 32-bit elements for the transpose
83
84;// Intermediate sums/differences of the second (column) pass; reuse D4-D7
85dColSum1            DN  D4.S16
86dColSum2            DN  D5.S16
87dColDiff1           DN  D6.S16
88dColDiff2           DN  D7.S16
89
90;// Results of the column pass; again held in D0-D3
91dColOp0             DN  D0.S16
92dColOp1                DN  D1.S16
93dColOp2                DN  D2.S16
94dColOp3                DN  D3.S16
95
96;// Temporary scratch variables. dScale reuses D5 and qRound0 (Q3) overlaps D6/D7,
97;// so they are written only after the column pass has finished with those registers.
98dScale              DN  D5.S16
99qRound0             QN  Q3.S32
100qRound1             QN  Q4.S32
101qRound2             QN  Q5.S32
102qRound3             QN  Q6.S32
103
104;// Inverse-transformed and dequantized output values (D0-D3, stored back to pData)
105dOut0               DN  D0.S16
106dOut1                DN  D1.S16
107dOut2                DN  D2.S16
108dOut3                DN  D3.S16
109
110
111    ;// Allocate stack memory required by the function (none is allocated here)
112
113
114    ;// Write function header
115    M_START armVCM4P10_InvTransformDequantLumaDC4x4,r5,d13
116    ;// NOTE(review): the r5,d13 arguments presumably tell M_START which registers to preserve (r4-r5 and Q4-Q6 = D8-D13 are written below) - confirm in armCOMM_s.h
117    ;******************************************************************
118    ;// The strategy used in implementing the transform is as follows:*
119    ;// Load the 4x4 block into 4 D-registers                         *
120    ;// Transpose the 4x4 matrix                                      *
121    ;// Perform the row operations (on columns) using SIMD            *
122    ;// Transpose the 4x4 result matrix                               *
123    ;// Perform the column operations                                 *
124    ;******************************************************************
125
126        ;// Load all 16 values in transposed form: VLD4 de-interleaves, so dInN receives elements N, N+4, N+8 and N+12
127
128        VLD4    {dIn0,dIn1,dIn2,dIn3},[pData]
129        LDR     pQPDivTable, =armVCM4P10_QPDivTable        ;// QP Division look-up-table base pointer
130        LDR     pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer
131
132        ;****************************************
133        ;// Row Operations (Performed on columns)
134        ;****************************************
135        ;// Scale factor calculation is done using ARM instructions
136        ;// interleaved with NEON instructions in order to dual issue
137
138        VADD    dRowSum1,dIn0,dIn1                     ;// RowSum1  = In0 + In1
139        VADD    dRowSum2,dIn2,dIn3                     ;// RowSum2  = In2 + In3
140        VSUB    dRowDiff1,dIn0,dIn1                    ;// RowDiff1 = In0 - In1
141        LDRSB   Shift, [pQPDivTable, QP]               ;// ARM CODE: Shift = pQPDivTable[QP]
142        VSUB    dRowDiff2,dIn2,dIn3                    ;// RowDiff2 = In2 - In3
143        LDRSB   Scale, [pQPModTable, QP]               ;// ARM CODE: Scale = pQPModTable[QP]
144        VADD    dRowOp0,dRowSum1,dRowSum2              ;// RowOp0 = RowSum1  + RowSum2
145        VSUB    dRowOp1,dRowSum1,dRowSum2              ;// RowOp1 = RowSum1  - RowSum2
146        VSUB    dRowOp2,dRowDiff1,dRowDiff2            ;// RowOp2 = RowDiff1 - RowDiff2
147        LSL     Scale, Scale, Shift                    ;// ARM CODE: Scale = Scale << Shift
148        VADD    dRowOp3,dRowDiff1,dRowDiff2            ;// RowOp3 = RowDiff1 + RowDiff2
149
150        ;****************************************
151        ;// Transpose the resultant matrix
152        ;****************************************
153
154        VTRN    dRowOp0,dRowOp1                        ;// transpose the 2x2 groups of 16-bit elements in D0/D1
155        VTRN    dRowOp2,dRowOp3                        ;// same for D2/D3
156        VTRN    qRowOp01,qRowOp23                      ;// exchange 32-bit elements between Q0 and Q1; completes the 4x4 transpose
157
158        ;****************************************
159        ;// Column Operations
160        ;****************************************
161
162        VADD    dColSum1,dRowOp0,dRowOp1               ;// ColSum1  = RowOp0 + RowOp1
163        VADD    dColSum2,dRowOp2,dRowOp3               ;// ColSum2  = RowOp2 + RowOp3
164        VSUB    dColDiff1,dRowOp0,dRowOp1              ;// ColDiff1 = RowOp0 - RowOp1
165        VSUB    dColDiff2,dRowOp2,dRowOp3              ;// ColDiff2 = RowOp2 - RowOp3
166        VADD    dColOp0,dColSum1,dColSum2              ;// ColOp0 = ColSum1  + ColSum2
167        VSUB    dColOp1,dColSum1,dColSum2              ;// ColOp1 = ColSum1  - ColSum2
168        VSUB    dColOp2,dColDiff1,dColDiff2            ;// ColOp2 = ColDiff1 - ColDiff2
169        VADD    dColOp3,dColDiff1,dColDiff2            ;// ColOp3 = ColDiff1 + ColDiff2
170
171        ;//----------------------------------------------------------------------
172        ;//
173        ;// <Dequantize> improves on the C reference code.
174        ;// The Shift>=0 and Shift<0 cases are both covered by one code path:
175        ;// we do not subtract 2 from Shift as in the C reference; instead we perform
176        ;// Scale << Shift once at the beginning and do a right shift by a
177        ;// constant 2 after the multiplication. The value of Round is therefore 2.
178        ;//
179        ;// By doing this we avoid the branches otherwise required and also
180        ;// reduce the code size substantially.
181        ;//
182        ;//----------------------------------------------------------------------
183
184        ;// From here on D5 holds dScale and Q3-Q6 hold the 32-bit accumulators; the column temporaries are dead
185        VDUP    dScale, Scale                            ;// ARM -> NEON  copy 'scale' to vector
186
187
188        VMOV    qRound0,#2                               ;// Set the Round Value
189        VMOV    qRound1,#2
190        VMOV    qRound2,#2
191        VMOV    qRound3,#2
192
193        VMLAL   qRound0,dColOp0,dScale                   ;// pDst[i] * Scale + Round
194        VMLAL   qRound1,dColOp1,dScale
195        VMLAL   qRound2,dColOp2,dScale
196        VMLAL   qRound3,dColOp3,dScale
197
198        VSHRN   dOut0,qRound0,#2                          ;// Right shift by 2 & (OMX_S16)Value
199        VSHRN   dOut1,qRound1,#2
200        VSHRN   dOut2,qRound2,#2
201        VSHRN   dOut3,qRound3,#2
202
203        ;***************************
204        ;// Store all the 4x4 values back over the input block
205        ;***************************
206
207        VST1  {dOut0,dOut1,dOut2,dOut3}, [pData]
208
209
210        ;// Set return value (nothing is set here)
211
212        ;// Write function tail
213        M_END
214
215    ENDIF                                                           ;//CORTEXA8
216
217
218
219;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
220;// Calls armVCM4P10_UnpackBlock4x4 with r0/r1 as received, then armVCM4P10_InvTransformDequantLumaDC4x4 on (pDst, QP); returns OMX_Sts_NoErr.
221;// Input registers
222ppSrc               RN  0                       ;// r0: passed through unchanged to armVCM4P10_UnpackBlock4x4
223pDst                RN  1                       ;// r1: destination block pointer
224QPR2                RN  2                       ;// r2: QP as received from the caller
225
226;// Output registers
227result              RN  0                       ;// r0: status code
228
229;// Local scratch registers
230pDstR4              RN  4                       ;// r4: copy of pDst kept across the first call
231pDstR0              RN  0                       ;// r0: pDst as first argument of the second call
232QPR1                RN  1                       ;// r1: QP as second argument of the second call
233QPR5                RN  5                       ;// r5: copy of QP kept across the first call
234
235;// Guarding implementation by the processor name
236
237    IF CortexA8
238
239    ;// Allocate stack memory required by the function (none is allocated here)
240
241
242    ;// Write function header
243        M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5
244        ;// NOTE(review): the r5 argument presumably tells M_START to preserve r4-r5, which are overwritten below - confirm in armCOMM_s.h
245        MOV     pDstR4,pDst                         ;// Saving register r1
246        MOV     QPR5,QPR2                           ;// Saving register r2
247        BL      armVCM4P10_UnpackBlock4x4
248
249        MOV     pDstR0,pDstR4                       ;// Setting up register r0
250        MOV     QPR1,QPR5                           ;// Setting up register r1
251        BL      armVCM4P10_InvTransformDequantLumaDC4x4
252
253
254        ;// Set return value
255        MOV     result,#OMX_Sts_NoErr
256
257        ;// Write function tail
258        M_END
259
260
261    ENDIF                                                           ;//CORTEXA8
262
263
264    END