omxVCM4P10_TransformDequantLumaDCFromPair_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   9641
21;// Date:       Thursday, February 7, 2008
22;//
23;//
24;//
25;//
26;// Description:
27;// H.264 inverse quantize and transform module
28;//
29;//
30
31;// Include standard headers
32
33        INCLUDE omxtypes_s.h
34        INCLUDE armCOMM_s.h
35
36;// Import/Export symbols required from/to other files
37;// (For example tables)
38
39        IMPORT armVCM4P10_UnpackBlock4x4
40        IMPORT armVCM4P10_QPDivTable
41        IMPORT armVCM4P10_VMatrixQPModTable
42
43        M_VARIANTS ARM1136JS
44
45;// Set debugging level
46;//DEBUG_ON    SETL {TRUE}
47
48
49;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
50
51
52;// Guarding implementation by the processor name
53
54    IF  ARM1136JS
55
56
57;// Input registers of armVCM4P10_InvTransformDequantLumaDC4x4
58pData               RN  0
59QP                  RN  1
60
61;// Output registers: none are declared for this routine
62
63
64;// Local scratch registers
;// NOTE: the alias groups below deliberately overlap on the same physical
;// registers. Each group names the values of one processing phase, so an
;// alias is only meaningful while that phase is live.
65
66;// Packed input pixels (two 16-bit values per register)
67in00                RN  2                   ;// Src[0] & Src[1]
68in02                RN  3                   ;// Src[2] & Src[3]
69in10                RN  4                   ;// Src[4] & Src[5]
70in12                RN  5                   ;// Src[6] & Src[7]
71in20                RN  6                   ;// Src[8] & Src[9]
72in22                RN  7                   ;// Src[10] & Src[11]
73in30                RN  8                   ;// Src[12] & Src[13]
74in32                RN  9                   ;// Src[14] & Src[15]
75
76;// Transposed input, used by the row operations (rows become columns)
77trRow00             RN  2
78trRow10             RN  10
79trRow02             RN  3
80trRow12             RN  5
81trRow20             RN  11
82trRow30             RN  12
83trRow32             RN  14
84trRow22             RN  7
85
86;// Intermediate sums/differences of the row pass
87rowSum1             RN  4
88rowSum2             RN  6
89rowDiff1            RN  8
90rowDiff2            RN  9
91
92
93;// Results of the row pass (same registers as the trRow aliases)
94rowOp00             RN  2
95rowOp10             RN  10
96rowOp20             RN  11
97rowOp30             RN  12
98rowOp02             RN  3
99rowOp12             RN  5
100rowOp22             RN  7
101rowOp32             RN  14
102
103;// Transposed row-pass results, used by the column operations
104trCol00             RN  2
105trCol02             RN  3
106trCol10             RN  4
107trCol12             RN  5
108trCol20             RN  6
109trCol22             RN  7
110trCol30             RN  8
111trCol32             RN  9
112
113;// Intermediate sums/differences of the column pass
114colSum1             RN  10
115colSum2             RN  11
116colDiff1            RN  12
117colDiff2            RN  14
118
119
120;// Results of the column pass (same registers as the trCol aliases)
121colOp00             RN  2
122colOp02             RN  3
123colOp10             RN  4
124colOp12             RN  5
125colOp20             RN  6
126colOp22             RN  7
127colOp30             RN  8
128colOp32             RN  9
129
130;// Temporary scratch variables for the dequantisation stage
131pQPDivTable         RN  0                   ;// same register as pData and Round
132pQPModTable         RN  11                  ;// same register as colSum2 and temp2
133Shift               RN  10                  ;// same register as colSum1 and temp1
134Scale               RN  14                  ;// same register as colDiff2
135Round               RN  0                   ;// same register as pData and pQPDivTable
136
137temp1               RN  10
138temp2                RN  11
139temp3               RN  12
140temp4               RN  1                   ;// same register as QP
141
142
143
144;// Inverse-transformed and dequantised pixels (same registers as colOp/in)
145out00               RN  2
146out02               RN  3
147out10               RN  4
148out12               RN  5
149out20               RN  6
150out22               RN  7
151out30               RN  8
152out32               RN  9
153
154
155
156
;//-------------------------------------------------------------------------
;// armVCM4P10_InvTransformDequantLumaDC4x4
;//
;// In-place transform and dequantisation of a 4x4 block of 16-bit values.
;// Two passes (rows, then columns) each turn every 4-element vector
;// (c0,c1,c2,c3) into
;//     c0+c1+c2+c3,  c0+c1-c2-c3,  c0-c1-c2+c3,  c0-c1+c2-c3
;// using packed 16-bit adds/subtracts. Every result x is then scaled as
;//     (x * (Scale << Shift) + 2) >> 2        (kept as a 16-bit value)
;//
;// In:    r0 (pData) = address of the block; 8 words are loaded from it and
;//                     8 words are stored back to the same address
;//        r1 (QP)    = byte index into armVCM4P10_QPDivTable (gives Shift)
;//                     and armVCM4P10_VMatrixQPModTable (gives Scale)
;// Out:   block at pData is overwritten; no return value is loaded into r0
;// Uses:  r0-r12 and r14 are all written by this routine. M_START/M_END are
;//        macros from armCOMM_s.h; presumably they save and restore r4-r11
;//        and lr -- verify against the macro definitions.
;// Stack: one slot (pDataOnStack) that holds pData while r0 is reused
;//-------------------------------------------------------------------------
157    ;// Allocate stack memory required by the function
158        M_ALLOC4    pDataOnStack, 4
159
160    ;// Write function header
161        M_START armVCM4P10_InvTransformDequantLumaDC4x4,r11
162
163        ;******************************************************************
164        ;// The strategy used in implementing the transform is as follows:*
165        ;// Load the 4x4 block into 8 registers                           *
166        ;// Transpose the 4x4 matrix                                      *
167        ;// Perform the row operations (on columns) using SIMD            *
168        ;// Transpose the 4x4 result matrix                               *
169        ;// Perform the column operations, then dequantise                *
170        ;// Store the 4x4 block at one go                                 *
171        ;******************************************************************
172
173        ;// Load all 16 values of the 4x4 block (two per register)
174
175        LDMIA   pData,{in00,in02,in10,in12,in20,in22,in30,in32}
176
177        ;//*****************************************************************
178        ;//
179        ;// Transpose the matrix in order to perform row ops as column ops
180        ;// Input:   in[][] = original matrix
181        ;// Output:  trRow[][]= transposed matrix
182        ;// Step1: Obtain the LL part of the transposed matrix
183        ;// Step2: Obtain the HL part
184        ;// Step3: Obtain the LH part
185        ;// Step4: Obtain the HH part
186        ;//
187        ;// In the per-instruction comments below [hi : lo] lists the source
188        ;// elements that end up in the top and bottom halfword of the result.
        ;// Each pair writes the fresh register first, because the second
        ;// instruction overwrites one of the pair's own inputs.
        ;//*****************************************************************
188
189        ;// LL 2x2 transposed matrix
190        ;//   d0 d1 - -
191        ;//   d4 d5 - -
192        ;//   -  -  - -
193        ;//   -  -  - -
194
195        PKHTB   trRow10,in10,in00,ASR #16               ;// trRow10 = [Src5 : Src1]
196        PKHBT   trRow00,in00,in10,LSL #16               ;// trRow00 = [Src4 : Src0]
197
198        ;// HL 2x2 transposed matrix
199        ;//    -   -   - -
200        ;//    -   -   - -
201        ;//    d8  d9  - -
202        ;//   d12 d13  - -
203
204
205         PKHTB   trRow30,in12,in02,ASR #16              ;// trRow30 = [Src7 : Src3]
206         PKHBT   trRow20,in02,in12,LSL #16              ;// trRow20 = [Src6 : Src2]
207
208        ;// LH 2x2 transposed matrix
209        ;//   - - d2 d3
210        ;//   - - d6 d7
211        ;//   - - -  -
212        ;//   - - -  -
213
214        PKHBT   trRow02,in20,in30,LSL #16               ;// trRow02 = [Src12 : Src8]
215        PKHTB   trRow12,in30,in20,ASR #16               ;// trRow12 = [Src13 : Src9]
216
217
218
219
220        ;// HH 2x2 transposed matrix
221        ;//    - -   -   -
222        ;//    - -   -   -
223        ;//    - -  d10 d11
224        ;//    - -  d14 d15
225
226        PKHTB   trRow32,in32,in22,ASR #16               ;// trRow32 = [Src15 : Src11]
227        PKHBT   trRow22,in22,in32,LSL #16               ;// trRow22 = [Src14 : Src10]
228
229
230        ;****************************************
231        ;// Row Operations (Performed on columns)
232        ;****************************************
233
234
235        ;// SIMD operations on first two columns(two rows of the original matrix)
        ;// c0..c3 = trRow00, trRow10, trRow20, trRow30; each halfword lane is
        ;// one original row (bottom = row 0, top = row 1)
236
237        SADD16      rowSum1,trRow00,trRow10                ;// (c0+c1)
238        SADD16      rowSum2,trRow20,trRow30                ;// (c2+c3)
239        SSUB16      rowDiff1,trRow00,trRow10               ;// (c0-c1)
240        SSUB16      rowDiff2,trRow20,trRow30               ;// (c2-c3)
241        SADD16      rowOp00,rowSum1,rowSum2                ;// (c0+c1+c2+c3)
242        SSUB16      rowOp10,rowSum1,rowSum2                ;// (c0+c1-c2-c3)
243        SSUB16      rowOp20,rowDiff1,rowDiff2              ;// (c0-c1-c2+c3)
244        SADD16      rowOp30,rowDiff1,rowDiff2              ;// (c0-c1+c2-c3)
245
246
247        ;// SIMD operations on next two columns(next two rows of the original matrix)
        ;// c0..c3 = trRow02, trRow12, trRow22, trRow32 (bottom = row 2, top = row 3)
248
249        SADD16      rowSum1,trRow02,trRow12                ;// (c0+c1)
250        SADD16      rowSum2,trRow22,trRow32                ;// (c2+c3)
251        SSUB16      rowDiff1,trRow02,trRow12               ;// (c0-c1)
252        SSUB16      rowDiff2,trRow22,trRow32               ;// (c2-c3)
253        SADD16      rowOp02,rowSum1,rowSum2                ;// (c0+c1+c2+c3)
254        SSUB16      rowOp12,rowSum1,rowSum2                ;// (c0+c1-c2-c3)
255        SSUB16      rowOp22,rowDiff1,rowDiff2              ;// (c0-c1-c2+c3)
256        SADD16      rowOp32,rowDiff1,rowDiff2              ;// (c0-c1+c2-c3)
257
258
259
260        ;*****************************************************************
261        ;// Transpose the resultant matrix
262        ;// Input:  rowOp[][]
263        ;// Output: trCol[][]
        ;// Afterwards the data is packed row-major again: trColN0 holds
        ;// columns 0-1 of row N and trColN2 holds columns 2-3 of row N.
264        ;*****************************************************************
265
266        ;// LL 2x2 transposed matrix
267        ;//   d0 d1 - -
268        ;//   d4 d5 - -
269        ;//   -  -  - -
270        ;//   -  -  - -
271
272        PKHTB   trCol10,rowOp10,rowOp00,ASR #16           ;// trCol10 = [top(rowOp10) : top(rowOp00)]
273        PKHBT   trCol00,rowOp00,rowOp10,LSL #16           ;// trCol00 = [bot(rowOp10) : bot(rowOp00)]
274
275        ;// HL 2x2 transposed matrix
276        ;//    -   -   - -
277        ;//    -   -   - -
278        ;//    d8  d9  - -
279        ;//   d12 d13  - -
280
281
282         PKHTB   trCol30,rowOp12,rowOp02,ASR #16          ;// trCol30 = [top(rowOp12) : top(rowOp02)]
283         PKHBT   trCol20,rowOp02,rowOp12,LSL #16          ;// trCol20 = [bot(rowOp12) : bot(rowOp02)]
284
285        ;// LH 2x2 transposed matrix
286        ;//   - - d2 d3
287        ;//   - - d6 d7
288        ;//   - - -  -
289        ;//   - - -  -
290
291        PKHBT   trCol02,rowOp20,rowOp30,LSL #16           ;// trCol02 = [bot(rowOp30) : bot(rowOp20)]
292        PKHTB   trCol12,rowOp30,rowOp20,ASR #16           ;// trCol12 = [top(rowOp30) : top(rowOp20)]
293
294
295
296
297        ;// HH 2x2 transposed matrix
298        ;//    - -   -   -
299        ;//    - -   -   -
300        ;//    - -  d10 d11
301        ;//    - -  d14 d15
302
303        PKHTB   trCol32,rowOp32,rowOp22,ASR #16            ;// trCol32 = [top(rowOp32) : top(rowOp22)]
304        PKHBT   trCol22,rowOp22,rowOp32,LSL #16            ;// trCol22 = [bot(rowOp32) : bot(rowOp22)]
305
306
307        ;*******************************
308        ;// Column Operations
309        ;*******************************
310
311        ;//--------------------------------------------------------------------------------------
312        ;// Store pData(RN0) on stack and restore it only at the final store back
313        ;// This frees up a register (RN0) which is used to reduce number of intermediate stalls
314        ;//--------------------------------------------------------------------------------------
315        M_STR       pData,pDataOnStack
316
317
318        ;// SIMD operations on first two columns(two rows of the original matrix)
        ;// c0..c3 = trCol00, trCol10, trCol20, trCol30 (columns 0 and 1)
319
320        SADD16      colSum1,trCol00,trCol10                ;// (c0+c1)
321        SADD16      colSum2,trCol20,trCol30                ;// (c2+c3)
322        SSUB16      colDiff1,trCol00,trCol10               ;// (c0-c1)
323        SSUB16      colDiff2,trCol20,trCol30               ;// (c2-c3)
324        SADD16      colOp00,colSum1,colSum2                ;// (c0+c1+c2+c3)
325        SSUB16      colOp10,colSum1,colSum2                ;// (c0+c1-c2-c3)
326        SSUB16      colOp20,colDiff1,colDiff2              ;// (c0-c1-c2+c3)
327        SADD16      colOp30,colDiff1,colDiff2              ;// (c0-c1+c2-c3)
328
329
330        ;// SIMD operations on next two columns(next two rows of the original matrix)
        ;// c0..c3 = trCol02, trCol12, trCol22, trCol32 (columns 2 and 3).
        ;// The table loads are interleaved with the arithmetic, presumably to
        ;// hide load latency; each one targets a register whose previous
        ;// value has already been consumed.
331
332        LDR         pQPDivTable, =armVCM4P10_QPDivTable    ;// QP Division look-up-table base pointer (r0 is free: pData is on the stack)
333        SADD16      colSum1,trCol02,trCol12                ;// (c0+c1)
334        SADD16      colSum2,trCol22,trCol32                ;// (c2+c3)
335        SSUB16      colDiff1,trCol02,trCol12               ;// (c0-c1)
336        SSUB16      colDiff2,trCol22,trCol32               ;// (c2-c3)
337        SADD16      colOp02,colSum1,colSum2                ;// (c0+c1+c2+c3)
338        SSUB16      colOp12,colSum1,colSum2                ;// (c0+c1-c2-c3)
339        LDR         pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer (r11: colSum2 is dead here)
340        LDRSB       Shift, [pQPDivTable, QP]               ;// Shift = pQPDivTable[QP] (r10: colSum1 is dead here)
341        SSUB16      colOp22,colDiff1,colDiff2              ;// (c0-c1-c2+c3)
342        SADD16      colOp32,colDiff1,colDiff2              ;// (c0-c1+c2-c3)
343
344
345        LDRSB       Scale, [pQPModTable, QP]               ;// Scale = pQPModTable[QP] (r14: colDiff2 is dead here; last use of QP)
346
347        ;//----------------------------------------------------------------------
348        ;//
349        ;// <Dequantize> improves on the c-reference code
350        ;// Both the  cases i.e., Shift>=0 and Shift<0 cases are covered together
351        ;// We do not subtract 2 from Shift as in C reference, instead perform a
352        ;// Scale << Shift once in the beginning and do a right shift by a
353        ;// constant 2 after the Multiplication. The value of Round would be 2
354        ;//
355        ;// By doing this we avoid the Branches required and also
356        ;// reduce the code size substantially
357        ;//
        ;// NOTE(review): SMLABB/SMLATB read only the bottom halfword of Scale,
        ;// so (Scale << Shift) is assumed to fit in a signed 16-bit value --
        ;// confirm against the two look-up tables.
        ;//
        ;// For each output register the bottom halfword is finished with an
        ;// explicit ASR #2, while the top halfword gets its >>2 for free:
        ;// PKHBT ... LSL #14 places bits [17:2] of the product in the top half.
358        ;//----------------------------------------------------------------------
359
360        MOV         Round, #2                               ;// Round = 2 (r0: pQPDivTable is no longer needed)
361        LSL         Scale, Scale, Shift                     ;// Scale = Scale << Shift
362
363
364        ;// Row 1 (temp4 is r1: QP is no longer needed from here on)
365        SMLABB  temp1, colOp00, Scale, Round                ;// temp1 = bot(colOp00) * Scale + Round
366        SMLABB  temp3, colOp02, Scale, Round                ;// temp3 = bot(colOp02) * Scale + Round
367        SMLATB  temp2, colOp00, Scale, Round                ;// temp2 = top(colOp00) * Scale + Round
368        SMLATB  temp4, colOp02, Scale, Round                ;// temp4 = top(colOp02) * Scale + Round
369
370        ASR     temp1, temp1, #2                            ;// temp1 = temp1 >> 2
371        ASR     temp3, temp3, #2                            ;// temp3 = temp3 >> 2
372        PKHBT   out00,  temp1, temp2, LSL #14               ;// out00 = | temp2 >> 2 | temp1 |
373        PKHBT   out02,  temp3, temp4, LSL #14               ;// out02 = | temp4 >> 2 | temp3 |
374
375
376        ;// Row 2
377        SMLABB  temp1, colOp10, Scale, Round                ;// temp1 = bot(colOp10) * Scale + Round
378        SMLABB  temp3, colOp12, Scale, Round                ;// temp3 = bot(colOp12) * Scale + Round
379        SMLATB  temp2, colOp10, Scale, Round                ;// temp2 = top(colOp10) * Scale + Round
380        SMLATB  temp4, colOp12, Scale, Round                ;// temp4 = top(colOp12) * Scale + Round
381
382        ASR     temp1, temp1, #2                            ;// temp1 = temp1 >> 2
383        ASR     temp3, temp3, #2                            ;// temp3 = temp3 >> 2
384        PKHBT   out10,  temp1, temp2, LSL #14               ;// out10 = | temp2 >> 2 | temp1 |
385        PKHBT   out12,  temp3, temp4, LSL #14               ;// out12 = | temp4 >> 2 | temp3 |
386
387        ;// Row 3
388        SMLABB  temp1, colOp20, Scale, Round                ;// temp1 = bot(colOp20) * Scale + Round
389        SMLABB  temp3, colOp22, Scale, Round                ;// temp3 = bot(colOp22) * Scale + Round
390        SMLATB  temp2, colOp20, Scale, Round                ;// temp2 = top(colOp20) * Scale + Round
391        SMLATB  temp4, colOp22, Scale, Round                ;// temp4 = top(colOp22) * Scale + Round
392
393        ASR     temp1, temp1, #2                            ;// temp1 = temp1 >> 2
394        ASR     temp3, temp3, #2                            ;// temp3 = temp3 >> 2
395        PKHBT   out20,  temp1, temp2, LSL #14               ;// out20 = | temp2 >> 2 | temp1 |
396        PKHBT   out22,  temp3, temp4, LSL #14               ;// out22 = | temp4 >> 2 | temp3 |
397
398        ;// Row 4
399        SMLABB  temp1, colOp30, Scale, Round                ;// temp1 = bot(colOp30) * Scale + Round
400        SMLABB  temp3, colOp32, Scale, Round                ;// temp3 = bot(colOp32) * Scale + Round
401        SMLATB  temp2, colOp30, Scale, Round                ;// temp2 = top(colOp30) * Scale + Round
402        SMLATB  temp4, colOp32, Scale, Round                ;// temp4 = top(colOp32) * Scale + Round
403
404        M_LDR   pData,pDataOnStack                          ;// Restore pData pointer from stack (Round in r0 is no longer needed)
405        ASR     temp1, temp1, #2                            ;// temp1 = temp1 >> 2
406        ASR     temp3, temp3, #2                            ;// temp3 = temp3 >> 2
407        PKHBT   out30,  temp1, temp2, LSL #14               ;// out30 = | temp2 >> 2 | temp1 |
408        PKHBT   out32,  temp3, temp4, LSL #14               ;// out32 = | temp4 >> 2 | temp3 |
409
410
411
412        ;***************************
413        ;// Store all 16 values of the 4x4 block back over the input
414        ;***************************
415
        ;// (this label is not referenced anywhere in this file)
416store_coeff
417
418        STMIA   pData,{out00,out02,out10,out12,out20,out22,out30,out32}
419
420
421
422        ;// Set return value (none: r0 is not loaded with a status here)
423
424
425        ;// Write function tail
426        M_END
427
428    ENDIF                                                           ;//ARM1136JS
429
430
431;// End of static function: armVCM4P10_InvTransformDequantLumaDC4x4
432
433;// Guarding implementation by the processor name
434
435
436
437
438;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
;//
;// Public entry point. Chains two helpers and reports success:
;//   1. armVCM4P10_UnpackBlock4x4 is called with r0 (ppSrc) and r1 (pDst)
;//      exactly as they arrived from the caller
;//   2. armVCM4P10_InvTransformDequantLumaDC4x4 is called with
;//      r0 = pDst and r1 = QP
;//   3. r0 = OMX_Sts_NoErr is returned
;//
;// pDst and QP are parked in r4/r5 across the first call; those registers
;// are assumed to be preserved by the callee -- verify against
;// armVCM4P10_UnpackBlock4x4.
439
440;// Arguments as received from the caller
441ppSrc               RN  0
442pDst                RN  1
443QPR2                RN  2
444
445;// Return value
446result              RN  0
447
448;// Private scratch aliases
449pDstSaved           RN  4                   ;// pDst parked across the first call
450QPSaved             RN  5                   ;// QP parked across the first call
451pBlockArg           RN  0                   ;// 1st argument of the second call
452QPArg               RN  1                   ;// 2nd argument of the second call
453
454;// Only assembled for the ARM1136JS variant
455
456    IF ARM1136JS
457
458    ;// No stack locals are needed by this function
459
460
461    ;// Function prologue (M_START is a macro from armCOMM_s.h)
462        M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5
463
464        MOV     QPSaved,QPR2                        ;// keep QP (arrived in r2)
465        MOV     pDstSaved,pDst                      ;// keep pDst (arrived in r1)
466        BL      armVCM4P10_UnpackBlock4x4
467
468        MOV     QPArg,QPSaved                       ;// r1 = QP
469        MOV     pBlockArg,pDstSaved                 ;// r0 = pDst
470        BL      armVCM4P10_InvTransformDequantLumaDC4x4
471
472
473        ;// Report success unconditionally
474        MOV     result,#OMX_Sts_NoErr
475
476        ;// Function epilogue
477        M_END
478
479
480    ENDIF                                                           ;//ARM1136JS
481
482
483    END
484