;//
;//
;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;//

;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Import/Export symbols required from/to other files
;//   armVCM4P10_UnpackBlock4x4    - routine called (BL) by the public entry point
;//   armVCM4P10_QPDivTable        - byte table indexed by QP, gives the Shift value
;//   armVCM4P10_VMatrixQPModTable - byte table indexed by QP, gives the Scale value

        IMPORT armVCM4P10_UnpackBlock4x4
        IMPORT armVCM4P10_QPDivTable
        IMPORT armVCM4P10_VMatrixQPModTable

;// Processor variant this file is written for. The IF ARM1136JS guards in
;// this file test the same name.
;// NOTE(review): M_VARIANTS is not defined in this file; presumably it comes
;// from one of the headers included above - confirm.

        M_VARIANTS ARM1136JS

;// Set debugging level (uncomment the next line to enable)
;//DEBUG_ON    SETL {TRUE}

;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
;//
;// Description:
;//   In-place inverse transform and dequantisation of one 4x4 block of packed
;//   16-bit coefficients. The transform is a pure add/subtract butterfly,
;//   applied first along every row and then along every column. Each
;//   transformed value t is then scaled as
;//
;//       out = (t * (Scale << Shift) + 2) >> 2        (kept to 16 bits)
;//
;//   where Shift = armVCM4P10_QPDivTable[QP]          (signed byte load)
;//         Scale = armVCM4P10_VMatrixQPModTable[QP]   (signed byte load)
;//
;// Inputs:
;//   r0 (pData) - address of the block: 8 words (16 halfwords, row-major,
;//                element 2k in the bottom half of word k) are loaded from
;//                it and the 8 result words are stored back to it
;//   r1 (QP)    - byte index into the two tables named above
;//
;// Outputs:
;//   The block at pData is overwritten. No return value is set in r0.
;//
;// Registers / stack:
;//   Uses r0-r12 and r14 and one 4-byte stack slot (pDataOnStack).
;//   NOTE(review): M_START ...,r11 / M_END presumably save and restore
;//   r4-r11 and lr - confirm against the macro definitions (not in this file).


;// Guarding implementation by the processor name

    IF  ARM1136JS


;// --------------------------------------------------------------------------
;// Register aliases.
;// Several aliases deliberately share one physical register; the instruction
;// order in the function body relies on the earlier alias being dead before
;// the later one is written. Do not reorder instructions without re-checking.
;// --------------------------------------------------------------------------

;//Input Registers
pData               RN  0
QP                  RN  1

;//Output Registers


;//Local Scratch Registers

;// Packed Input pixels
in00                RN  2                   ;// Src[0] & Src[1]
in02                RN  3                   ;// Src[2] & Src[3]
in10                RN  4                   ;// Src[4] & Src[5]
in12                RN  5                   ;// Src[6] & Src[7]
in20                RN  6                   ;// Src[8] & Src[9]
in22                RN  7                   ;// Src[10] & Src[11]
in30                RN  8                   ;// Src[12] & Src[13]
in32                RN  9                   ;// Src[14] & Src[15]

;// Transpose for Row operations (Rows to cols)
trRow00             RN  2
trRow10             RN  10
trRow02             RN  3
trRow12             RN  5
trRow20             RN  11
trRow30             RN  12
trRow32             RN  14
trRow22             RN  7

;// Intermediate calculations
rowSum1             RN  4
rowSum2             RN  6
rowDiff1            RN  8
rowDiff2            RN  9


;// Row operated pixels
rowOp00             RN  2
rowOp10             RN  10
rowOp20             RN  11
rowOp30             RN  12
rowOp02             RN  3
rowOp12             RN  5
rowOp22             RN  7
rowOp32             RN  14

;// Transpose for column operations
trCol00             RN  2
trCol02             RN  3
trCol10             RN  4
trCol12             RN  5
trCol20             RN  6
trCol22             RN  7
trCol30             RN  8
trCol32             RN  9

;// Intermediate calculations
colSum1             RN  10
colSum2             RN  11
colDiff1            RN  12
colDiff2            RN  14


;// Column operated pixels
colOp00             RN  2
colOp02             RN  3
colOp10             RN  4
colOp12             RN  5
colOp20             RN  6
colOp22             RN  7
colOp30             RN  8
colOp32             RN  9

;// Temporary scratch variables
;// (pQPDivTable and Round share r0 with pData, which is why pData is parked
;//  on the stack during the column stage and the dequantisation)
pQPDivTable         RN  0
pQPModTable         RN  11
Shift               RN  10
Scale               RN  14
Round               RN  0

temp1               RN  10
temp2               RN  11
temp3               RN  12
temp4               RN  1                   ;// reuses QP once both table look-ups are done



;// InvTransformed and Dequantized pixels
out00               RN  2
out02               RN  3
out10               RN  4
out12               RN  5
out20               RN  6
out22               RN  7
out30               RN  8
out32               RN  9




    ;// Allocate stack memory required by the function
    ;// (one 4-byte slot used to park pData)
        M_ALLOC4    pDataOnStack, 4

    ;// Write function header
        M_START armVCM4P10_InvTransformDequantLumaDC4x4,r11

        ;******************************************************************
        ;// The strategy used in implementing the transform is as follows:*
        ;// Load the 4x4 block into 8 registers                           *
        ;// Transpose the 4x4 matrix                                      *
        ;// Perform the row operations (on columns) using SIMD            *
        ;// Transpose the 4x4 result matrix                               *
        ;// Perform the column operations                                 *
        ;// Dequantize and store the 4x4 block at one go                  *
        ;******************************************************************

        ;// Load all the 4x4 pixels (two 16-bit elements per register)

        LDMIA   pData,{in00,in02,in10,in12,in20,in22,in30,in32}

        ;//*****************************************************************
        ;//
        ;// Transpose the matrix in order to perform row ops as column ops
        ;// Input:   in[][]    = original matrix
        ;// Output:  trRow[][] = transposed matrix
        ;//
        ;// Notation below: [hi:lo] = source elements placed in the top and
        ;// bottom halfword of the destination register.
        ;// PKHBT Rd,Rn,Rm,LSL #16 gives [bottom(Rm):bottom(Rn)]
        ;// PKHTB Rd,Rn,Rm,ASR #16 gives [top(Rn):top(Rm)]
        ;//
        ;// After this step every register holds the same element position
        ;// of two different rows, so one SADD16/SSUB16 processes two rows.
        ;//
        ;//*****************************************************************

        ;// Elements 0 and 1 of rows 0 and 1

        PKHTB   trRow10,in10,in00,ASR #16               ;// trRow10 = [s5:s1]
        PKHBT   trRow00,in00,in10,LSL #16               ;// trRow00 = [s4:s0]

        ;// Elements 2 and 3 of rows 0 and 1

         PKHTB   trRow30,in12,in02,ASR #16              ;// trRow30 = [s7:s3]
         PKHBT   trRow20,in02,in12,LSL #16              ;// trRow20 = [s6:s2]

        ;// Elements 0 and 1 of rows 2 and 3
        ;// (r3 and r5 are free again: in02/in12 were consumed just above)

        PKHBT   trRow02,in20,in30,LSL #16               ;// trRow02 = [s12:s8]
        PKHTB   trRow12,in30,in20,ASR #16               ;// trRow12 = [s13:s9]

        ;// Elements 2 and 3 of rows 2 and 3

        PKHTB   trRow32,in32,in22,ASR #16               ;// trRow32 = [s15:s11]
        PKHBT   trRow22,in22,in32,LSL #16               ;// trRow22 = [s14:s10]


        ;****************************************
        ;// Row Operations (Performed on columns)
        ;// c0..c3 = the four elements of one row
        ;****************************************


        ;// SIMD operations on rows 0 and 1 of the original matrix

        SADD16      rowSum1,trRow00,trRow10                ;// (c0+c1)
        SADD16      rowSum2,trRow20,trRow30                ;// (c2+c3)
        SSUB16      rowDiff1,trRow00,trRow10               ;// (c0-c1)
        SSUB16      rowDiff2,trRow20,trRow30               ;// (c2-c3)
        SADD16      rowOp00,rowSum1,rowSum2                ;// (c0+c1+c2+c3)
        SSUB16      rowOp10,rowSum1,rowSum2                ;// (c0+c1-c2-c3)
        SSUB16      rowOp20,rowDiff1,rowDiff2              ;// (c0-c1-c2+c3)
        SADD16      rowOp30,rowDiff1,rowDiff2              ;// (c0-c1+c2-c3)


        ;// SIMD operations on rows 2 and 3 of the original matrix

        SADD16      rowSum1,trRow02,trRow12                ;// (c0+c1)
        SADD16      rowSum2,trRow22,trRow32                ;// (c2+c3)
        SSUB16      rowDiff1,trRow02,trRow12               ;// (c0-c1)
        SSUB16      rowDiff2,trRow22,trRow32               ;// (c2-c3)
        SADD16      rowOp02,rowSum1,rowSum2                ;// (c0+c1+c2+c3)
        SSUB16      rowOp12,rowSum1,rowSum2                ;// (c0+c1-c2-c3)
        SSUB16      rowOp22,rowDiff1,rowDiff2              ;// (c0-c1-c2+c3)
        SADD16      rowOp32,rowDiff1,rowDiff2              ;// (c0-c1+c2-c3)



        ;*****************************************************************
        ;// Transpose the resultant matrix
        ;// Input:  rowOp[][]
        ;// Output: trCol[][]
        ;//
        ;// Afterwards every register again holds two adjacent elements of
        ;// ONE row: trColR0 = row R, elements 0-1; trColR2 = row R,
        ;// elements 2-3.
        ;*****************************************************************

        ;// Rows 0 and 1, elements 0-1

        PKHTB   trCol10,rowOp10,rowOp00,ASR #16           ;// row 1, elements [1:0]
        PKHBT   trCol00,rowOp00,rowOp10,LSL #16           ;// row 0, elements [1:0]

        ;// Rows 2 and 3, elements 0-1

         PKHTB   trCol30,rowOp12,rowOp02,ASR #16          ;// row 3, elements [1:0]
         PKHBT   trCol20,rowOp02,rowOp12,LSL #16          ;// row 2, elements [1:0]

        ;// Rows 0 and 1, elements 2-3

        PKHBT   trCol02,rowOp20,rowOp30,LSL #16           ;// row 0, elements [3:2]
        PKHTB   trCol12,rowOp30,rowOp20,ASR #16           ;// row 1, elements [3:2]

        ;// Rows 2 and 3, elements 2-3

        PKHTB   trCol32,rowOp32,rowOp22,ASR #16            ;// row 3, elements [3:2]
        PKHBT   trCol22,rowOp22,rowOp32,LSL #16            ;// row 2, elements [3:2]


        ;*******************************
        ;// Column Operations
        ;// r0..r3 below = rows 0..3 of the row-transformed matrix
        ;*******************************

        ;//--------------------------------------------------------------------------------------
        ;// Store pData(RN0) on stack and restore it only at the final store back
        ;// This frees up a register (RN0) which is used to reduce number of intermediate stalls
        ;//--------------------------------------------------------------------------------------
        M_STR       pData,pDataOnStack


        ;// SIMD operations on elements 0-1 of every row

        SADD16      colSum1,trCol00,trCol10                ;// (r0+r1)
        SADD16      colSum2,trCol20,trCol30                ;// (r2+r3)
        SSUB16      colDiff1,trCol00,trCol10               ;// (r0-r1)
        SSUB16      colDiff2,trCol20,trCol30               ;// (r2-r3)
        SADD16      colOp00,colSum1,colSum2                ;// (r0+r1+r2+r3)
        SSUB16      colOp10,colSum1,colSum2                ;// (r0+r1-r2-r3)
        SSUB16      colOp20,colDiff1,colDiff2              ;// (r0-r1-r2+r3)
        SADD16      colOp30,colDiff1,colDiff2              ;// (r0-r1+r2-r3)


        ;// SIMD operations on elements 2-3 of every row, interleaved with the
        ;// table look-ups. The look-ups overwrite r11 (colSum2) and r10
        ;// (colSum1), so they are placed after the last use of those sums;
        ;// Scale (r14 = colDiff2) is loaded only after colOp32 is computed.

        LDR         pQPDivTable, =armVCM4P10_QPDivTable    ;// QP Division look-up-table base pointer
        SADD16      colSum1,trCol02,trCol12                ;// (r0+r1)
        SADD16      colSum2,trCol22,trCol32                ;// (r2+r3)
        SSUB16      colDiff1,trCol02,trCol12               ;// (r0-r1)
        SSUB16      colDiff2,trCol22,trCol32               ;// (r2-r3)
        SADD16      colOp02,colSum1,colSum2                ;// (r0+r1+r2+r3)
        SSUB16      colOp12,colSum1,colSum2                ;// (r0+r1-r2-r3)
        LDR         pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer
        LDRSB       Shift, [pQPDivTable, QP]               ;// Shift = pQPDivTable[QP]
        SSUB16      colOp22,colDiff1,colDiff2              ;// (r0-r1-r2+r3)
        SADD16      colOp32,colDiff1,colDiff2              ;// (r0-r1+r2-r3)


        LDRSB       Scale, [pQPModTable, QP]               ;// Scale = pQPModTable[QP]

        ;//----------------------------------------------------------------------
        ;//
        ;// <Dequantize> improves on the C reference code
        ;// Both cases, i.e. Shift>=0 and Shift<0, are covered together.
        ;// We do not subtract 2 from Shift as in the C reference; instead we
        ;// perform Scale << Shift once at the beginning and do a right shift
        ;// by a constant 2 after the multiplication. The value of Round is 2.
        ;//
        ;// By doing this we avoid the branches otherwise required and also
        ;// reduce the code size substantially.
        ;//
        ;// Per row: temp1/temp3 take the bottom-halfword products and are
        ;// shifted right by 2 explicitly. temp2/temp4 take the top-halfword
        ;// products and are NOT shifted explicitly: PKHBT ...,LSL #14 puts
        ;// bits [17:2] of the product into the top halfword of the result,
        ;// which is the low 16 bits of (product >> 2).
        ;//
        ;//----------------------------------------------------------------------

        MOV         Round, #2                               ;// Round = 2 (overwrites pQPDivTable, no longer needed)
        LSL         Scale, Scale, Shift                     ;// Scale = Scale << Shift


        ;// Row 1
        SMLABB  temp1, colOp00, Scale, Round                ;// Temp1 = B(colOp00) * Scale + Round
        SMLABB  temp3, colOp02, Scale, Round                ;// Temp3 = B(colOp02) * Scale + Round
        SMLATB  temp2, colOp00, Scale, Round                ;// Temp2 = T(colOp00) * Scale + Round
        SMLATB  temp4, colOp02, Scale, Round                ;// Temp4 = T(colOp02) * Scale + Round

        ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
        ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
        PKHBT   out00,  temp1, temp2, LSL #14               ;// out00 = | Temp2>>2 | Temp1 |
        PKHBT   out02,  temp3, temp4, LSL #14               ;// out02 = | Temp4>>2 | Temp3 |


        ;// Row 2
        SMLABB  temp1, colOp10, Scale, Round                ;// Temp1 = B(colOp10) * Scale + Round
        SMLABB  temp3, colOp12, Scale, Round                ;// Temp3 = B(colOp12) * Scale + Round
        SMLATB  temp2, colOp10, Scale, Round                ;// Temp2 = T(colOp10) * Scale + Round
        SMLATB  temp4, colOp12, Scale, Round                ;// Temp4 = T(colOp12) * Scale + Round

        ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
        ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
        PKHBT   out10,  temp1, temp2, LSL #14               ;// out10 = | Temp2>>2 | Temp1 |
        PKHBT   out12,  temp3, temp4, LSL #14               ;// out12 = | Temp4>>2 | Temp3 |

        ;// Row 3
        SMLABB  temp1, colOp20, Scale, Round                ;// Temp1 = B(colOp20) * Scale + Round
        SMLABB  temp3, colOp22, Scale, Round                ;// Temp3 = B(colOp22) * Scale + Round
        SMLATB  temp2, colOp20, Scale, Round                ;// Temp2 = T(colOp20) * Scale + Round
        SMLATB  temp4, colOp22, Scale, Round                ;// Temp4 = T(colOp22) * Scale + Round

        ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
        ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
        PKHBT   out20,  temp1, temp2, LSL #14               ;// out20 = | Temp2>>2 | Temp1 |
        PKHBT   out22,  temp3, temp4, LSL #14               ;// out22 = | Temp4>>2 | Temp3 |

        ;// Row 4
        SMLABB  temp1, colOp30, Scale, Round                ;// Temp1 = B(colOp30) * Scale + Round
        SMLABB  temp3, colOp32, Scale, Round                ;// Temp3 = B(colOp32) * Scale + Round
        SMLATB  temp2, colOp30, Scale, Round                ;// Temp2 = T(colOp30) * Scale + Round
        SMLATB  temp4, colOp32, Scale, Round                ;// Temp4 = T(colOp32) * Scale + Round

        ;// Round (r0) has had its last use above, so r0 can take pData back
        M_LDR   pData,pDataOnStack                          ;// Restore pData pointer from stack
        ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
        ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
        PKHBT   out30,  temp1, temp2, LSL #14               ;// out30 = | Temp2>>2 | Temp1 |
        PKHBT   out32,  temp3, temp4, LSL #14               ;// out32 = | Temp4>>2 | Temp3 |



        ;***************************
        ;// Store all the 4x4 pixels
        ;// (out00..out32 are r2..r9, so the block is written back row-major)
        ;***************************

store_coeff

        STMIA   pData,{out00,out02,out10,out12,out20,out22,out30,out32}



        ;// Set return value
        ;// (none: this routine does not set a status in r0)


        ;// Write function tail
        M_END

    ENDIF                                                           ;//ARM1136JS
;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4 (ends above)

;// Guarding implementation by the processor name
;// (the ENDIF above closes the ARM1136JS guard of the static function)

;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
;//
;// Description:
;//   Public entry point. Calls armVCM4P10_UnpackBlock4x4 with r0/r1 exactly
;//   as received (ppSrc, pDst), then calls
;//   armVCM4P10_InvTransformDequantLumaDC4x4 on the block at pDst with the
;//   caller's QP.
;//   NOTE(review): armVCM4P10_UnpackBlock4x4 is defined in another file;
;//   presumably it expands the packed data addressed through ppSrc into the
;//   4x4 block at pDst - confirm there.
;//
;// Inputs:
;//   r0 (ppSrc) - passed through unchanged to armVCM4P10_UnpackBlock4x4
;//   r1 (pDst)  - destination block; also the in-place buffer of the
;//                inverse transform / dequantisation step
;//   r2 (QPR2)  - QP value handed to the dequantisation step in r1
;//
;// Returns:
;//   r0 (result) = OMX_Sts_NoErr, unconditionally. No argument checking is
;//   performed here.

;//Input Registers
ppSrc               RN  0
pDst                RN  1
QPR2                RN  2

;//Output Registers
result              RN  0

;//Local Scratch Registers
;// (pDst and QP are parked in r4/r5 across the first call and then moved
;//  into the argument registers r0/r1 of the second call)
pDstR4              RN  4
pDstR0              RN  0
QPR1                RN  1
QPR5                RN  5

;// Guarding implementation by the processor name

    IF ARM1136JS

    ;// Allocate stack memory required by the function
    ;// (none)


    ;// Write function header
    ;// NOTE(review): M_START ...,r5 presumably saves r4-r5 and lr - confirm
    ;// against the macro definition (not in this file).
        M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5

        ;// r1 and r2 may be overwritten by the callee, so keep copies in
        ;// r4/r5 (assumes the callee preserves r4/r5, as the procedure call
        ;// standard requires of callee-saved registers).
        MOV     pDstR4,pDst                         ;// Saving register r1
        MOV     QPR5,QPR2                           ;// Saving register r2
        BL      armVCM4P10_UnpackBlock4x4

        MOV     pDstR0,pDstR4                       ;// Setting up register r0 (pData)
        MOV     QPR1,QPR5                           ;// Setting up register r1 (QP)
        BL      armVCM4P10_InvTransformDequantLumaDC4x4


        ;// Set return value
        MOV     result,#OMX_Sts_NoErr

        ;// Write function tail
        M_END


    ENDIF                                                           ;//ARM1136JS


    END