1;//
2;//
3;// File Name:  omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
4;// OpenMAX DL: v1.0.2
5;// Revision:   12290
6;// Date:       Wednesday, April 9, 2008
7;//
8;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
9;//
10;//
11;//
12;// Description:
13;// H.264 inverse quantize and transform module
14;//
15;//
16
17
18
19;// Include standard headers
20
21        INCLUDE omxtypes_s.h
22        INCLUDE armCOMM_s.h
23
24;// Import symbols required from other files
25;// (For example tables)
26
27        IMPORT armVCM4P10_UnpackBlock4x4
28        IMPORT armVCM4P10_TransformResidual4x4
29        IMPORT armVCM4P10_QPDivTable
30        IMPORT armVCM4P10_VMatrixU16
31        IMPORT armVCM4P10_QPModuloTable
32
33        M_VARIANTS CortexA8                 ;// Variant list; CortexA8 is the symbol tested by the "IF CortexA8" guard around the implementation (macro presumably from armCOMM_s.h - confirm)
34
35;// Set debugging level
36;//DEBUG_ON    SETL {TRUE}                   ;// Left commented out: debugging is not enabled in this file
37
38
39;// Static Function: armVCM4P10_DequantLumaAC4x4 (no standalone body in this file; it is inlined into the function below)
40
41;// Guarding implementation by the processor name
42
43
44
45;// Guarding implementation by the processor name
46
47
48
49
50
51
52;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
53
54;// Guarding implementation by the processor name
55
56
57
58;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
59
60;// Guarding implementation by the processor name
61
62    IF  CortexA8
63;// Everything from here to the matching ENDIF is assembled only for the CortexA8 variant.
64
65;// ARM Registers
66;// NOTE: the names below deliberately overlap on r0, r1, r2, r3, r5 and r10; each name is only valid for part of the function.
67;//Input Registers
68ppSrc       RN  0                   ;// r0 on entry
69pPred       RN  1                   ;// r1 on entry; copied to pPredTemp before r1 is reused
70pDC         RN  2                   ;// r2 on entry; copied to pDCTemp before r2 is reused as pVRow
71pDst        RN  3                   ;// r3 on entry; copied to pDstTemp before r3 is reused as PredVal1
72
73
74;//Output Registers
75result      RN  0                   ;// r0 on exit (status code)
76
77;//Local Scratch Registers
78
79;//Registers used in armVCM4P10_DequantLumaAC4x4
80pQPdiv      RN  10                  ;// Address of armVCM4P10_QPDivTable
81pQPmod      RN  11                  ;// Address of armVCM4P10_QPModuloTable
82pVRow       RN  2                   ;// Address into armVCM4P10_VMatrixU16 (shares r2 with pDC)
83QPmod       RN  12                  ;// Byte loaded from the modulo table, added to pVRow
84shift       RN  14                  ;// Byte loaded from the div table; r14 (lr) is used as scratch here
85index0      RN  1                   ;// VTBL byte indexes for rows 0/2 (shares r1 with QP)
86index1      RN  10                  ;// VTBL byte indexes for rows 1/3 (shares r10 with pQPdiv)
87
88;//Registers used in DequantTransformResidualFromPairAndAdd
89pDelta      RN  4                   ;// Address of the stack scratch buffer pBuffer
90pDeltaTmp   RN  6                   ;// Defined but not referenced by the code below
91AC          RN  5                   ;//Load from stack
92pPredTemp   RN  7                   ;// Copy of pPred that is used (and post-incremented) after the helper call
93pDCTemp     RN  8                   ;// Copy of pDC that is used after the helper call
94pDstTemp    RN  9                   ;// Copy of pDst that is used (and post-incremented) after the helper call
95pDeltaArg1  RN  1                   ;// r1 as second argument of armVCM4P10_UnpackBlock4x4
96pDeltaArg0  RN  0                   ;// Defined but not referenced by the code below
97QP          RN  1                   ;//Load from stack
98DCval       RN  10                  ;// 16-bit DC value loaded from [pDCTemp]
99predstep    RN  1                   ;// Predictor row stride, loaded from the stack
100dstStep     RN  10                  ;// Destination row stride, loaded from the stack
101PredVal1    RN  3                   ;// 4 predictor bytes (rows 0 and 2)
102PredVal2    RN  5                   ;// 4 predictor bytes (rows 1 and 3); shares r5 with AC
103
104
105
106
107;// Neon Registers
108;// NOTE: D0-D9 are reused under different names by each stage; Q0 = D0:D1, Q1 = D2:D3, Q3 = D6:D7, Q4 = D8:D9.
109;// Registers used in armVCM4P10_DequantLumaAC4x4
110
111dVmatrix            DN  D6.8               ;// 8 bytes loaded from [pVRow]; table operand of VTBL
112dindexRow0          DN  D7.32              ;// index0 replicated to both 32-bit lanes
113dindexRow1          DN  D9.32              ;// index1 replicated to both 32-bit lanes
114dByteIndexRow0      DN  D7.8               ;// Same register as dindexRow0, viewed as 8 byte indexes
115dByteIndexRow1      DN  D9.8               ;// Same register as dindexRow1, viewed as 8 byte indexes
116dVRow0              DN  D8.8               ;// VTBL result: multipliers for rows 0 and 2
117dVRow1              DN  D4.8               ;// VTBL result: multipliers for rows 1 and 3
118dVRow0U16           DN  D8.U16
119dVRow1U16           DN  D4.U16
120dVRow2U16           DN  D8.U16             ;// Same register as dVRow0U16 (row 2 uses the row 0 multipliers)
121dVRow3U16           DN  D4.U16             ;// Same register as dVRow1U16 (row 3 uses the row 1 multipliers)
122
123dShift              DN  D5.U16             ;// Per-lane left-shift count for the multipliers
124dSrcRow0            DN  D0.I16
125dSrcRow1            DN  D1.I16
126dSrcRow2            DN  D2.I16
127dSrcRow3            DN  D3.I16
128dDqntRow0           DN  D0.I16             ;// Dequantised rows overwrite the source rows in place (D0-D3)
129dDqntRow1           DN  D1.I16
130dDqntRow2           DN  D2.I16
131dDqntRow3           DN  D3.I16
132
133;// Registers used in TransformResidual4x4
134
135;// Packed Input pixels
136dIn0                DN  D0.S16
137dIn1                DN  D1.S16
138dIn2                DN  D2.S16
139dIn3                DN  D3.S16
140qIn01               QN  Q0.32              ;// D0:D1 viewed as 32-bit lanes for the second transpose step
141qIn23               QN  Q1.32              ;// D2:D3 viewed as 32-bit lanes for the second transpose step
142
143;// Intermediate calculations
144dZero               DN  D4.S16             ;// Holds 0; reuses D4 once the row 1/3 multipliers are no longer needed
145de0                 DN  D5.S16
146de1                 DN  D6.S16
147de2                 DN  D7.S16
148de3                 DN  D8.S16
149dIn1RS              DN  D7.S16             ;// d1 >> 1; later overwritten by de2 (same register)
150dIn3RS              DN  D8.S16             ;// d3 >> 1; later overwritten by de3 (same register)
151df0                 DN  D0.S16
152df1                 DN  D1.S16
153df2                 DN  D2.S16
154df3                 DN  D3.S16
155qf01                QN  Q0.32
156qf23                QN  Q1.32
157dg0                 DN  D5.S16
158dg1                 DN  D6.S16
159dg2                 DN  D7.S16
160dg3                 DN  D8.S16
161df1RS               DN  D7.S16             ;// f1 >> 1; later overwritten by dg2 (same register)
162df3RS               DN  D8.S16             ;// f3 >> 1; later overwritten by dg3 (same register)
163
164;// Output pixels
165dh0                 DN  D0.S16
166dh1                 DN  D1.S16
167dh2                 DN  D2.S16
168dh3                 DN  D3.S16
169
170;// Registers used in DequantTransformResidualFromPairAndAdd
171
172dDeltaRow0          DN  D0.S16             ;// Residual rows: same registers as dh0-dh3
173dDeltaRow1          DN  D1.S16
174dDeltaRow2          DN  D2.S16
175dDeltaRow3          DN  D3.S16
176qDeltaRow01         QN  Q0.S16
177qDeltaRow23         QN  Q1.S16
178
179dPredValRow01       DN  D4.U8              ;// 8 predictor bytes: row 0 in the low word, row 1 in the high word
180dPredValRow23       DN  D5.U8              ;// 8 predictor bytes: row 2 in the low word, row 3 in the high word
181
182qSumRow01           QN  Q3.S16             ;// residual + predictor, 16-bit, rows 0-1
183qSumRow23           QN  Q4.S16             ;// residual + predictor, 16-bit, rows 2-3
184dDstRow01           DN  D0.U8              ;// Narrowed result reuses D0 (residual no longer needed)
185dDstRow23           DN  D1.U8              ;// Narrowed result reuses D1
186dDstRow0            DN  D0.32[0]           ;// 4 output bytes of row 0
187dDstRow1            DN  D0.32[1]           ;// 4 output bytes of row 1
188dDstRow2            DN  D1.32[0]           ;// 4 output bytes of row 2
189dDstRow3            DN  D1.32[1]           ;// 4 output bytes of row 3
190;//--------------------------------------------------------------------------
191;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
;//
;// Builds a 4x4 residual block, adds it to a 4x4 block of predictor bytes and
;// stores the saturated 8-bit result.
;//
;// Register arguments : r0 = ppSrc, r1 = pPred, r2 = pDC, r3 = pDst
;// Stack arguments    : predStep, dstStep, QP, AC (4 bytes each, see M_ARG)
;//
;// AC != 0 : armVCM4P10_UnpackBlock4x4 is called with r1 = pBuffer; the 16
;//           halfwords then read back from pBuffer are multiplied by V-matrix
;//           entries shifted left by the QPDivTable entry, coefficient 0 is
;//           replaced by *pDC when pDC is non-NULL, and an inlined 4x4 inverse
;//           transform with a rounding shift right by 6 is applied.
;// AC == 0 : every residual equals (*pDC + 32) >> 6.
;//
;// Returns OMX_Sts_NoErr in r0 on every path.
;//
;// NOTE(review): the AC == 0 path loads from pDC without a NULL check -
;// confirm that callers always supply a valid pDC when AC is 0.
;//--------------------------------------------------------------------------
192    ;// Allocate stack memory required by the function
193        M_ALLOC8 pBuffer, 32                                ;// Scratch for 16 x 16-bit coefficients (read back as D0-D3)
194
195
196    ;// Write function header
197        M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11,d9
198        ;// NOTE(review): r11/d9 presumably name the highest ARM/NEON registers the macro preserves - confirm in armCOMM_s.h
199        ;// Define stack arguments
200        M_ARG   predStepOnStack, 4
201        M_ARG   dstStepOnStack,4
202        M_ARG   QPOnStack, 4
203        M_ARG   ACOnStack,4
204
205
206        M_ADR   pDelta,pBuffer                              ;// pDelta = address of the stack scratch buffer
207        M_LDR   AC,ACOnStack                                ;// AC selects the full path or the DC-only path
208
209
210        ;// Save registers r1,r2,r3 before function call
211        MOV     pPredTemp,pPred
212        MOV     pDCTemp,pDC
213        MOV     pDstTemp,pDst
214
215        CMP     AC,#0
216        BEQ     DCcase                                      ;// AC == 0 : skip unpack, dequantisation and transform
217        MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_UnpackBlock4x4
218        ;// pDelta, pPredTemp, pDCTemp and pDstTemp (r4, r7, r8, r9) are read after this call, so the callee must preserve them
219        BL      armVCM4P10_UnpackBlock4x4
220
221        ;//--------------------------------------------------------
222        ;// armVCM4P10_DequantLumaAC4x4 : static function inlined
223        ;//--------------------------------------------------------
224
225        ;//BL      armVCM4P10_DequantLumaAC4x4
226        M_LDR   QP,QPOnStack                                ;// Set up r1 for armVCM4P10_DequantLumaAC4x4
227
228        LDR    pQPmod,=armVCM4P10_QPModuloTable
229        LDR    pQPdiv,=armVCM4P10_QPDivTable
230        LDR    pVRow,=armVCM4P10_VMatrixU16                 ;// r2 no longer holds pDC from here on (copy is in pDCTemp)
231
232
233        LDRSB  QPmod,[pQPmod,QP]                    ;// (QP%6) * 6
234        LDRSB  shift,[pQPdiv,QP]                    ;// Shift = QP / 6
235
236        LDR    index1,=0x03020504                   ;// Byte indexes {4,5,2,3}: halfwords 2 and 1 of dVmatrix (overwrites pQPdiv)
237        LDR    index0,=0x05040100                   ;// Indexes into dVmatrix: bytes {0,1,4,5} = halfwords 0 and 2 (overwrites QP)
238        ADD    pVRow,pVRow,QPmod                    ;// Advance to the V-matrix entry selected by QPmod (byte offset)
239        VDUP   dindexRow0,index0                    ;// Replicate the 4 byte indexes to fill 8 bytes
240        VDUP   dindexRow1,index1
241        VDUP   dShift,shift                         ;// Same shift count in every 16-bit lane
242
243        ;// Load all 4x4 pVRow[] values
244        VLD1   dVmatrix,[pVRow]                     ;// dVmatrix = [0d|0c|0b|0a]
245
246
247        VTBL   dVRow0,dVmatrix,dByteIndexRow0       ;// row0 = row2 = [pVRow[2] | pVRow[0] | pVRow[2] | pVRow[0]]
248        VTBL   dVRow1,dVmatrix,dByteIndexRow1       ;// row1 = row3 = [pVRow[1] | pVRow[2] | pVRow[1] | pVRow[2]]
249        CMP     pDCTemp,#0                          ;// Flags stay live until VMOVNE below; nothing in between sets flags
250        ;// Load all the 4x4 'src' values
251        VLD1   { dSrcRow0,dSrcRow1,dSrcRow2,dSrcRow3 },[pDelta]
252
253        VSHL   dVRow0U16,dVRow0U16,dShift           ;// Multipliers <<= shift
254        VSHL   dVRow1U16,dVRow1U16,dShift
255        LDRSHNE DCval,[pDCTemp]                     ;// pDC != NULL : fetch the DC value (overwrites index1 in r10)
256
257
258        ;// Multiply src[] with pVRow[]
259        VMUL    dDqntRow0,dSrcRow0,dVRow0U16
260        VMUL    dDqntRow1,dSrcRow1,dVRow1U16
261        VMUL    dDqntRow2,dSrcRow2,dVRow2U16        ;// dVRow2U16 is the same register as dVRow0U16
262        VMUL    dDqntRow3,dSrcRow3,dVRow3U16        ;// dVRow3U16 is the same register as dVRow1U16
263
264
265
266        ;//-------------------------------------------------------------
267        ;// TransformResidual4x4 : Inlined to avoid Load/Stores
268        ;//-------------------------------------------------------------
269
270
271        ;//BL      armVCM4P10_TransformResidual4x4
272        ;//STRHNE  DCval,[pDelta]
273        VMOVNE    dIn0[0],DCval                     ;// pDC != NULL : coefficient 0 is replaced by *pDC (register move instead of the store above)
274
275
276
277        ;//*****************************************************************
278        ;// Transpose the input pixels : perform Row ops as Col ops
279        ;//*****************************************************************
280
281        VTRN    dIn0,dIn1                           ;// 16-bit transpose within row pairs ...
282        VTRN    dIn2,dIn3
283        VTRN    qIn01,qIn23                         ;// ... then 32-bit transpose across the pairs
284
285
286        VMOV    dZero,#0                                    ;// Used to right shift by 1
287
288
289        ;//****************************************
290        ;// Row Operations (Performed on columns)
291        ;//****************************************
292
293
294        VADD        de0,dIn0,dIn2                       ;//  e0 = d0 + d2
295        VSUB        de1,dIn0,dIn2                        ;//  e1 = d0 - d2
296        VHADD       dIn1RS,dIn1,dZero                   ;// (d1>>1) : halving add with dZero, a register holding 0
297        VHADD       dIn3RS,dIn3,dZero                   ;// (d3>>1)
298        VSUB        de2,dIn1RS,dIn3                     ;//  e2 = (d1>>1) - d3
299        VADD        de3,dIn1,dIn3RS                        ;//  e3 = d1 + (d3>>1)
300        VADD        df0,de0,de3                         ;//  f0 = e0 + e3
301        VADD        df1,de1,de2                            ;//  f1 = e1 + e2
302        VSUB        df2,de1,de2                            ;//  f2 = e1 - e2
303        VSUB        df3,de0,de3                            ;//  f3 = e0 - e3
304
305
306
307        ;//*****************************************************************
308        ;// Transpose the resultant matrix
309        ;//*****************************************************************
310
311        VTRN    df0,df1
312        VTRN    df2,df3
313        VTRN    qf01,qf23
314
315
316        ;//*******************************
317        ;// Column Operations
318        ;//*******************************
319
320
321        VADD        dg0,df0,df2                         ;//  g0 = f0 + f2
322        VSUB        dg1,df0,df2                            ;//  g1 = f0 - f2
323        VHADD       df1RS,df1,dZero                     ;// (f1>>1) : halving add with dZero, a register holding 0
324        VHADD       df3RS,df3,dZero                     ;// (f3>>1)
325        VSUB        dg2,df1RS,df3                       ;//  g2 = (f1>>1) - f3
326        VADD        dg3,df1,df3RS                        ;//  g3 = f1 + (f3>>1)
327        VADD        dh0,dg0,dg3                         ;//  h0 = g0 + g3
328        VADD        dh1,dg1,dg2                            ;//  h1 = g1 + g2
329        VSUB        dh2,dg1,dg2                            ;//  h2 = g1 - g2
330        VSUB        dh3,dg0,dg3                            ;//  h3 = g0 - g3
331
332
333        ;//************************************************
334        ;// Calculate final value (colOp[i][j] + 32)>>6
335        ;//************************************************
336
337        VRSHR       dh0,#6                              ;// Rounding shift right, in place
338        VRSHR       dh1,#6
339        VRSHR       dh2,#6
340        VRSHR       dh3,#6
341
342
343        B       OutDCcase                               ;// Residual is in D0-D3; skip the DC-only path
344
345
346DCcase
347        ;// Calculate the Transformed DCvalue : (DCval+32)>>6
348        LDRSH   DCval,[pDCTemp]                         ;// NOTE(review): no NULL check on pDC in this path
349        ADD     DCval,DCval,#32
350        ASR     DCval,DCval,#6
351
352        VDUP    dDeltaRow0, DCval                       ;// pDelta[0]  = pDelta[1]  = pDelta[2]  = pDelta[3] = DCval
353        VDUP    dDeltaRow1, DCval                        ;// pDelta[4]  = pDelta[5]  = pDelta[6]  = pDelta[7] = DCval
354        VDUP    dDeltaRow2, DCval                        ;// pDelta[8]  = pDelta[9]  = pDelta[10] = pDelta[11] = DCval
355        VDUP    dDeltaRow3, DCval                        ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval
356
357
358OutDCcase
359        M_LDR   predstep,predStepOnStack
360        M_LDR   dstStep,dstStepOnStack                  ;// Overwrites DCval (r10), which is no longer needed
361
362        LDR     PredVal1,[pPredTemp],predstep           ;// 4 predictor bytes of row 0, then advance by predstep
363        LDR     PredVal2,[pPredTemp],predstep           ;// 4 predictor bytes of row 1, then advance by predstep
364        VMOV    dPredValRow01,PredVal1,PredVal2         ;// Pack rows 0 and 1 into one D register
365
366        LDR     PredVal1,[pPredTemp],predstep           ;// 4 predictor bytes of row 2, then advance by predstep
367        LDR     PredVal2,[pPredTemp]                    ;// 4 predictor bytes of row 3 (last row, no advance)
368        VMOV    dPredValRow23,PredVal1,PredVal2         ;// Pack rows 2 and 3 into one D register
369
370
371        VADDW   qSumRow01,qDeltaRow01,dPredValRow01     ;// Widen the predictor bytes and add the 16-bit residual
372        VADDW   qSumRow23,qDeltaRow23,dPredValRow23
373        VQMOVUN dDstRow01,qSumRow01                     ;// Saturate to unsigned 8 bit and narrow
374        VQMOVUN dDstRow23,qSumRow23
375
376
377        VST1    dDstRow0,[pDstTemp],dstStep             ;// Store 4 bytes per row, advancing by dstStep
378        VST1    dDstRow1,[pDstTemp],dstStep
379        VST1    dDstRow2,[pDstTemp],dstStep
380        VST1    dDstRow3,[pDstTemp]                     ;// Last row, no advance
381
382        ;// Set return value
383        MOV     result,#OMX_Sts_NoErr
384
385End
386        ;// (No branch to the End label appears in the code above; it only marks the function tail)
387
388        ;// Write function tail
389
390        M_END
391
392    ENDIF                                                    ;//CORTEXA8
393
394
395
396    END
397