omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
1;//
2;// Copyright (C) 2007 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// Description:
19;// H.264 inverse quantize and transform module
20;//
21;//
22
23
24
25;// Include standard headers
26
27        INCLUDE omxtypes_s.h
28        INCLUDE armCOMM_s.h
29
30;// Import symbols required from other files
31;// (For example tables)
32
33        IMPORT armVCM4P10_UnpackBlock4x4
34        IMPORT armVCM4P10_TransformResidual4x4
35        IMPORT armVCM4P10_QPDivTable
36        IMPORT armVCM4P10_VMatrixU16
37        IMPORT armVCM4P10_QPModuloTable
38
39    M_VARIANTS ARM1136JS, ARM1136JS_U
40
41;// Set debugging level
42;//DEBUG_ON    SETL {TRUE}
43
44
45;// Static Function: armVCM4P10_DequantLumaAC4x4
46;// Scales the 16 halfword values at pSrcDst in place by per-position factors (pVRow[k] << shift), k = 0..2.
47;// Guarding implementation by the processor name
48;// ARM1136JS variant: the three factors are fetched with separate halfword loads.
49    IF  ARM1136JS
50
51;//Input Registers
52pSrcDst       RN  0                   ;// r0: block of 16 halfwords, rewritten in place
53QP            RN  1                   ;// r1: byte index into the QP modulo/div tables
54
55
56;//Output Registers
57;// (none declared; results are stored back through pSrcDst)
58
59;//Local Scratch Registers
60pQPdiv          RN  4                   ;// setup only; r4 is reused as rowLuma23
61pQPmod          RN  5                   ;// setup only; r5 is reused as SrcDst00
62pVRow           RN  2                   ;// setup only; r2 is reused as temp1
63QPmod           RN  6                   ;// setup only; r6 is reused as SrcDst02
64shift           RN  3                   ;// dead after the two LSLs; r3 is reused as temp2
65rowLuma01       RN  1                   ;// replaces QP in r1 once both table lookups are done
66rowLuma23       RN  4
67;// SrcDst<y><x>: one word holding two adjacent halfwords of row y, starting at column x
68SrcDst00        RN  5
69SrcDst02        RN  6
70SrcDst10        RN  7
71SrcDst12        RN  8
72SrcDst20        RN  9
73SrcDst22        RN  10
74SrcDst30        RN  11
75SrcDst32        RN  12
76;// Product scratch. temp3 is r14 (lr), so lr must be saved by M_START - TODO confirm in armCOMM_s.h
77temp1           RN  2
78temp2           RN  3
79temp3           RN  14
80
81
82        ;// Allocate stack memory required by the function
83
84        ;// Write function header
85        M_START armVCM4P10_DequantLumaAC4x4,r11
86        ;// Fetch the addresses of the three lookup tables
87        LDR    pQPmod,=armVCM4P10_QPModuloTable
88        LDR    pQPdiv,=armVCM4P10_QPDivTable
89        LDR    pVRow,=armVCM4P10_VMatrixU16
90        ;// Both lookups must read QP (r1) before rowLuma01 overwrites r1 below
91        LDRSB  QPmod,[pQPmod,QP]                    ;// (QP%6) * 6, used as a byte offset into the V matrix
92        LDRSB  shift,[pQPdiv,QP]                    ;// Shift = QP / 6
93        ;// pVRow is advanced by QPmod (writeback); a, b, c are the halfwords at pVRow[0..2]
94        LDRH    rowLuma01,[pVRow,QPmod]!             ;// rowLuma01 = [00|0a]
95        LDRH    temp3,[pVRow,#2]                     ;// temp3     = [00|0b]
96        LDRH    rowLuma23,[pVRow,#4]                 ;// rowLuma23 = [00|0c]
97        ORR     rowLuma01,rowLuma01,temp3,LSL #16    ;// rowLuma01 = [0b|0a]
98
99        ;// Load all the 16 'src' values
100        LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
101        ;// r5 and r6 (pQPmod, QPmod) are overwritten by this load; both are dead by now
102
103        ;//*********************************************************************************************
104        ;//
105        ;// 'Shift' ranges between [0,8]
106        ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
107        ;//
108        ;//*********************************************************************************************
109
110        LSL    rowLuma01,rowLuma01,shift
111        LSL    rowLuma23,rowLuma23,shift
112
113
114        ;//**********************************************************************************************
115        ;//
116        ;// The idea is to unroll the loop completely
117        ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
118        ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
119        ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
120        ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
121        ;// We first calculate (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift), which fits into 16 bits (above)
122        ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
123        ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
124        ;//
125        ;// We then pack the low 16 bits of two multiplication results into a word and store at one go
126        ;//
127        ;//**********************************************************************************************
128
129
130        ;// Row 1
131        ;// Rows 1 and 3: even columns use pVRow[0], odd columns use pVRow[2]
132
133        SMULTB  temp1,SrcDst00,rowLuma23                    ;// pSrcDst[1] * (pVRow[2]<<Shift)
134        SMULBB  SrcDst00,SrcDst00,rowLuma01                 ;// pSrcDst[0] * (pVRow[0]<<Shift)
135
136        SMULTB  temp2,SrcDst02,rowLuma23                    ;// pSrcDst[3] * (pVRow[2]<<Shift)
137        SMULBB  SrcDst02,SrcDst02,rowLuma01                 ;// pSrcDst[2] * (pVRow[0]<<Shift)
138
139        PKHBT   SrcDst00,SrcDst00,temp1,LSL #16             ;// Pack the first two product values
140
141        ;// Rows 2 and 4: even columns use pVRow[2], odd columns use pVRow[1]
142        ;// Row 2
143        SMULTT  temp1,SrcDst10,rowLuma01                    ;// pSrcDst[5] * (pVRow[1]<<Shift)
144        SMULBB  SrcDst10,SrcDst10,rowLuma23                 ;// pSrcDst[4] * (pVRow[2]<<Shift)
145
146        PKHBT   SrcDst02,SrcDst02,temp2,LSL #16             ;// Pack the next two product values
147        SMULTT  temp2,SrcDst12,rowLuma01                    ;// pSrcDst[7] * (pVRow[1]<<Shift)
148        SMULBB  SrcDst12,SrcDst12,rowLuma23                    ;// pSrcDst[6] * (pVRow[2]<<Shift)
149
150        PKHBT   SrcDst10,SrcDst10,temp1,LSL #16             ;// Pack the next two product values
151
152
153        ;// Row 3
154
155        SMULTB  temp1,SrcDst20,rowLuma23                    ;// pSrcDst[9] * (pVRow[2]<<Shift)
156        SMULBB  SrcDst20,SrcDst20,rowLuma01                    ;// pSrcDst[8] * (pVRow[0]<<Shift)
157
158        PKHBT   SrcDst12,SrcDst12,temp2,LSL #16               ;// Pack the next two product values
159        SMULTB  temp2,SrcDst22,rowLuma23                    ;// pSrcDst[11] * (pVRow[2]<<Shift)
160        SMULBB  SrcDst22,SrcDst22,rowLuma01                    ;// pSrcDst[10] * (pVRow[0]<<Shift)
161
162        PKHBT   SrcDst20,SrcDst20,temp1,LSL #16             ;// Pack the next two product values
163
164
165
166        ;// Row 4
167        ;// temp2 still holds the pSrcDst[11] product here, so the last odd product goes to temp3
168        SMULTT  temp1,SrcDst30,rowLuma01                    ;// pSrcDst[13] * (pVRow[1]<<Shift)
169        SMULBB  SrcDst30,SrcDst30,rowLuma23                    ;// pSrcDst[12] * (pVRow[2]<<Shift)
170
171        SMULTT  temp3,SrcDst32,rowLuma01                    ;// pSrcDst[15] * (pVRow[1]<<Shift)
172        SMULBB  SrcDst32,SrcDst32,rowLuma23                    ;// pSrcDst[14] * (pVRow[2]<<Shift)
173
174        PKHBT   SrcDst22,SrcDst22,temp2,LSL #16             ;// Pack the remaining product values
175        PKHBT   SrcDst30,SrcDst30,temp1,LSL #16
176        PKHBT   SrcDst32,SrcDst32,temp3,LSL #16
177
178        ;// Write all 16 scaled halfwords back over the input block
179        STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
180
181
182        ;// Set return value
183        ;// (nothing is written to r0 here; it still holds pSrcDst)
184
185
186        ;// Write function tail
187        M_END
188
189    ENDIF                                                    ;//ARM1136JS
190
191
192;// Guarding implementation by the processor name
193;// ARM1136JS_U variant of armVCM4P10_DequantLumaAC4x4: same in-place scaling of 16 halfwords, but the factors are fetched with two word loads.
194    IF  ARM1136JS_U
195
196;//Input Registers
197pSrcDst       RN  0                   ;// r0: block of 16 halfwords, rewritten in place
198QP            RN  1                   ;// r1: byte index into the QP modulo/div tables
199
200
201;//Output Registers
202;// (none declared; results are stored back through pSrcDst)
203
204;//Local Scratch Registers
205pQPdiv          RN  4                   ;// setup only; r4 is reused as rowLuma23
206pQPmod          RN  5                   ;// setup only; r5 is reused as SrcDst00
207pVRow           RN  2                   ;// setup only; r2 is reused as temp1
208QPmod           RN  6                   ;// setup only; r6 is reused as SrcDst02
209shift           RN  3                   ;// dead after the two LSLs; r3 is reused as temp2
210rowLuma01       RN  1                   ;// replaces QP in r1 once both table lookups are done
211rowLuma23       RN  4
212;// SrcDst<y><x>: one word holding two adjacent halfwords of row y, starting at column x
213SrcDst00        RN  5
214SrcDst02        RN  6
215SrcDst10        RN  7
216SrcDst12        RN  8
217SrcDst20        RN  9
218SrcDst22        RN  10
219SrcDst30        RN  11
220SrcDst32        RN  12
221;// Product scratch. temp3 is r14 (lr), so lr must be saved by M_START - TODO confirm in armCOMM_s.h
222temp1           RN  2
223temp2           RN  3
224temp3           RN  14
225
226
227        ;// Allocate stack memory required by the function
228
229        ;// Write function header
230        M_START armVCM4P10_DequantLumaAC4x4,r11
231        ;// Fetch the addresses of the three lookup tables
232        LDR    pQPmod,=armVCM4P10_QPModuloTable
233        LDR    pQPdiv,=armVCM4P10_QPDivTable
234        LDR    pVRow,=armVCM4P10_VMatrixU16
235        ;// Both lookups must read QP (r1) before rowLuma01 overwrites r1 below
236        LDRSB  QPmod,[pQPmod,QP]                    ;// (QP%6) * 6, used as a byte offset into the V matrix
237        LDRSB  shift,[pQPdiv,QP]                    ;// Shift = QP / 6
238        ;// NOTE(review): QPmod offsets such as 6 are not multiples of 4, so these word loads presumably rely on unaligned LDR support - confirm
239        LDR    rowLuma01,[pVRow,QPmod]!             ;// rowLuma01 = [0b|0a]
240        LDR    rowLuma23,[pVRow,#4]                 ;// rowLuma23 = [0d|0c]
241        ;// 'd' is whatever halfword follows pVRow[2]; every multiply below reads only the bottom half of rowLuma23
242        ;// Load all the 16 'src' values
243        LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
244        ;// r5 and r6 (pQPmod, QPmod) are overwritten by this load; both are dead by now
245
246        ;//*********************************************************************************************
247        ;//
248        ;// 'Shift' ranges between [0,8]
249        ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
250        ;//
251        ;//*********************************************************************************************
252
253        LSL    rowLuma01,rowLuma01,shift
254        LSL    rowLuma23,rowLuma23,shift
255
256
257        ;//**********************************************************************************************
258        ;//
259        ;// The idea is to unroll the loop completely
260        ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
261        ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
262        ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
263        ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
264        ;// We first calculate (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift), which fits into 16 bits (above)
265        ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
266        ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
267        ;//
268        ;// We then pack the low 16 bits of two multiplication results into a word and store at one go
269        ;//
270        ;//**********************************************************************************************
271
272
273        ;// Row 1
274        ;// Rows 1 and 3: even columns use pVRow[0], odd columns use pVRow[2]
275
276        SMULTB  temp1,SrcDst00,rowLuma23                    ;// pSrcDst[1] * (pVRow[2]<<Shift)
277        SMULBB  SrcDst00,SrcDst00,rowLuma01                 ;// pSrcDst[0] * (pVRow[0]<<Shift)
278
279        SMULTB  temp2,SrcDst02,rowLuma23                    ;// pSrcDst[3] * (pVRow[2]<<Shift)
280        SMULBB  SrcDst02,SrcDst02,rowLuma01                 ;// pSrcDst[2] * (pVRow[0]<<Shift)
281
282        PKHBT   SrcDst00,SrcDst00,temp1,LSL #16             ;// Pack the first two product values
283
284        ;// Rows 2 and 4: even columns use pVRow[2], odd columns use pVRow[1]
285        ;// Row 2
286        SMULTT  temp1,SrcDst10,rowLuma01                    ;// pSrcDst[5] * (pVRow[1]<<Shift)
287        SMULBB  SrcDst10,SrcDst10,rowLuma23                 ;// pSrcDst[4] * (pVRow[2]<<Shift)
288
289        PKHBT   SrcDst02,SrcDst02,temp2,LSL #16             ;// Pack the next two product values
290        SMULTT  temp2,SrcDst12,rowLuma01                    ;// pSrcDst[7] * (pVRow[1]<<Shift)
291        SMULBB  SrcDst12,SrcDst12,rowLuma23                    ;// pSrcDst[6] * (pVRow[2]<<Shift)
292
293        PKHBT   SrcDst10,SrcDst10,temp1,LSL #16             ;// Pack the next two product values
294
295
296        ;// Row 3
297
298        SMULTB  temp1,SrcDst20,rowLuma23                    ;// pSrcDst[9] * (pVRow[2]<<Shift)
299        SMULBB  SrcDst20,SrcDst20,rowLuma01                    ;// pSrcDst[8] * (pVRow[0]<<Shift)
300
301        PKHBT   SrcDst12,SrcDst12,temp2,LSL #16               ;// Pack the next two product values
302        SMULTB  temp2,SrcDst22,rowLuma23                    ;// pSrcDst[11] * (pVRow[2]<<Shift)
303        SMULBB  SrcDst22,SrcDst22,rowLuma01                    ;// pSrcDst[10] * (pVRow[0]<<Shift)
304
305        PKHBT   SrcDst20,SrcDst20,temp1,LSL #16             ;// Pack the next two product values
306
307
308
309        ;// Row 4
310        ;// temp2 still holds the pSrcDst[11] product here, so the last odd product goes to temp3
311        SMULTT  temp1,SrcDst30,rowLuma01                    ;// pSrcDst[13] * (pVRow[1]<<Shift)
312        SMULBB  SrcDst30,SrcDst30,rowLuma23                    ;// pSrcDst[12] * (pVRow[2]<<Shift)
313
314        SMULTT  temp3,SrcDst32,rowLuma01                    ;// pSrcDst[15] * (pVRow[1]<<Shift)
315        SMULBB  SrcDst32,SrcDst32,rowLuma23                    ;// pSrcDst[14] * (pVRow[2]<<Shift)
316
317        PKHBT   SrcDst22,SrcDst22,temp2,LSL #16             ;// Pack the remaining product values
318        PKHBT   SrcDst30,SrcDst30,temp1,LSL #16
319        PKHBT   SrcDst32,SrcDst32,temp3,LSL #16
320
321        ;// Write all 16 scaled halfwords back over the input block
322        STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
323
324
325        ;// Set return value
326        ;// (nothing is written to r0 here; it still holds pSrcDst)
327
328
329        ;// Write function tail
330        M_END
331
332    ENDIF                                                    ;//ARM1136JS_U
333
334
335
336
337
338;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
339;// Builds 16 halfword residuals in a stack buffer, adds them to 4 rows of 4 prediction bytes, clips to 0..255 and stores 4 bytes per row to pDst.
340;// Guarding implementation by the processor name
341
342    IF  ARM1136JS
343
344;//Input Registers
345ppSrc       RN  0                   ;// r0: passed unchanged to armVCM4P10_UnpackBlock4x4
346pPred       RN  1                   ;// r1: prediction rows, read one word (4 bytes) per row
347pDC         RN  2                   ;// r2: pointer to a halfword DC value; NULL-checked on the AC != 0 path only
348pDst        RN  3                   ;// r3: output rows, written one word (4 bytes) per row
349
350
351;//Output Registers
352result      RN  0                   ;// r0: set to OMX_Sts_NoErr on the only exit path
353
354;//Local Scratch Registers
355pDelta      RN  4                   ;// address of the 32-byte residual buffer on the stack
356pDeltaTmp   RN  6                   ;// not referenced in this function
357AC          RN  5                   ;//Load from stack
358pPredTemp   RN  7                   ;// copy of pPred kept across the BLs
359pDCTemp     RN  8                   ;// copy of pDC kept across the BLs
360pDstTemp    RN  9                   ;// copy of pDst kept across the BLs
361pDeltaArg1  RN  1
362pDeltaArg0  RN  0
363QP          RN  1                   ;//Load from stack
364DCval       RN  10
365DCvalCopy   RN  11                  ;// r11 pairs with DCval (r10) as the second register of each STRD
366predstep    RN  1
367dstStep     RN  10
368ycounter    RN  0
369PredVal1    RN  3
370PredVal2    RN  5
371DeltaVal1   RN  2
372DeltaVal2   RN  11
373PredVal     RN  8
374tmpDeltaVal RN  6
375sum1        RN  12
376sum2        RN  14
377;// Many names above share a register (e.g. r1: pPred/QP/predstep, r8: pDCTemp/PredVal);
378;// each alias is only live in its own phase of the function.
379
380    ;// Allocate stack memory required by the function
381        M_ALLOC8 pBuffer, 32
382        ;// 32 bytes = 16 halfword residuals; M_ALLOC8 presumably gives the 8-byte alignment STRD needs - TODO confirm
383
384    ;// Write function header
385        M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11
386
387        ;// Define stack arguments
388        M_ARG   predStepOnStack, 4
389        M_ARG   dstStepOnStack,4
390        M_ARG   QPOnStack, 4
391        M_ARG   ACOnStack,4
392
393
394        M_ADR   pDelta,pBuffer
395        M_LDR   AC,ACOnStack
396
397
398        ;// Save registers r1,r2,r3 before function call
399        MOV     pPredTemp,pPred
400        MOV     pDCTemp,pDC
401        MOV     pDstTemp,pDst
402        ;// AC == 0 selects the DC-only path; otherwise unpack, dequantise and transform
403        CMP     AC,#0
404        BEQ     DCcase
405        MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_UnpackBlock4x4
406        ;// r0 still holds ppSrc from the caller
407        BL      armVCM4P10_UnpackBlock4x4
408
409        M_LDR   QP,QPOnStack                                ;// Set up r1 for DequantLumaAC4x4
410        MOV     pDeltaArg0,pDelta                           ;// Set up r0 for DequantLumaAC4x4
411
412        BL      armVCM4P10_DequantLumaAC4x4
413
414        ;// If pDC is non-NULL, pDelta[0] is replaced by *pDC; the MOVs below leave the CMP flags intact
415        CMP     pDCTemp,#0
416        LDRSHNE DCval,[pDCTemp]
417        MOV     pDeltaArg0,pDelta                           ;// Set up r0 for armVCM4P10_TransformResidual4x4
418        MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_TransformResidual4x4
419        STRHNE  DCval,[pDelta]
420        ;// Source and destination are both pDelta, so the call works on the buffer in place
421        BL      armVCM4P10_TransformResidual4x4
422        B       OutDCcase
423
424        ;// NOTE(review): pDCTemp is dereferenced here without a NULL check, unlike the AC path above
425DCcase
426        LDRSH   DCval,[pDCTemp]
427        ADD     DCval,DCval,#32
428        ASR     DCval,DCval,#6                              ;// DCval = (*pDC + 32) >> 6
429        PKHBT   DCval,DCval,DCval,LSL #16                  ;// Duplicating the Lower halfword
430        MOV     DCvalCopy, DCval                           ;// Needed for STRD
431        STRD    DCval, [pDelta, #0]                        ;// pDelta[0]  = pDelta[1]  = pDelta[2]  = pDelta[3] = DCval
432        STRD    DCval, [pDelta, #8]                        ;// pDelta[4]  = pDelta[5]  = pDelta[6]  = pDelta[7] = DCval
433        STRD    DCval, [pDelta, #16]                       ;// pDelta[8]  = pDelta[9]  = pDelta[10] = pDelta[11] = DCval
434        STRD    DCval, [pDelta, #24]                       ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval
435
436        ;// Both paths join here with 16 residual halfwords at pDelta
437OutDCcase
438        M_LDR   predstep,predStepOnStack
439        M_LDR   dstStep,dstStepOnStack
440
441        LDMIA   pDelta!,{tmpDeltaVal,DeltaVal2}             ;// Pre load row 0: tmpDeltaVal = [B A], DeltaVal2 = [D C]
442        MOV     ycounter,#4                                 ;// Counter for the PredPlusDeltaLoop
443        LDR     PredVal,[pPredTemp]                         ;// Pre load row 0: PredVal = [dcba]
444
445PredPlusDeltaLoop
446        ;// One row per iteration. The flags set by SUBS drive LDRGT, LDMGTIA and BGT;
447        ;// no instruction in between updates N, Z, C or V.
448        SUBS    ycounter,ycounter,#1
449        ADD     pPredTemp,pPredTemp,predstep                ;// Increment pPred ptr
450        ;// Regroup the residuals into even/odd columns to match the UXTB16 byte split below
451        PKHBT   DeltaVal1,tmpDeltaVal,DeltaVal2,LSL #16     ;// Deltaval1 = [C A]
452        PKHTB   DeltaVal2,DeltaVal2,tmpDeltaVal,ASR #16     ;// DeltaVal2 = [D B]
453
454        UXTB16  PredVal1,PredVal                            ;// PredVal1 = [0c0a]
455        UXTB16  PredVal2,PredVal,ROR #8                     ;// PredVal2 = [0d0b]
456
457        LDRGT   PredVal,[pPredTemp]                         ;// Pre load next row; skipped on the last iteration
458
459        QADD16  sum2,DeltaVal2,PredVal2                     ;// Add and saturate to 16 bits
460        QADD16  sum1,DeltaVal1,PredVal1
461
462        USAT16  sum2,#8,sum2                                ;// armClip(0,255,sum2)
463        USAT16  sum1,#8,sum1
464
465        LDMGTIA   pDelta!,{tmpDeltaVal,DeltaVal2}           ;// Pre load next row; skipped on the last iteration
466
467        ORR     sum1,sum1,sum2,LSL #8                       ;// sum1 = [dcba]
468        STR     sum1,[pDstTemp]                             ;// store the 4 clipped bytes of this row as one word
469
470        ADD     pDstTemp,pDstTemp,dstStep                   ;// Increment pDst ptr
471        BGT     PredPlusDeltaLoop
472
473
474        ;// Set return value
475        MOV     result,#OMX_Sts_NoErr
476
477End
478        ;// NOTE(review): no branch to the End label appears in this function
479
480        ;// Write function tail
481
482        M_END
483
484    ENDIF                                                    ;//ARM1136JS
485
486
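487;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
488;// NOTE(review): no further implementation follows this header before END; only the ARM1136JS variant of this function is defined in this file.
489;// Guarding implementation by the processor name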
490
491
492
493
494    END
495