omxVCM4P10_TransformDequantChromaDCFromPair_s.s revision 0c1bc742181ded4930842b46e9507372f0b1b963
1;//
2;//
3;// File Name:  omxVCM4P10_TransformDequantChromaDCFromPair_s.s
4;// OpenMAX DL: v1.0.2
5;// Revision:   12290
6;// Date:       Wednesday, April 9, 2008
7;//
8;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
9;//
10;//
11;//
12
13
14        INCLUDE omxtypes_s.h
15        INCLUDE armCOMM_s.h
16
17        IMPORT armVCM4P10_QPDivTable
18        IMPORT armVCM4P10_VMatrixQPModTable
19
20        M_VARIANTS CortexA8
21
22
23
24
25    IF CortexA8
26
27;// ARM Registers
;//
;// Several aliases below name the same physical register (r0, r2, r4, r5,
;// r6 and r9 are each used under more than one name). The function body
;// uses each alias in a different phase, so the sharing is safe only as
;// long as the instruction order in the function is preserved.
28;//--------------------------------------
29;// Declare input registers
30;//--------------------------------------
31ppSrc       RN 0    ;// Address of the stream pointer: dereferenced on entry, advanced pointer stored back
32pDst        RN 1    ;// Destination buffer; receives four 16-bit coefficients
33QP          RN 2    ;// Index into both QP look-up tables (r2 is later reused as Scale)
34
35;//--------------------------------
36;// Scratch variable for Unpack2x2
37;//--------------------------------
38pSrc        RN 9    ;// Running byte pointer into the packed stream (r9 is later reused as Shift)
39Value       RN 4    ;// Coefficient value being assembled; its low 16 bits are stored
40Value2      RN 5    ;// High byte (sign-extended) of a two-byte value
41Flag        RN 6    ;// Current flag byte: bits 0-3 index, 0x10 two-byte value, 0x20 last entry
42strOffset   RN 7    ;// Byte offset into pDst for the current coefficient
43cstOffset   RN 8    ;// Constant mask (31) used to derive strOffset from Flag
44
45;//--------------------------------
46;// Scratch variable
47;//--------------------------------
48r0w0        RN  3    ;// First word loaded from pDst  (coefficients c0, c1)
49r0w1        RN  4    ;// Second word loaded from pDst (coefficients c2, c3); same register as Value
50
51c0w0        RN  5    ;// Packed halfword sums of r0w0 and r0w1; same register as Value2/pQPDivTable
52c1w0        RN  6    ;// Packed halfword differences of r0w0 and r0w1; same register as Flag/pQPModTable
53
54return      RN  0    ;// Status code returned to the caller; same register as ppSrc
55pQPDivTable RN  5    ;// Base address of armVCM4P10_QPDivTable
56pQPModTable    RN  6    ;// Base address of armVCM4P10_VMatrixQPModTable
57Shift        RN  9    ;// Byte read from the QPDiv table; same register as pSrc
58Scale        RN  2    ;// Byte read from the QPMod table, then shifted left by Shift; overwrites QP
59
60
61
62;// Neon Registers
;//
;// dZero and dInvTrCoeff are two typed views of the same register D0: it
;// is used as a zero source first and as the transform result later.
;// Q1 is the register pair D2:D3, so dDqntCoeff (D2) overlaps the low
;// half of qDqntCoeff.
63
64dZero       DN  D0.U16    ;// All-zero vector used to clear the destination buffer
65dInvTrCoeff DN  D0.S16    ;// Four signed 16-bit inverse-transformed coefficients
66dScale      DN  D1.S16    ;// Scale factor replicated into every 16-bit lane
67qDqntCoeff  QN  Q1.S32    ;// Four 32-bit products coefficient * scale
68dDqntCoeff  DN  D2.S16    ;// Products narrowed back to four 16-bit results
69
70
71        ;// Write function header
        ;//
        ;// omxVCM4P10_TransformDequantChromaDCFromPair
        ;//
        ;// Unpacks a packed (flag, value) coefficient stream into a 2x2 block
        ;// of 16-bit chroma DC coefficients, applies the 2x2 inverse transform
        ;// and dequantises the result in place.
        ;//
        ;// In:   r0 = ppSrc  address of the stream pointer; the pointer is read
        ;//                  on entry and written back advanced past the bytes
        ;//                  that were consumed
        ;//       r1 = pDst   destination for four 16-bit coefficients
        ;//       r2 = QP     index into armVCM4P10_QPDivTable and
        ;//                  armVCM4P10_VMatrixQPModTable
        ;// Out:  r0 = OMX_Sts_NoErr (no other status is produced here)
        ;//
        ;// Stream entry layout, as decoded by the loop below:
        ;//       flag byte : bits 0-3 = coefficient index
        ;//                   bit  4   (0x10) = value occupies two bytes
        ;//                   bit  5   (0x20) = this is the last entry
        ;//       value     : one sign-extended byte, or two bytes low byte first
        ;//
        ;// M_START / M_END are macros that are not defined in this file
        ;// (presumably from armCOMM_s.h). NOTE(review): the "r9" argument
        ;// presumably asks the macro to preserve callee-saved registers up to
        ;// r9 - confirm against the macro definition.
72        M_START omxVCM4P10_TransformDequantChromaDCFromPair, r9
73
74        LDR     pSrc, [ppSrc]                        ;// Load pSrc
75        VMOV    dZero, #0                            ;// D0 = 0, source for clearing pDst
76        MOV     cstOffset, #31                       ;// To be used in the loop, to compute offset
77
78        ;//-----------------------------------------------------------------------
79        ;// Firstly, fill all the coefficient values on the <pDst> buffer by zero
80        ;//-----------------------------------------------------------------------
81
82        VST1    dZero,[pDst]                         ;// pDst[0]  = pDst[1]  = pDst[2]  = pDst[3]  = 0
83        LDRB     Flag,  [pSrc], #1                   ;// Preload <Flag> before <unpackLoop>
84
85
        ;// Loop invariant: Flag holds the flag byte of the current entry and
        ;// pSrc points at its value byte(s). The flags set by each TST stay
        ;// live across the conditional instructions that follow it, because
        ;// none of the instructions in between updates the condition flags.
        ;// NOTE(review): the 4-bit index allows store offsets up to 30 bytes,
        ;// while only the first 8 bytes of pDst are cleared and read back
        ;// afterwards; presumably valid streams only carry indices 0..3 -
        ;// confirm against the callers.
86unpackLoop
87        TST      Flag,  #0x10                        ;// Computing (Flag & 0x10)
88        LDRSBNE  Value2,[pSrc,#1]                    ;// 2-byte form: read the signed high byte before pSrc is advanced
89        LDRBNE   Value, [pSrc], #2                   ;// Load byte wise to avoid unaligned access
90        AND      strOffset, cstOffset, Flag, LSL #1  ;// strOffset = (Flag & 15) << 1 (byte offset of the 16-bit slot)
91        LDRSBEQ  Value, [pSrc], #1                   ;// 1-byte form: Value = sign-extended byte *pSrc++
92        ORRNE    Value,Value,Value2, LSL #8          ;// 2-byte form: Value = low byte | (high byte << 8)
93
94        TST      Flag,  #0x20                        ;// Computing (Flag & 0x20) to check, if we're done
95        LDRBEQ   Flag,  [pSrc], #1                   ;// Flag  = (OMX_U8) *pSrc++, for next iteration
96        STRH     Value, [pDst, strOffset]            ;// Store <Value> at offset <strOffset>
97        BEQ      unpackLoop                          ;// Branch to the loop beginning
98
99        ;//--------------------------------------------------
100        ;//InvTransformDC2x2: Inlined (Implemented in ARM V6)
101        ;//--------------------------------------------------
        ;// Halfword pairs below are written [ top, bottom ]. With the four
        ;// stored coefficients named c0..c3 the stage computes
        ;//   c0w0 = [ c0+c1+c2+c3, c0-c1+c2-c3 ]
        ;//   c1w0 = [ c0+c1-c2-c3, c0-c1-c2+c3 ]
102
103        LDMIA    pDst, {r0w0, r0w1}                  ;// r0w0 = |c1|c0| & r0w1 = |c3|c2|
104
105        STR      pSrc, [ppSrc]                       ;// Update the bitstream pointer
        ;// pSrc (r9) is dead from here on; r9 is reused as Shift below.
106
107        LDR      pQPDivTable, =armVCM4P10_QPDivTable ;// QP Division look-up-table base pointer
108        LDR      pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer
109
110        SADDSUBX r0w0, r0w0,  r0w0                   ;// [ c00+c01, c00-c01 ]
111        SADDSUBX r0w1, r0w1,  r0w1                   ;// [ c10+c11, c10-c11 ]
112
        ;// Both table reads must use QP before Scale overwrites r2, and must
        ;// happen before c0w0/c1w0 overwrite the table base registers r5/r6.
113        LDRSB    Shift, [pQPDivTable, QP]            ;// Shift = pQPDivTable[QP]
114        LDRSB    Scale, [pQPModTable, QP]            ;// Scale = pQPModTable[QP]
115
116        SADD16   c0w0, r0w0, r0w1                    ;// [ d00+d10, d01+d11 ]
117        SSUB16   c1w0, r0w0, r0w1                    ;// [ d00-d10, d01-d11 ]
118
119        ;//-------------------------------------------------
120        ;//DequantChromaDC2x2: Inlined (Neon Implementation)
121        ;//-------------------------------------------------
        ;// Each coefficient becomes (coefficient * Scale) >> 1, narrowed to
        ;// 16 bits. NOTE(review): only the low 16 bits of Scale reach the
        ;// vector lanes, so Scale << Shift is presumably expected to fit in a
        ;// signed 16-bit value - confirm against the table contents.
122
123        LSL      Scale, Scale, Shift                 ;// Scale = Scale << Shift
124        VMOV     dInvTrCoeff, c0w0, c1w0             ;// D0 low word = c0w0, high word = c1w0
125        VREV32   dInvTrCoeff,dInvTrCoeff             ;// Swap the halfwords in each word so the top halves come first
126        VDUP     dScale,Scale                        ;// Replicate Scale into all four 16-bit lanes
127
128        VMULL    qDqntCoeff,dInvTrCoeff,dScale       ;// Widening multiply: four 32-bit products
129        VSHRN    dDqntCoeff,qDqntCoeff,#1            ;// Shift each product right by 1 and narrow to 16 bits
130
131
132        VST1     dDqntCoeff,[pDst]                   ;// Storing all the coefficients at once
133
134        MOV      return, #OMX_Sts_NoErr              ;// Return status in r0
135        M_END
136
137    ENDIF ;// CortexA8
138
139
140    END
141