10c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
278e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// Copyright (C) 2007 ARM Limited
378e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//
478e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// Licensed under the Apache License, Version 2.0 (the "License");
578e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// you may not use this file except in compliance with the License.
678e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// You may obtain a copy of the License at
778e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//
878e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//      http://www.apache.org/licenses/LICENSE-2.0
978e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//
1078e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// Unless required by applicable law or agreed to in writing, software
1178e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// distributed under the License is distributed on an "AS IS" BASIS,
1278e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1378e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// See the License for the specific language governing permissions and
1478e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// limitations under the License.
1578e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//
1678e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//
170c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
180c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Description:
190c1bc742181ded4930842b46e9507372f0b1b963James Dong;// H.264 inverse quantize and transform module
200c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
210c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
220c1bc742181ded4930842b46e9507372f0b1b963James Dong
230c1bc742181ded4930842b46e9507372f0b1b963James Dong
240c1bc742181ded4930842b46e9507372f0b1b963James Dong
250c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Include standard headers
260c1bc742181ded4930842b46e9507372f0b1b963James Dong
270c1bc742181ded4930842b46e9507372f0b1b963James Dong        INCLUDE omxtypes_s.h
280c1bc742181ded4930842b46e9507372f0b1b963James Dong        INCLUDE armCOMM_s.h
290c1bc742181ded4930842b46e9507372f0b1b963James Dong
300c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Import symbols required from other files
310c1bc742181ded4930842b46e9507372f0b1b963James Dong;// (For example tables)
320c1bc742181ded4930842b46e9507372f0b1b963James Dong
330c1bc742181ded4930842b46e9507372f0b1b963James Dong        IMPORT armVCM4P10_UnpackBlock4x4
340c1bc742181ded4930842b46e9507372f0b1b963James Dong        IMPORT armVCM4P10_TransformResidual4x4
350c1bc742181ded4930842b46e9507372f0b1b963James Dong        IMPORT armVCM4P10_QPDivTable
360c1bc742181ded4930842b46e9507372f0b1b963James Dong        IMPORT armVCM4P10_VMatrixU16
370c1bc742181ded4930842b46e9507372f0b1b963James Dong        IMPORT armVCM4P10_QPModuloTable
380c1bc742181ded4930842b46e9507372f0b1b963James Dong
390c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_VARIANTS ARM1136JS, ARM1136JS_U
400c1bc742181ded4930842b46e9507372f0b1b963James Dong
410c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Set debugging level
420c1bc742181ded4930842b46e9507372f0b1b963James Dong;//DEBUG_ON    SETL {TRUE}
430c1bc742181ded4930842b46e9507372f0b1b963James Dong
440c1bc742181ded4930842b46e9507372f0b1b963James Dong
450c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Static Function: armVCM4P10_DequantLumaAC4x4
460c1bc742181ded4930842b46e9507372f0b1b963James Dong
470c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Guarding implementation by the processor name
480c1bc742181ded4930842b46e9507372f0b1b963James Dong
490c1bc742181ded4930842b46e9507372f0b1b963James Dong    IF  ARM1136JS
500c1bc742181ded4930842b46e9507372f0b1b963James Dong;// armVCM4P10_DequantLumaAC4x4, ARM1136JS build: scales the 16 halfwords at pSrcDst in place.
510c1bc742181ded4930842b46e9507372f0b1b963James Dong;//Input Registers
520c1bc742181ded4930842b46e9507372f0b1b963James DongpSrcDst       RN  0                     ;// r0: address of the 16 packed halfwords (read, then overwritten)
530c1bc742181ded4930842b46e9507372f0b1b963James DongQP            RN  1                     ;// r1: index used for the QP modulo/div byte-table lookups
540c1bc742181ded4930842b46e9507372f0b1b963James Dong
550c1bc742181ded4930842b46e9507372f0b1b963James Dong
560c1bc742181ded4930842b46e9507372f0b1b963James Dong;//Output Registers
570c1bc742181ded4930842b46e9507372f0b1b963James Dong;// (none declared; results go back to memory through pSrcDst)
580c1bc742181ded4930842b46e9507372f0b1b963James Dong
590c1bc742181ded4930842b46e9507372f0b1b963James Dong;//Local Scratch Registers
600c1bc742181ded4930842b46e9507372f0b1b963James DongpQPdiv          RN  4                   ;// address of armVCM4P10_QPDivTable
610c1bc742181ded4930842b46e9507372f0b1b963James DongpQPmod          RN  5                   ;// address of armVCM4P10_QPModuloTable
620c1bc742181ded4930842b46e9507372f0b1b963James DongpVRow           RN  2                   ;// address of armVCM4P10_VMatrixU16, then of the selected row
630c1bc742181ded4930842b46e9507372f0b1b963James DongQPmod           RN  6                   ;// byte offset of the selected row inside the V matrix
640c1bc742181ded4930842b46e9507372f0b1b963James Dongshift           RN  3                   ;// left-shift count applied to the row values
650c1bc742181ded4930842b46e9507372f0b1b963James DongrowLuma01       RN  1                   ;// [pVRow[1]|pVRow[0]] << shift; reuses r1 once QP has been consumed
660c1bc742181ded4930842b46e9507372f0b1b963James DongrowLuma23       RN  4                   ;// pVRow[2] << shift in the bottom half; reuses r4 once pQPdiv has been consumed
670c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Each SrcDst<y><x> register holds two adjacent source halfwords (element x in the bottom half)
680c1bc742181ded4930842b46e9507372f0b1b963James DongSrcDst00        RN  5                   ;// reuses r5 after pQPmod is dead
690c1bc742181ded4930842b46e9507372f0b1b963James DongSrcDst02        RN  6                   ;// reuses r6 after QPmod is dead
700c1bc742181ded4930842b46e9507372f0b1b963James DongSrcDst10        RN  7
710c1bc742181ded4930842b46e9507372f0b1b963James DongSrcDst12        RN  8
720c1bc742181ded4930842b46e9507372f0b1b963James DongSrcDst20        RN  9
730c1bc742181ded4930842b46e9507372f0b1b963James DongSrcDst22        RN  10
740c1bc742181ded4930842b46e9507372f0b1b963James DongSrcDst30        RN  11
750c1bc742181ded4930842b46e9507372f0b1b963James DongSrcDst32        RN  12
760c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Product temporaries; temp1/temp2 reuse r2/r3 once pVRow/shift are no longer needed
770c1bc742181ded4930842b46e9507372f0b1b963James Dongtemp1           RN  2
780c1bc742181ded4930842b46e9507372f0b1b963James Dongtemp2           RN  3
790c1bc742181ded4930842b46e9507372f0b1b963James Dongtemp3           RN  14                  ;// r14 is also the link register; NOTE(review): presumably preserved by M_START/M_END - confirm
800c1bc742181ded4930842b46e9507372f0b1b963James Dong
810c1bc742181ded4930842b46e9507372f0b1b963James Dong
820c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Allocate stack memory required by the function (no M_ALLOC is used for this routine)
830c1bc742181ded4930842b46e9507372f0b1b963James Dong
840c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Write function header
850c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_START armVCM4P10_DequantLumaAC4x4,r11
860c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Fetch the addresses of the three lookup tables
870c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR    pQPmod,=armVCM4P10_QPModuloTable
880c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR    pQPdiv,=armVCM4P10_QPDivTable
890c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR    pVRow,=armVCM4P10_VMatrixU16
900c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Both byte tables are indexed directly by QP
910c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDRSB  QPmod,[pQPmod,QP]                    ;// QPmod = (QP%6) * 6, used below as a byte offset from pVRow
920c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDRSB  shift,[pQPdiv,QP]                    ;// Shift = QP / 6
930c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Fetch pVRow[0..2] of the selected row one halfword at a time (QP in r1 is overwritten here)
940c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDRH    rowLuma01,[pVRow,QPmod]!             ;// rowLuma01 = [00|0a]; writeback advances pVRow by QPmod
950c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDRH    temp3,[pVRow,#2]                     ;// temp3     = [00|0b]
960c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDRH    rowLuma23,[pVRow,#4]                 ;// rowLuma23 = [00|0c]
970c1bc742181ded4930842b46e9507372f0b1b963James Dong        ORR     rowLuma01,rowLuma01,temp3,LSL #16    ;// rowLuma01 = [0b|0a]
980c1bc742181ded4930842b46e9507372f0b1b963James Dong
990c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Load all the 16 'src' values (two halfwords per register, pSrcDst itself is not advanced)
1000c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
1010c1bc742181ded4930842b46e9507372f0b1b963James Dong
1020c1bc742181ded4930842b46e9507372f0b1b963James Dong
1030c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//*********************************************************************************************
1040c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
1050c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// 'Shift' ranges between [0,8]
1060c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
1070c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
1080c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//*********************************************************************************************
1090c1bc742181ded4930842b46e9507372f0b1b963James Dong
1100c1bc742181ded4930842b46e9507372f0b1b963James Dong        LSL    rowLuma01,rowLuma01,shift            ;// rowLuma01 = [0b|0a] << Shift
1110c1bc742181ded4930842b46e9507372f0b1b963James Dong        LSL    rowLuma23,rowLuma23,shift            ;// rowLuma23 = [00|0c] << Shift
1120c1bc742181ded4930842b46e9507372f0b1b963James Dong
1130c1bc742181ded4930842b46e9507372f0b1b963James Dong
1140c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//**********************************************************************************************
1150c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
1160c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// The idea is to unroll the Loop completely
1170c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
1180c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
1190c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
1200c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
1210c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// We first calculate (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift), which fits into 16 bits (above)
1220c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
1230c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
1240c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
1250c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// We then pack the two 16-bit multiplication results into a word and store all words in one go
1260c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
1270c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//**********************************************************************************************
1280c1bc742181ded4930842b46e9507372f0b1b963James Dong
1290c1bc742181ded4930842b46e9507372f0b1b963James Dong
1300c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Row 1
1310c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pSrcDst[0..3]: even elements use pVRow[0], odd elements use pVRow[2]
1320c1bc742181ded4930842b46e9507372f0b1b963James Dong
1330c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  temp1,SrcDst00,rowLuma23                    ;// pSrcDst[1] * (pVRow[2]<<Shift)
1340c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  SrcDst00,SrcDst00,rowLuma01                 ;// pSrcDst[0] * (pVRow[0]<<Shift)
1350c1bc742181ded4930842b46e9507372f0b1b963James Dong
1360c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  temp2,SrcDst02,rowLuma23                    ;// pSrcDst[3] * (pVRow[2]<<Shift)
1370c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  SrcDst02,SrcDst02,rowLuma01                 ;// pSrcDst[2] * (pVRow[0]<<Shift)
1380c1bc742181ded4930842b46e9507372f0b1b963James Dong
1390c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   SrcDst00,SrcDst00,temp1,LSL #16             ;// Pack the first two product values (low 16 bits of each)
1400c1bc742181ded4930842b46e9507372f0b1b963James Dong
1410c1bc742181ded4930842b46e9507372f0b1b963James Dong
1420c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Row 2 - pSrcDst[4..7]: even elements use pVRow[2], odd elements use pVRow[1]
1430c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTT  temp1,SrcDst10,rowLuma01                    ;// pSrcDst[5] * (pVRow[1]<<Shift)
1440c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  SrcDst10,SrcDst10,rowLuma23                 ;// pSrcDst[4] * (pVRow[2]<<Shift)
1450c1bc742181ded4930842b46e9507372f0b1b963James Dong
1460c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   SrcDst02,SrcDst02,temp2,LSL #16             ;// Pack the next two product values
1470c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTT  temp2,SrcDst12,rowLuma01                    ;// pSrcDst[7] * (pVRow[1]<<Shift)
1480c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  SrcDst12,SrcDst12,rowLuma23                    ;// pSrcDst[6] * (pVRow[2]<<Shift)
1490c1bc742181ded4930842b46e9507372f0b1b963James Dong
1500c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   SrcDst10,SrcDst10,temp1,LSL #16             ;// Pack the next two product values
1510c1bc742181ded4930842b46e9507372f0b1b963James Dong
1520c1bc742181ded4930842b46e9507372f0b1b963James Dong
1530c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Row 3
1540c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pSrcDst[8..11]: same factors as Row 1
1550c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  temp1,SrcDst20,rowLuma23                    ;// pSrcDst[9] * (pVRow[2]<<Shift)
1560c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  SrcDst20,SrcDst20,rowLuma01                    ;// pSrcDst[8] * (pVRow[0]<<Shift)
1570c1bc742181ded4930842b46e9507372f0b1b963James Dong
1580c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   SrcDst12,SrcDst12,temp2,LSL #16               ;// Pack the next two product values
1590c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  temp2,SrcDst22,rowLuma23                    ;// pSrcDst[11] * (pVRow[2]<<Shift)
1600c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  SrcDst22,SrcDst22,rowLuma01                    ;// pSrcDst[10] * (pVRow[0]<<Shift)
1610c1bc742181ded4930842b46e9507372f0b1b963James Dong
1620c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   SrcDst20,SrcDst20,temp1,LSL #16             ;// Pack the next two product values
1630c1bc742181ded4930842b46e9507372f0b1b963James Dong
1640c1bc742181ded4930842b46e9507372f0b1b963James Dong
1650c1bc742181ded4930842b46e9507372f0b1b963James Dong
1660c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Row 4
1670c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pSrcDst[12..15]: same factors as Row 2; temp3 (r14) provides a third temporary
1680c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTT  temp1,SrcDst30,rowLuma01                    ;// pSrcDst[13] * (pVRow[1]<<Shift)
1690c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  SrcDst30,SrcDst30,rowLuma23                    ;// pSrcDst[12] * (pVRow[2]<<Shift)
1700c1bc742181ded4930842b46e9507372f0b1b963James Dong
1710c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTT  temp3,SrcDst32,rowLuma01                    ;// pSrcDst[15] * (pVRow[1]<<Shift)
1720c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  SrcDst32,SrcDst32,rowLuma23                    ;// pSrcDst[14] * (pVRow[2]<<Shift)
1730c1bc742181ded4930842b46e9507372f0b1b963James Dong
1740c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   SrcDst22,SrcDst22,temp2,LSL #16             ;// Pack the remaining product values
1750c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   SrcDst30,SrcDst30,temp1,LSL #16             ;// [pSrcDst[13] product | pSrcDst[12] product]
1760c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   SrcDst32,SrcDst32,temp3,LSL #16             ;// [pSrcDst[15] product | pSrcDst[14] product]
1770c1bc742181ded4930842b46e9507372f0b1b963James Dong
1780c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Write all 16 scaled values back over the source block
1790c1bc742181ded4930842b46e9507372f0b1b963James Dong        STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
1800c1bc742181ded4930842b46e9507372f0b1b963James Dong
1810c1bc742181ded4930842b46e9507372f0b1b963James Dong
1820c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Set return value
1830c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// (no instruction is emitted for this step in this routine)
1840c1bc742181ded4930842b46e9507372f0b1b963James Dong
1850c1bc742181ded4930842b46e9507372f0b1b963James Dong
1860c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Write function tail
1870c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_END
1880c1bc742181ded4930842b46e9507372f0b1b963James Dong
1890c1bc742181ded4930842b46e9507372f0b1b963James Dong    ENDIF                                                    ;//ARM1136JS
1900c1bc742181ded4930842b46e9507372f0b1b963James Dong
1910c1bc742181ded4930842b46e9507372f0b1b963James Dong
1920c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Guarding implementation by the processor name
1930c1bc742181ded4930842b46e9507372f0b1b963James Dong
1940c1bc742181ded4930842b46e9507372f0b1b963James Dong    IF  ARM1136JS_U
1950c1bc742181ded4930842b46e9507372f0b1b963James Dong;// armVCM4P10_DequantLumaAC4x4, ARM1136JS_U build: scales the 16 halfwords at pSrcDst in place.
1960c1bc742181ded4930842b46e9507372f0b1b963James Dong;//Input Registers
1970c1bc742181ded4930842b46e9507372f0b1b963James DongpSrcDst       RN  0                     ;// r0: address of the 16 packed halfwords (read, then overwritten)
1980c1bc742181ded4930842b46e9507372f0b1b963James DongQP            RN  1                     ;// r1: index used for the QP modulo/div byte-table lookups
1990c1bc742181ded4930842b46e9507372f0b1b963James Dong
2000c1bc742181ded4930842b46e9507372f0b1b963James Dong
2010c1bc742181ded4930842b46e9507372f0b1b963James Dong;//Output Registers
2020c1bc742181ded4930842b46e9507372f0b1b963James Dong;// (none declared; results go back to memory through pSrcDst)
2030c1bc742181ded4930842b46e9507372f0b1b963James Dong
2040c1bc742181ded4930842b46e9507372f0b1b963James Dong;//Local Scratch Registers
2050c1bc742181ded4930842b46e9507372f0b1b963James DongpQPdiv          RN  4                   ;// address of armVCM4P10_QPDivTable
2060c1bc742181ded4930842b46e9507372f0b1b963James DongpQPmod          RN  5                   ;// address of armVCM4P10_QPModuloTable
2070c1bc742181ded4930842b46e9507372f0b1b963James DongpVRow           RN  2                   ;// address of armVCM4P10_VMatrixU16, then of the selected row
2080c1bc742181ded4930842b46e9507372f0b1b963James DongQPmod           RN  6                   ;// byte offset of the selected row inside the V matrix
2090c1bc742181ded4930842b46e9507372f0b1b963James Dongshift           RN  3                   ;// left-shift count applied to the row values
2100c1bc742181ded4930842b46e9507372f0b1b963James DongrowLuma01       RN  1                   ;// [pVRow[1]|pVRow[0]] << shift; reuses r1 once QP has been consumed
2110c1bc742181ded4930842b46e9507372f0b1b963James DongrowLuma23       RN  4                   ;// pVRow[2] << shift in the bottom half (top half unused); reuses r4 once pQPdiv has been consumed
2120c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Each SrcDst<y><x> register holds two adjacent source halfwords (element x in the bottom half)
2130c1bc742181ded4930842b46e9507372f0b1b963James DongSrcDst00        RN  5                   ;// reuses r5 after pQPmod is dead
2140c1bc742181ded4930842b46e9507372f0b1b963James DongSrcDst02        RN  6                   ;// reuses r6 after QPmod is dead
2150c1bc742181ded4930842b46e9507372f0b1b963James DongSrcDst10        RN  7
2160c1bc742181ded4930842b46e9507372f0b1b963James DongSrcDst12        RN  8
2170c1bc742181ded4930842b46e9507372f0b1b963James DongSrcDst20        RN  9
2180c1bc742181ded4930842b46e9507372f0b1b963James DongSrcDst22        RN  10
2190c1bc742181ded4930842b46e9507372f0b1b963James DongSrcDst30        RN  11
2200c1bc742181ded4930842b46e9507372f0b1b963James DongSrcDst32        RN  12
2210c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Product temporaries; temp1/temp2 reuse r2/r3 once pVRow/shift are no longer needed
2220c1bc742181ded4930842b46e9507372f0b1b963James Dongtemp1           RN  2
2230c1bc742181ded4930842b46e9507372f0b1b963James Dongtemp2           RN  3
2240c1bc742181ded4930842b46e9507372f0b1b963James Dongtemp3           RN  14                  ;// r14 is also the link register; NOTE(review): presumably preserved by M_START/M_END - confirm
2250c1bc742181ded4930842b46e9507372f0b1b963James Dong
2260c1bc742181ded4930842b46e9507372f0b1b963James Dong
2270c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Allocate stack memory required by the function (no M_ALLOC is used for this routine)
2280c1bc742181ded4930842b46e9507372f0b1b963James Dong
2290c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Write function header
2300c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_START armVCM4P10_DequantLumaAC4x4,r11
2310c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Fetch the addresses of the three lookup tables
2320c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR    pQPmod,=armVCM4P10_QPModuloTable
2330c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR    pQPdiv,=armVCM4P10_QPDivTable
2340c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR    pVRow,=armVCM4P10_VMatrixU16
2350c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Both byte tables are indexed directly by QP
2360c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDRSB  QPmod,[pQPmod,QP]                    ;// QPmod = (QP%6) * 6, used below as a byte offset from pVRow
2370c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDRSB  shift,[pQPdiv,QP]                    ;// Shift = QP / 6
2380c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Fetch the row with two word loads. NOTE(review): with QPmod = (QP%6)*6 these can be non-word-aligned, presumably why this is the _U variant - confirm
2390c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR    rowLuma01,[pVRow,QPmod]!             ;// rowLuma01 = [0b|0a]; writeback advances pVRow by QPmod
2400c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR    rowLuma23,[pVRow,#4]                 ;// rowLuma23 = [0d|0c]; 0d (the halfword after pVRow[2]) is never used
2410c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// NOTE(review): the load above reads one halfword beyond pVRow[2]; for the last table row confirm that memory is readable
2420c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Load all the 16 'src' values (two halfwords per register, pSrcDst itself is not advanced)
2430c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
2440c1bc742181ded4930842b46e9507372f0b1b963James Dong
2450c1bc742181ded4930842b46e9507372f0b1b963James Dong
2460c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//*********************************************************************************************
2470c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
2480c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// 'Shift' ranges between [0,8]
2490c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
2500c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
2510c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//*********************************************************************************************
2520c1bc742181ded4930842b46e9507372f0b1b963James Dong
2530c1bc742181ded4930842b46e9507372f0b1b963James Dong        LSL    rowLuma01,rowLuma01,shift            ;// rowLuma01 = [0b|0a] << Shift
2540c1bc742181ded4930842b46e9507372f0b1b963James Dong        LSL    rowLuma23,rowLuma23,shift            ;// rowLuma23 = [0d|0c] << Shift (only the bottom half is read later)
2550c1bc742181ded4930842b46e9507372f0b1b963James Dong
2560c1bc742181ded4930842b46e9507372f0b1b963James Dong
2570c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//**********************************************************************************************
2580c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
2590c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// The idea is to unroll the Loop completely
2600c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
2610c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
2620c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
2630c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
2640c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// We first calculate (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift), which fits into 16 bits (above)
2650c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
2660c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
2670c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
2680c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// We then pack the two 16-bit multiplication results into a word and store all words in one go
2690c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
2700c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//**********************************************************************************************
2710c1bc742181ded4930842b46e9507372f0b1b963James Dong
2720c1bc742181ded4930842b46e9507372f0b1b963James Dong
2730c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Row 1
2740c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pSrcDst[0..3]: even elements use pVRow[0], odd elements use pVRow[2]
2750c1bc742181ded4930842b46e9507372f0b1b963James Dong
2760c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  temp1,SrcDst00,rowLuma23                    ;// pSrcDst[1] * (pVRow[2]<<Shift)
2770c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  SrcDst00,SrcDst00,rowLuma01                 ;// pSrcDst[0] * (pVRow[0]<<Shift)
2780c1bc742181ded4930842b46e9507372f0b1b963James Dong
2790c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  temp2,SrcDst02,rowLuma23                    ;// pSrcDst[3] * (pVRow[2]<<Shift)
2800c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  SrcDst02,SrcDst02,rowLuma01                 ;// pSrcDst[2] * (pVRow[0]<<Shift)
2810c1bc742181ded4930842b46e9507372f0b1b963James Dong
2820c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   SrcDst00,SrcDst00,temp1,LSL #16             ;// Pack the first two product values (low 16 bits of each)
2830c1bc742181ded4930842b46e9507372f0b1b963James Dong
2840c1bc742181ded4930842b46e9507372f0b1b963James Dong
2850c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Row 2 - pSrcDst[4..7]: even elements use pVRow[2], odd elements use pVRow[1]
2860c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTT  temp1,SrcDst10,rowLuma01                    ;// pSrcDst[5] * (pVRow[1]<<Shift)
2870c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  SrcDst10,SrcDst10,rowLuma23                 ;// pSrcDst[4] * (pVRow[2]<<Shift)
2880c1bc742181ded4930842b46e9507372f0b1b963James Dong
2890c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   SrcDst02,SrcDst02,temp2,LSL #16             ;// Pack the next two product values
2900c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTT  temp2,SrcDst12,rowLuma01                    ;// pSrcDst[7] * (pVRow[1]<<Shift)
2910c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  SrcDst12,SrcDst12,rowLuma23                    ;// pSrcDst[6] * (pVRow[2]<<Shift)
2920c1bc742181ded4930842b46e9507372f0b1b963James Dong
2930c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   SrcDst10,SrcDst10,temp1,LSL #16             ;// Pack the next two product values
2940c1bc742181ded4930842b46e9507372f0b1b963James Dong
2950c1bc742181ded4930842b46e9507372f0b1b963James Dong
2960c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Row 3
2970c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pSrcDst[8..11]: same factors as Row 1
2980c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  temp1,SrcDst20,rowLuma23                    ;// pSrcDst[9] * (pVRow[2]<<Shift)
2990c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  SrcDst20,SrcDst20,rowLuma01                    ;// pSrcDst[8] * (pVRow[0]<<Shift)
3000c1bc742181ded4930842b46e9507372f0b1b963James Dong
3010c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   SrcDst12,SrcDst12,temp2,LSL #16               ;// Pack the next two product values
3020c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  temp2,SrcDst22,rowLuma23                    ;// pSrcDst[11] * (pVRow[2]<<Shift)
3030c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  SrcDst22,SrcDst22,rowLuma01                    ;// pSrcDst[10] * (pVRow[0]<<Shift)
3040c1bc742181ded4930842b46e9507372f0b1b963James Dong
3050c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   SrcDst20,SrcDst20,temp1,LSL #16             ;// Pack the next two product values
3060c1bc742181ded4930842b46e9507372f0b1b963James Dong
3070c1bc742181ded4930842b46e9507372f0b1b963James Dong
3080c1bc742181ded4930842b46e9507372f0b1b963James Dong
3090c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Row 4
3100c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pSrcDst[12..15]: same factors as Row 2; temp3 (r14) provides a third temporary
3110c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTT  temp1,SrcDst30,rowLuma01                    ;// pSrcDst[13] * (pVRow[1]<<Shift)
3120c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  SrcDst30,SrcDst30,rowLuma23                    ;// pSrcDst[12] * (pVRow[2]<<Shift)
3130c1bc742181ded4930842b46e9507372f0b1b963James Dong
3140c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTT  temp3,SrcDst32,rowLuma01                    ;// pSrcDst[15] * (pVRow[1]<<Shift)
3150c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  SrcDst32,SrcDst32,rowLuma23                    ;// pSrcDst[14] * (pVRow[2]<<Shift)
3160c1bc742181ded4930842b46e9507372f0b1b963James Dong
3170c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   SrcDst22,SrcDst22,temp2,LSL #16             ;// Pack the remaining product values
3180c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   SrcDst30,SrcDst30,temp1,LSL #16             ;// [pSrcDst[13] product | pSrcDst[12] product]
3190c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   SrcDst32,SrcDst32,temp3,LSL #16             ;// [pSrcDst[15] product | pSrcDst[14] product]
3200c1bc742181ded4930842b46e9507372f0b1b963James Dong
3210c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Write all 16 scaled values back over the source block
3220c1bc742181ded4930842b46e9507372f0b1b963James Dong        STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
3230c1bc742181ded4930842b46e9507372f0b1b963James Dong
3240c1bc742181ded4930842b46e9507372f0b1b963James Dong
3250c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Set return value
3260c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// (no instruction is emitted for this step in this routine)
3270c1bc742181ded4930842b46e9507372f0b1b963James Dong
3280c1bc742181ded4930842b46e9507372f0b1b963James Dong
3290c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Write function tail
3300c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_END
3310c1bc742181ded4930842b46e9507372f0b1b963James Dong
3320c1bc742181ded4930842b46e9507372f0b1b963James Dong    ENDIF                                                    ;//ARM1136JS_U
3330c1bc742181ded4930842b46e9507372f0b1b963James Dong
3340c1bc742181ded4930842b46e9507372f0b1b963James Dong
3350c1bc742181ded4930842b46e9507372f0b1b963James Dong
3360c1bc742181ded4930842b46e9507372f0b1b963James Dong
3370c1bc742181ded4930842b46e9507372f0b1b963James Dong
3380c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
3390c1bc742181ded4930842b46e9507372f0b1b963James Dong
3400c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Guarding implementation by the processor name
3410c1bc742181ded4930842b46e9507372f0b1b963James Dong
3420c1bc742181ded4930842b46e9507372f0b1b963James Dong    IF  ARM1136JS
3430c1bc742181ded4930842b46e9507372f0b1b963James Dong
;//--------------------------------------------------------------------------
;// omxVCM4P10_DequantTransformResidualFromPairAndAdd  (ARM1136JS build)
;//
;// Builds a 32-byte block of 16-bit residual values in a stack buffer, adds
;// it row by row to the prediction at pPred, clamps each sum to 0..255 and
;// writes four bytes per row to pDst (four rows in total).
;//
;// Two ways of filling the residual buffer:
;//   AC != 0 : armVCM4P10_UnpackBlock4x4, armVCM4P10_DequantLumaAC4x4, an
;//             optional overwrite of element 0 with *pDC (only when pDC is
;//             non-NULL), then armVCM4P10_TransformResidual4x4 in place.
;//   AC == 0 : every element is set to (*pDC + 32) >> 6.
;//
;// Register arguments : r0 = ppSrc, r1 = pPred, r2 = pDC, r3 = pDst
;// Stack arguments    : predStep, dstStep, QP, AC (declared with M_ARG below)
;// Return             : r0 = OMX_Sts_NoErr
;//
;// Several RN aliases below name the same physical register (r0, r1, r5, r6,
;// r8, r10 and r11 each have more than one name). The code relies on the
;// earlier value being dead before the register is reused under a new name,
;// so the instruction order must not be changed casually.
;//--------------------------------------------------------------------------
3440c1bc742181ded4930842b46e9507372f0b1b963James Dong;//Input Registers
3450c1bc742181ded4930842b46e9507372f0b1b963James DongppSrc       RN  0
3460c1bc742181ded4930842b46e9507372f0b1b963James DongpPred       RN  1
3470c1bc742181ded4930842b46e9507372f0b1b963James DongpDC         RN  2
3480c1bc742181ded4930842b46e9507372f0b1b963James DongpDst        RN  3
3490c1bc742181ded4930842b46e9507372f0b1b963James Dong
3500c1bc742181ded4930842b46e9507372f0b1b963James Dong
3510c1bc742181ded4930842b46e9507372f0b1b963James Dong;//Output Registers
3520c1bc742181ded4930842b46e9507372f0b1b963James Dongresult      RN  0
3530c1bc742181ded4930842b46e9507372f0b1b963James Dong
3540c1bc742181ded4930842b46e9507372f0b1b963James Dong;//Local Scratch Registers
;// pDelta (r4) holds the address of the stack buffer for the whole function.
;// pPredTemp/pDCTemp/pDstTemp (r7/r8/r9) hold copies of r1/r2/r3 taken before
;// the helper calls.
;// NOTE(review): pDeltaTmp is not referenced anywhere in this function body.
3550c1bc742181ded4930842b46e9507372f0b1b963James DongpDelta      RN  4
3560c1bc742181ded4930842b46e9507372f0b1b963James DongpDeltaTmp   RN  6
3570c1bc742181ded4930842b46e9507372f0b1b963James DongAC          RN  5                   ;//Load from stack
3580c1bc742181ded4930842b46e9507372f0b1b963James DongpPredTemp   RN  7
3590c1bc742181ded4930842b46e9507372f0b1b963James DongpDCTemp     RN  8
3600c1bc742181ded4930842b46e9507372f0b1b963James DongpDstTemp    RN  9
3610c1bc742181ded4930842b46e9507372f0b1b963James DongpDeltaArg1  RN  1
3620c1bc742181ded4930842b46e9507372f0b1b963James DongpDeltaArg0  RN  0
3630c1bc742181ded4930842b46e9507372f0b1b963James DongQP          RN  1                   ;//Load from stack
;// DCval/DCvalCopy are r10/r11: the consecutive even/odd pair stored by STRD.
3640c1bc742181ded4930842b46e9507372f0b1b963James DongDCval       RN  10
3650c1bc742181ded4930842b46e9507372f0b1b963James DongDCvalCopy   RN  11
;// Names used by the final add-and-store loop (they reuse registers above).
3660c1bc742181ded4930842b46e9507372f0b1b963James Dongpredstep    RN  1
3670c1bc742181ded4930842b46e9507372f0b1b963James DongdstStep     RN  10
3680c1bc742181ded4930842b46e9507372f0b1b963James Dongycounter    RN  0
3690c1bc742181ded4930842b46e9507372f0b1b963James DongPredVal1    RN  3
3700c1bc742181ded4930842b46e9507372f0b1b963James DongPredVal2    RN  5
3710c1bc742181ded4930842b46e9507372f0b1b963James DongDeltaVal1   RN  2
3720c1bc742181ded4930842b46e9507372f0b1b963James DongDeltaVal2   RN  11
3730c1bc742181ded4930842b46e9507372f0b1b963James DongPredVal     RN  8
3740c1bc742181ded4930842b46e9507372f0b1b963James DongtmpDeltaVal RN  6
3750c1bc742181ded4930842b46e9507372f0b1b963James Dongsum1        RN  12
3760c1bc742181ded4930842b46e9507372f0b1b963James Dongsum2        RN  14
3770c1bc742181ded4930842b46e9507372f0b1b963James Dong
3780c1bc742181ded4930842b46e9507372f0b1b963James Dong
3790c1bc742181ded4930842b46e9507372f0b1b963James Dong
3800c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Allocate stack memory required by the function
    ;// 32 bytes = 16 halfwords: the DC-only path fills offsets 0..31 and the
    ;// final loop reads 8 bytes per row for 4 rows.
    ;// NOTE(review): the "8" in M_ALLOC8 presumably requests 8-byte alignment;
    ;// confirm against the macro definition.
3810c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_ALLOC8 pBuffer, 32
3820c1bc742181ded4930842b46e9507372f0b1b963James Dong
3830c1bc742181ded4930842b46e9507372f0b1b963James Dong
3840c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Write function header
    ;// NOTE(review): the second M_START argument (r11) presumably names the
    ;// highest register the prologue must save; this body writes r4-r12 and
    ;// r14. Confirm against the macro definition.
3850c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11
3860c1bc742181ded4930842b46e9507372f0b1b963James Dong
3870c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Define stack arguments
3880c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_ARG   predStepOnStack, 4
3890c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_ARG   dstStepOnStack,4
3900c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_ARG   QPOnStack, 4
3910c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_ARG   ACOnStack,4
3920c1bc742181ded4930842b46e9507372f0b1b963James Dong
3930c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// pDelta = address of the local residual buffer; AC = stack argument.
3940c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_ADR   pDelta,pBuffer
3950c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR   AC,ACOnStack
3960c1bc742181ded4930842b46e9507372f0b1b963James Dong
3970c1bc742181ded4930842b46e9507372f0b1b963James Dong
3980c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Save registers r1,r2,r3 before function call
        ;// NOTE(review): relies on the helpers preserving r4 and r7-r9
        ;// (presumably per the ARM procedure call standard) - confirm.
3990c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     pPredTemp,pPred
4000c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     pDCTemp,pDC
4010c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     pDstTemp,pDst
4020c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// AC == 0 selects the DC-only path; otherwise run the full
        ;// unpack / dequantise / transform sequence below.
4030c1bc742181ded4930842b46e9507372f0b1b963James Dong        CMP     AC,#0
4040c1bc742181ded4930842b46e9507372f0b1b963James Dong        BEQ     DCcase
        ;// r0 still holds ppSrc from function entry, so only r1 is set here.
4050c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_UnpackBlock4x4
4060c1bc742181ded4930842b46e9507372f0b1b963James Dong
4070c1bc742181ded4930842b46e9507372f0b1b963James Dong        BL      armVCM4P10_UnpackBlock4x4
4080c1bc742181ded4930842b46e9507372f0b1b963James Dong
4090c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR   QP,QPOnStack                                ;// Set up r1 for DequantLumaAC4x4
4100c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     pDeltaArg0,pDelta                           ;// Set up r0 for DequantLumaAC4x4
4110c1bc742181ded4930842b46e9507372f0b1b963James Dong
4120c1bc742181ded4930842b46e9507372f0b1b963James Dong        BL      armVCM4P10_DequantLumaAC4x4
4130c1bc742181ded4930842b46e9507372f0b1b963James Dong
4140c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// If pDC is non-NULL, copy the halfword it points to into pDelta[0].
        ;// The two MOVs between the conditional load and store do not update
        ;// the flags, so the NE condition still reflects this CMP.
4150c1bc742181ded4930842b46e9507372f0b1b963James Dong        CMP     pDCTemp,#0
4160c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDRSHNE DCval,[pDCTemp]
4170c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     pDeltaArg0,pDelta                           ;// Set up r0 for armVCM4P10_TransformResidual4x4
4180c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_TransformResidual4x4
4190c1bc742181ded4930842b46e9507372f0b1b963James Dong        STRHNE  DCval,[pDelta]
4200c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Both arguments are pDelta: the helper receives the same buffer in
        ;// r0 and r1.
4210c1bc742181ded4930842b46e9507372f0b1b963James Dong        BL      armVCM4P10_TransformResidual4x4
4220c1bc742181ded4930842b46e9507372f0b1b963James Dong        B       OutDCcase
4230c1bc742181ded4930842b46e9507372f0b1b963James Dong
4240c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// DC-only path: DCval = (*pDC + 32) >> 6 (arithmetic shift), copied
        ;// into both halfwords of r10 and r11, then stored to all 32 bytes of
        ;// the buffer.
        ;// NOTE(review): pDCTemp is dereferenced here without a NULL check,
        ;// unlike the AC path above - confirm callers never pass pDC == NULL
        ;// together with AC == 0.
4250c1bc742181ded4930842b46e9507372f0b1b963James DongDCcase
4260c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDRSH   DCval,[pDCTemp]
4270c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD     DCval,DCval,#32
4280c1bc742181ded4930842b46e9507372f0b1b963James Dong        ASR     DCval,DCval,#6
4290c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   DCval,DCval,DCval,LSL #16                  ;// Duplicating the Lower halfword
4300c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     DCvalCopy, DCval                           ;// Needed for STRD
4310c1bc742181ded4930842b46e9507372f0b1b963James Dong        STRD    DCval, [pDelta, #0]                        ;// pDelta[0]  = pDelta[1]  = pDelta[2]  = pDelta[3] = DCval
4320c1bc742181ded4930842b46e9507372f0b1b963James Dong        STRD    DCval, [pDelta, #8]                        ;// pDelta[4]  = pDelta[5]  = pDelta[6]  = pDelta[7] = DCval
4330c1bc742181ded4930842b46e9507372f0b1b963James Dong        STRD    DCval, [pDelta, #16]                       ;// pDelta[8]  = pDelta[9]  = pDelta[10] = pDelta[11] = DCval
4340c1bc742181ded4930842b46e9507372f0b1b963James Dong        STRD    DCval, [pDelta, #24]                       ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval
4350c1bc742181ded4930842b46e9507372f0b1b963James Dong
4360c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Common tail: add the residual buffer to the prediction and store.
        ;// dstStep is r10 and PredVal is r8, so loading them overwrites DCval
        ;// and pDCTemp, which are no longer read after this point.
4370c1bc742181ded4930842b46e9507372f0b1b963James DongOutDCcase
4380c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR   predstep,predStepOnStack
4390c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR   dstStep,dstStepOnStack
4400c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Load the first row before entering the loop: two words (four
        ;// halfwords) of residual, with pDelta advanced by 8 bytes, and one
        ;// word of prediction.
4410c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDMIA   pDelta!,{tmpDeltaVal,DeltaVal2}             ;// Pre load
4420c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     ycounter,#4                                 ;// Counter for the PredPlusDeltaLoop
4430c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     PredVal,[pPredTemp]                         ;// Pre load
4440c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// One iteration per row, four iterations in total. The flags set by
        ;// SUBS are consumed by LDRGT, LDMGTIA and BGT further down; no
        ;// instruction in between updates the condition flags tested by GT,
        ;// so the next row is pre-loaded only while rows remain.
4450c1bc742181ded4930842b46e9507372f0b1b963James DongPredPlusDeltaLoop
4460c1bc742181ded4930842b46e9507372f0b1b963James Dong
4470c1bc742181ded4930842b46e9507372f0b1b963James Dong
4480c1bc742181ded4930842b46e9507372f0b1b963James Dong        SUBS    ycounter,ycounter,#1
4490c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD     pPredTemp,pPredTemp,predstep                ;// Increment pPred ptr
4500c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Regroup the four residual halfwords so they line up with the
        ;// even/odd byte split of the prediction word below.
4510c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   DeltaVal1,tmpDeltaVal,DeltaVal2,LSL #16     ;// Deltaval1 = [C A]
4520c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   DeltaVal2,DeltaVal2,tmpDeltaVal,ASR #16     ;// DeltaVal2 = [D B]
4530c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Zero-extend prediction bytes 0 and 2, then bytes 1 and 3, into
        ;// 16-bit lanes.
4540c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTB16  PredVal1,PredVal                            ;// PredVal1 = [0c0a]
4550c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTB16  PredVal2,PredVal,ROR #8                     ;// PredVal2 = [0d0b]
4560c1bc742181ded4930842b46e9507372f0b1b963James Dong
4570c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDRGT   PredVal,[pPredTemp]                         ;// Pre load
4580c1bc742181ded4930842b46e9507372f0b1b963James Dong
4590c1bc742181ded4930842b46e9507372f0b1b963James Dong        QADD16  sum2,DeltaVal2,PredVal2                     ;// Add and saturate to 16 bits
4600c1bc742181ded4930842b46e9507372f0b1b963James Dong        QADD16  sum1,DeltaVal1,PredVal1
4610c1bc742181ded4930842b46e9507372f0b1b963James Dong
4620c1bc742181ded4930842b46e9507372f0b1b963James Dong        USAT16  sum2,#8,sum2                                ;// armClip(0,255,sum2)
4630c1bc742181ded4930842b46e9507372f0b1b963James Dong        USAT16  sum1,#8,sum1
4640c1bc742181ded4930842b46e9507372f0b1b963James Dong
4650c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDMGTIA   pDelta!,{tmpDeltaVal,DeltaVal2}           ;// Pre load
4660c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Each lane is now 0..255, so shifting sum2 left by 8 interleaves
        ;// its two bytes with those of sum1 into a single word, which is
        ;// written to the destination with one word store.
4670c1bc742181ded4930842b46e9507372f0b1b963James Dong        ORR     sum1,sum1,sum2,LSL #8                       ;// sum1 = [dcba]
4680c1bc742181ded4930842b46e9507372f0b1b963James Dong        STR     sum1,[pDstTemp]
4690c1bc742181ded4930842b46e9507372f0b1b963James Dong
4700c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD     pDstTemp,pDstTemp,dstStep                   ;// Increment pDst ptr
4710c1bc742181ded4930842b46e9507372f0b1b963James Dong        BGT     PredPlusDeltaLoop
4720c1bc742181ded4930842b46e9507372f0b1b963James Dong
4730c1bc742181ded4930842b46e9507372f0b1b963James Dong
4740c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Set return value
        ;// Both paths reach this point; no other status is produced here.
4750c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     result,#OMX_Sts_NoErr
4760c1bc742181ded4930842b46e9507372f0b1b963James Dong
4770c1bc742181ded4930842b46e9507372f0b1b963James DongEnd
4780c1bc742181ded4930842b46e9507372f0b1b963James Dong
4790c1bc742181ded4930842b46e9507372f0b1b963James Dong
4800c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Write function tail
4810c1bc742181ded4930842b46e9507372f0b1b963James Dong
4820c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_END
4830c1bc742181ded4930842b46e9507372f0b1b963James Dong
4840c1bc742181ded4930842b46e9507372f0b1b963James Dong    ENDIF                                                    ;//ARM1136JS
4850c1bc742181ded4930842b46e9507372f0b1b963James Dong
4860c1bc742181ded4930842b46e9507372f0b1b963James Dong
4870c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
4880c1bc742181ded4930842b46e9507372f0b1b963James Dong
4890c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Guarding implementation by the processor name
4900c1bc742181ded4930842b46e9507372f0b1b963James Dong
4910c1bc742181ded4930842b46e9507372f0b1b963James Dong
4920c1bc742181ded4930842b46e9507372f0b1b963James Dong
4930c1bc742181ded4930842b46e9507372f0b1b963James Dong
4940c1bc742181ded4930842b46e9507372f0b1b963James Dong    END
495