;//
;// File:     omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
;// Revision: 0c1bc742181ded4930842b46e9507372f0b1b963 (James Dong)
;//
;// (c) Copyright 2007 ARM Limited. All Rights Reserved.
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;//


;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Import symbols required from other files
;// (For example tables)

        IMPORT armVCM4P10_UnpackBlock4x4
        IMPORT armVCM4P10_TransformResidual4x4
        IMPORT armVCM4P10_QPDivTable
        IMPORT armVCM4P10_VMatrixU16
        IMPORT armVCM4P10_QPModuloTable

;// Processor variants this file provides an implementation for.
;// Each function below is guarded by "IF <variant>" for these two names.
    M_VARIANTS ARM1136JS, ARM1136JS_U

;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;// Static Function: armVCM4P10_DequantLumaAC4x4
;//
;// Description:
;// In-place scaling of one 4x4 block of 16-bit coefficients.
;// Every coefficient pSrcDst[i] is replaced by the low 16 bits of
;//
;//     pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift)
;//
;// where
;//     pVRow = armVCM4P10_VMatrixU16 + armVCM4P10_QPModuloTable[QP]  (byte offset)
;//     Shift = armVCM4P10_QPDivTable[QP]
;//
;// Inputs:
;//     r0 (pSrcDst) : pointer to the 16 coefficients (8 words, accessed with LDMIA/STMIA)
;//     r1 (QP)      : quantization parameter, used as a byte index into both QP tables
;//
;// Outputs:
;//     The 8 words at pSrcDst are overwritten with the scaled coefficients.
;//     No return value is set by this code.
;//
;// Registers:
;//     r1-r12 and r14 are all written. M_START is passed r11 - presumably the
;//     macro preserves r4-r11 and lr (r14 is used as temp3); confirm in armCOMM_s.h.
;//
;// ARM1136JS variant: the three V-matrix entries are fetched with halfword loads.

;// Guarding implementation by the processor name

    IF  ARM1136JS

;//Input Registers
pSrcDst       RN  0
QP            RN  1


;//Output Registers


;//Local Scratch Registers
;// Several names share a physical register; each alias is only used once the
;// previous owner of that register is dead (see the comments in the body).
pQPdiv          RN  4
pQPmod          RN  5
pVRow           RN  2
QPmod           RN  6
shift           RN  3
rowLuma01       RN  1                   ;// reuses r1 after QP has been consumed
rowLuma23       RN  4                   ;// reuses r4 after pQPdiv has been consumed

;// SrcDst<y><x> holds coefficients (y,x) in its bottom half and (y,x+1) in its top half
SrcDst00        RN  5
SrcDst02        RN  6
SrcDst10        RN  7
SrcDst12        RN  8
SrcDst20        RN  9
SrcDst22        RN  10
SrcDst30        RN  11
SrcDst32        RN  12

temp1           RN  2                   ;// reuses r2 after the last pVRow access
temp2           RN  3                   ;// reuses r3 after the last use of shift
temp3           RN  14


        ;// Allocate stack memory required by the function

        ;// Write function header
        M_START armVCM4P10_DequantLumaAC4x4,r11

        LDR    pQPmod,=armVCM4P10_QPModuloTable
        LDR    pQPdiv,=armVCM4P10_QPDivTable
        LDR    pVRow,=armVCM4P10_VMatrixU16

        LDRSB  QPmod,[pQPmod,QP]                    ;// (QP%6) * 6
        LDRSB  shift,[pQPdiv,QP]                    ;// Shift = QP / 6

        ;// Writeback leaves pVRow pointing at the selected row, so the
        ;// following two loads use small immediate offsets from it.
        LDRH    rowLuma01,[pVRow,QPmod]!             ;// rowLuma01 = [00|0a]
        LDRH    temp3,[pVRow,#2]                     ;// temp3     = [00|0b]
        LDRH    rowLuma23,[pVRow,#4]                 ;// rowLuma23 = [00|0c]
        ORR     rowLuma01,rowLuma01,temp3,LSL #16    ;// rowLuma01 = [0b|0a]

        ;// Load all the 16 'src' values
        LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


        ;//*********************************************************************************************
        ;//
        ;// 'Shift' ranges between [0,8]
        ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
        ;// (the shifted low entry still fits in 16 bits, so it does not spill into the high entry)
        ;//
        ;//*********************************************************************************************

        LSL    rowLuma01,rowLuma01,shift
        LSL    rowLuma23,rowLuma23,shift


        ;//**********************************************************************************************
        ;//
        ;// The idea is to unroll the loop completely
        ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
        ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
        ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
        ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
        ;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift, which fits into 16 bits (above)
        ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
        ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
        ;//
        ;// We then pack the two 16 bit multiplication results into a word and store them in one go
        ;//
        ;//**********************************************************************************************


        ;// Row 1


        SMULTB  temp1,SrcDst00,rowLuma23                    ;// pSrcDst[1] * (pVRow[2]<<Shift)
        SMULBB  SrcDst00,SrcDst00,rowLuma01                 ;// pSrcDst[0] * (pVRow[0]<<Shift)

        SMULTB  temp2,SrcDst02,rowLuma23                    ;// pSrcDst[3] * (pVRow[2]<<Shift)
        SMULBB  SrcDst02,SrcDst02,rowLuma01                 ;// pSrcDst[2] * (pVRow[0]<<Shift)

        PKHBT   SrcDst00,SrcDst00,temp1,LSL #16             ;// Pack the first two product values


        ;// Row 2
        SMULTT  temp1,SrcDst10,rowLuma01                    ;// pSrcDst[5] * (pVRow[1]<<Shift)
        SMULBB  SrcDst10,SrcDst10,rowLuma23                 ;// pSrcDst[4] * (pVRow[2]<<Shift)

        PKHBT   SrcDst02,SrcDst02,temp2,LSL #16             ;// Pack the next two product values
        SMULTT  temp2,SrcDst12,rowLuma01                    ;// pSrcDst[7] * (pVRow[1]<<Shift)
        SMULBB  SrcDst12,SrcDst12,rowLuma23                 ;// pSrcDst[6] * (pVRow[2]<<Shift)

        PKHBT   SrcDst10,SrcDst10,temp1,LSL #16             ;// Pack the next two product values


        ;// Row 3

        SMULTB  temp1,SrcDst20,rowLuma23                    ;// pSrcDst[9] * (pVRow[2]<<Shift)
        SMULBB  SrcDst20,SrcDst20,rowLuma01                 ;// pSrcDst[8] * (pVRow[0]<<Shift)

        PKHBT   SrcDst12,SrcDst12,temp2,LSL #16             ;// Pack the next two product values
        SMULTB  temp2,SrcDst22,rowLuma23                    ;// pSrcDst[11] * (pVRow[2]<<Shift)
        SMULBB  SrcDst22,SrcDst22,rowLuma01                 ;// pSrcDst[10] * (pVRow[0]<<Shift)

        PKHBT   SrcDst20,SrcDst20,temp1,LSL #16             ;// Pack the next two product values



        ;// Row 4

        SMULTT  temp1,SrcDst30,rowLuma01                    ;// pSrcDst[13] * (pVRow[1]<<Shift)
        SMULBB  SrcDst30,SrcDst30,rowLuma23                 ;// pSrcDst[12] * (pVRow[2]<<Shift)

        SMULTT  temp3,SrcDst32,rowLuma01                    ;// pSrcDst[15] * (pVRow[1]<<Shift)
        SMULBB  SrcDst32,SrcDst32,rowLuma23                 ;// pSrcDst[14] * (pVRow[2]<<Shift)

        PKHBT   SrcDst22,SrcDst22,temp2,LSL #16             ;// Pack the remaining product values
        PKHBT   SrcDst30,SrcDst30,temp1,LSL #16
        PKHBT   SrcDst32,SrcDst32,temp3,LSL #16


        ;// Write all 16 scaled coefficients back over the input block
        STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


        ;// Set return value



        ;// Write function tail
        M_END

    ENDIF                                                    ;//ARM1136JS


;// Static Function: armVCM4P10_DequantLumaAC4x4 (ARM1136JS_U variant)
;//
;// Description:
;// In-place scaling of one 4x4 block of 16-bit coefficients.
;// Every coefficient pSrcDst[i] is replaced by the low 16 bits of
;//
;//     pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift)
;//
;// where
;//     pVRow = armVCM4P10_VMatrixU16 + armVCM4P10_QPModuloTable[QP]  (byte offset)
;//     Shift = armVCM4P10_QPDivTable[QP]
;//
;// Inputs:
;//     r0 (pSrcDst) : pointer to the 16 coefficients (8 words, accessed with LDMIA/STMIA)
;//     r1 (QP)      : quantization parameter, used as a byte index into both QP tables
;//
;// Outputs:
;//     The 8 words at pSrcDst are overwritten with the scaled coefficients.
;//     No return value is set by this code.
;//
;// Registers:
;//     r1-r12 and r14 are all written. M_START is passed r11 - presumably the
;//     macro preserves r4-r11 and lr (r14 is declared as temp3); confirm in armCOMM_s.h.
;//
;// This variant fetches pVRow[0] and pVRow[1] with one word load. The row offset
;// is (QP%6)*6 bytes, so that load is not always word aligned.
;// NOTE(review): presumably the "_U" build is the one that allows unaligned
;// word access - confirm against the M_VARIANTS definitions in armCOMM_s.h.

;// Guarding implementation by the processor name

    IF  ARM1136JS_U

;//Input Registers
pSrcDst       RN  0
QP            RN  1


;//Output Registers


;//Local Scratch Registers
;// Several names share a physical register; each alias is only used once the
;// previous owner of that register is dead (see the comments in the body).
pQPdiv          RN  4
pQPmod          RN  5
pVRow           RN  2
QPmod           RN  6
shift           RN  3
rowLuma01       RN  1                   ;// reuses r1 after QP has been consumed
rowLuma23       RN  4                   ;// reuses r4 after pQPdiv has been consumed

;// SrcDst<y><x> holds coefficients (y,x) in its bottom half and (y,x+1) in its top half
SrcDst00        RN  5
SrcDst02        RN  6
SrcDst10        RN  7
SrcDst12        RN  8
SrcDst20        RN  9
SrcDst22        RN  10
SrcDst30        RN  11
SrcDst32        RN  12

temp1           RN  2                   ;// reuses r2 after the last pVRow access
temp2           RN  3                   ;// reuses r3 after the last use of shift
temp3           RN  14


        ;// Allocate stack memory required by the function

        ;// Write function header
        M_START armVCM4P10_DequantLumaAC4x4,r11

        LDR    pQPmod,=armVCM4P10_QPModuloTable
        LDR    pQPdiv,=armVCM4P10_QPDivTable
        LDR    pVRow,=armVCM4P10_VMatrixU16

        LDRSB  QPmod,[pQPmod,QP]                    ;// (QP%6) * 6
        LDRSB  shift,[pQPdiv,QP]                    ;// Shift = QP / 6

        ;// Writeback leaves pVRow pointing at the selected row.
        LDR    rowLuma01,[pVRow,QPmod]!             ;// rowLuma01 = [0b|0a]

        ;// Only the bottom halfword of rowLuma23 is ever consumed (every use
        ;// below is the 'B' operand of SMULTB/SMULBB), so a halfword load is
        ;// sufficient. A word load here would also fetch the two bytes that
        ;// follow the 3-entry row, i.e. data that does not belong to this row.
        LDRH   rowLuma23,[pVRow,#4]                 ;// rowLuma23 = [00|0c]

        ;// Load all the 16 'src' values
        LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


        ;//*********************************************************************************************
        ;//
        ;// 'Shift' ranges between [0,8]
        ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
        ;// (the shifted low entry still fits in 16 bits, so it does not spill into the high entry)
        ;//
        ;//*********************************************************************************************

        LSL    rowLuma01,rowLuma01,shift
        LSL    rowLuma23,rowLuma23,shift


        ;//**********************************************************************************************
        ;//
        ;// The idea is to unroll the loop completely
        ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
        ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
        ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
        ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
        ;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift, which fits into 16 bits (above)
        ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
        ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
        ;//
        ;// We then pack the two 16 bit multiplication results into a word and store them in one go
        ;//
        ;//**********************************************************************************************


        ;// Row 1


        SMULTB  temp1,SrcDst00,rowLuma23                    ;// pSrcDst[1] * (pVRow[2]<<Shift)
        SMULBB  SrcDst00,SrcDst00,rowLuma01                 ;// pSrcDst[0] * (pVRow[0]<<Shift)

        SMULTB  temp2,SrcDst02,rowLuma23                    ;// pSrcDst[3] * (pVRow[2]<<Shift)
        SMULBB  SrcDst02,SrcDst02,rowLuma01                 ;// pSrcDst[2] * (pVRow[0]<<Shift)

        PKHBT   SrcDst00,SrcDst00,temp1,LSL #16             ;// Pack the first two product values


        ;// Row 2
        SMULTT  temp1,SrcDst10,rowLuma01                    ;// pSrcDst[5] * (pVRow[1]<<Shift)
        SMULBB  SrcDst10,SrcDst10,rowLuma23                 ;// pSrcDst[4] * (pVRow[2]<<Shift)

        PKHBT   SrcDst02,SrcDst02,temp2,LSL #16             ;// Pack the next two product values
        SMULTT  temp2,SrcDst12,rowLuma01                    ;// pSrcDst[7] * (pVRow[1]<<Shift)
        SMULBB  SrcDst12,SrcDst12,rowLuma23                 ;// pSrcDst[6] * (pVRow[2]<<Shift)

        PKHBT   SrcDst10,SrcDst10,temp1,LSL #16             ;// Pack the next two product values


        ;// Row 3

        SMULTB  temp1,SrcDst20,rowLuma23                    ;// pSrcDst[9] * (pVRow[2]<<Shift)
        SMULBB  SrcDst20,SrcDst20,rowLuma01                 ;// pSrcDst[8] * (pVRow[0]<<Shift)

        PKHBT   SrcDst12,SrcDst12,temp2,LSL #16             ;// Pack the next two product values
        SMULTB  temp2,SrcDst22,rowLuma23                    ;// pSrcDst[11] * (pVRow[2]<<Shift)
        SMULBB  SrcDst22,SrcDst22,rowLuma01                 ;// pSrcDst[10] * (pVRow[0]<<Shift)

        PKHBT   SrcDst20,SrcDst20,temp1,LSL #16             ;// Pack the next two product values



        ;// Row 4

        SMULTT  temp1,SrcDst30,rowLuma01                    ;// pSrcDst[13] * (pVRow[1]<<Shift)
        SMULBB  SrcDst30,SrcDst30,rowLuma23                 ;// pSrcDst[12] * (pVRow[2]<<Shift)

        SMULTT  temp3,SrcDst32,rowLuma01                    ;// pSrcDst[15] * (pVRow[1]<<Shift)
        SMULBB  SrcDst32,SrcDst32,rowLuma23                 ;// pSrcDst[14] * (pVRow[2]<<Shift)

        PKHBT   SrcDst22,SrcDst22,temp2,LSL #16             ;// Pack the remaining product values
        PKHBT   SrcDst30,SrcDst30,temp1,LSL #16
        PKHBT   SrcDst32,SrcDst32,temp3,LSL #16


        ;// Write all 16 scaled coefficients back over the input block
        STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


        ;// Set return value



        ;// Write function tail
        M_END

    ENDIF                                                    ;//ARM1136JS_U





3240c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
3250c1bc742181ded4930842b46e9507372f0b1b963James Dong
3260c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Guarding implementation by the processor name
3270c1bc742181ded4930842b46e9507372f0b1b963James Dong
3280c1bc742181ded4930842b46e9507372f0b1b963James Dong    IF  ARM1136JS
3290c1bc742181ded4930842b46e9507372f0b1b963James Dong
3300c1bc742181ded4930842b46e9507372f0b1b963James Dong;//Input Registers
3310c1bc742181ded4930842b46e9507372f0b1b963James DongppSrc       RN  0
3320c1bc742181ded4930842b46e9507372f0b1b963James DongpPred       RN  1
3330c1bc742181ded4930842b46e9507372f0b1b963James DongpDC         RN  2
3340c1bc742181ded4930842b46e9507372f0b1b963James DongpDst        RN  3
3350c1bc742181ded4930842b46e9507372f0b1b963James Dong
3360c1bc742181ded4930842b46e9507372f0b1b963James Dong
3370c1bc742181ded4930842b46e9507372f0b1b963James Dong;//Output Registers
3380c1bc742181ded4930842b46e9507372f0b1b963James Dongresult      RN  0
3390c1bc742181ded4930842b46e9507372f0b1b963James Dong
3400c1bc742181ded4930842b46e9507372f0b1b963James Dong;//Local Scratch Registers
3410c1bc742181ded4930842b46e9507372f0b1b963James DongpDelta      RN  4
3420c1bc742181ded4930842b46e9507372f0b1b963James DongpDeltaTmp   RN  6
3430c1bc742181ded4930842b46e9507372f0b1b963James DongAC          RN  5                   ;//Load from stack
3440c1bc742181ded4930842b46e9507372f0b1b963James DongpPredTemp   RN  7
3450c1bc742181ded4930842b46e9507372f0b1b963James DongpDCTemp     RN  8
3460c1bc742181ded4930842b46e9507372f0b1b963James DongpDstTemp    RN  9
3470c1bc742181ded4930842b46e9507372f0b1b963James DongpDeltaArg1  RN  1
3480c1bc742181ded4930842b46e9507372f0b1b963James DongpDeltaArg0  RN  0
3490c1bc742181ded4930842b46e9507372f0b1b963James DongQP          RN  1                   ;//Load from stack
3500c1bc742181ded4930842b46e9507372f0b1b963James DongDCval       RN  10
3510c1bc742181ded4930842b46e9507372f0b1b963James DongDCvalCopy   RN  11
3520c1bc742181ded4930842b46e9507372f0b1b963James Dongpredstep    RN  1
3530c1bc742181ded4930842b46e9507372f0b1b963James DongdstStep     RN  10
3540c1bc742181ded4930842b46e9507372f0b1b963James Dongycounter    RN  0
3550c1bc742181ded4930842b46e9507372f0b1b963James DongPredVal1    RN  3
3560c1bc742181ded4930842b46e9507372f0b1b963James DongPredVal2    RN  5
3570c1bc742181ded4930842b46e9507372f0b1b963James DongDeltaVal1   RN  2
3580c1bc742181ded4930842b46e9507372f0b1b963James DongDeltaVal2   RN  11
3590c1bc742181ded4930842b46e9507372f0b1b963James DongPredVal     RN  8
3600c1bc742181ded4930842b46e9507372f0b1b963James DongtmpDeltaVal RN  6
3610c1bc742181ded4930842b46e9507372f0b1b963James Dongsum1        RN  12
3620c1bc742181ded4930842b46e9507372f0b1b963James Dongsum2        RN  14
3630c1bc742181ded4930842b46e9507372f0b1b963James Dong;// omxVCM4P10_DequantTransformResidualFromPairAndAdd: fills a 4x4 block of 16-bit residuals in pBuffer
3640c1bc742181ded4930842b46e9507372f0b1b963James Dong;// (unpack + dequantise + transform via the helper routines when AC != 0, otherwise (*pDC + 32) >> 6
3650c1bc742181ded4930842b46e9507372f0b1b963James Dong;// replicated 16 times), adds it to four 4-byte prediction rows, clips to 0..255, stores to pDst, returns OMX_Sts_NoErr.
3660c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Allocate stack memory required by the function
3670c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_ALLOC8 pBuffer, 32                                ;// 32 bytes = 16 halfword residuals (one 4x4 block)
3680c1bc742181ded4930842b46e9507372f0b1b963James Dong;// NOTE(review): M_ALLOC8 presumably gives 8-byte alignment (the STRDs below store to pBuffer) - confirm in armCOMM_s.h
3690c1bc742181ded4930842b46e9507372f0b1b963James Dong
3700c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Write function header
3710c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11
3720c1bc742181ded4930842b46e9507372f0b1b963James Dong;// NOTE(review): the r11 operand presumably tells M_START which registers to preserve - confirm in armCOMM_s.h
3730c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Define stack arguments (four entries, 4 bytes each)
3740c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_ARG   predStepOnStack, 4
3750c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_ARG   dstStepOnStack,4
3760c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_ARG   QPOnStack, 4
3770c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_ARG   ACOnStack,4
3780c1bc742181ded4930842b46e9507372f0b1b963James Dong
3790c1bc742181ded4930842b46e9507372f0b1b963James Dong
3800c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_ADR   pDelta,pBuffer                              ;// pDelta = address of the stack buffer
3810c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR   AC,ACOnStack                                ;// AC flag; zero selects the DC-only path
3820c1bc742181ded4930842b46e9507372f0b1b963James Dong
3830c1bc742181ded4930842b46e9507372f0b1b963James Dong
3840c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Save registers r1,r2,r3 before function call
3850c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     pPredTemp,pPred
3860c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     pDCTemp,pDC
3870c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     pDstTemp,pDst
3880c1bc742181ded4930842b46e9507372f0b1b963James Dong
3890c1bc742181ded4930842b46e9507372f0b1b963James Dong        CMP     AC,#0
3900c1bc742181ded4930842b46e9507372f0b1b963James Dong        BEQ     DCcase                                      ;// AC == 0: take the DC-only path
3910c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_UnpackBlock4x4
3920c1bc742181ded4930842b46e9507372f0b1b963James Dong;// NOTE(review): r0 is not set up here; presumably it still holds the caller's first argument - confirm
3930c1bc742181ded4930842b46e9507372f0b1b963James Dong        BL      armVCM4P10_UnpackBlock4x4
3940c1bc742181ded4930842b46e9507372f0b1b963James Dong
3950c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR   QP,QPOnStack                                ;// Set up r1 for DequantLumaAC4x4
3960c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     pDeltaArg0,pDelta                           ;// Set up r0 for DequantLumaAC4x4
3970c1bc742181ded4930842b46e9507372f0b1b963James Dong
3980c1bc742181ded4930842b46e9507372f0b1b963James Dong        BL      armVCM4P10_DequantLumaAC4x4
3990c1bc742181ded4930842b46e9507372f0b1b963James Dong
4000c1bc742181ded4930842b46e9507372f0b1b963James Dong;// If pDC is non-NULL, the halfword it points to replaces coefficient 0 before the transform
4010c1bc742181ded4930842b46e9507372f0b1b963James Dong        CMP     pDCTemp,#0
4020c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDRSHNE DCval,[pDCTemp]                             ;// Only when pDC != NULL
4030c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     pDeltaArg0,pDelta                           ;// Set up r0 for armVCM4P10_TransformResidual4x4
4040c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_TransformResidual4x4
4050c1bc742181ded4930842b46e9507372f0b1b963James Dong        STRHNE  DCval,[pDelta]                              ;// pDelta[0] = *pDC (only when pDC != NULL)
4060c1bc742181ded4930842b46e9507372f0b1b963James Dong;// The MOVs above have no S suffix, so the flags from the CMP are still valid for STRHNE
4070c1bc742181ded4930842b46e9507372f0b1b963James Dong        BL      armVCM4P10_TransformResidual4x4             ;// Both arguments point at the same buffer (pDelta)
4080c1bc742181ded4930842b46e9507372f0b1b963James Dong        B       OutDCcase                                   ;// Skip the DC-only fill
4090c1bc742181ded4930842b46e9507372f0b1b963James Dong
4100c1bc742181ded4930842b46e9507372f0b1b963James Dong;// DC-only path. NOTE(review): pDC is dereferenced here without the NULL test used on the AC path - confirm callers guarantee it
4110c1bc742181ded4930842b46e9507372f0b1b963James DongDCcase
4120c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDRSH   DCval,[pDCTemp]                            ;// Sign-extended 16-bit DC value
4130c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD     DCval,DCval,#32                            ;// Rounding offset for the shift by 6
4140c1bc742181ded4930842b46e9507372f0b1b963James Dong        ASR     DCval,DCval,#6                             ;// DCval = (DC + 32) >> 6, arithmetic shift
4150c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   DCval,DCval,DCval,LSL #16                  ;// Duplicating the Lower halfword
4160c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     DCvalCopy, DCval                           ;// Needed for STRD
4170c1bc742181ded4930842b46e9507372f0b1b963James Dong        STRD    DCval, [pDelta, #0]                        ;// pDelta[0]  = pDelta[1]  = pDelta[2]  = pDelta[3] = DCval
4180c1bc742181ded4930842b46e9507372f0b1b963James Dong        STRD    DCval, [pDelta, #8]                        ;// pDelta[4]  = pDelta[5]  = pDelta[6]  = pDelta[7] = DCval
4190c1bc742181ded4930842b46e9507372f0b1b963James Dong        STRD    DCval, [pDelta, #16]                       ;// pDelta[8]  = pDelta[9]  = pDelta[10] = pDelta[11] = DCval
4200c1bc742181ded4930842b46e9507372f0b1b963James Dong        STRD    DCval, [pDelta, #24]                       ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval
4210c1bc742181ded4930842b46e9507372f0b1b963James Dong
4220c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Both paths join here: add the residual block to the prediction, one 4-byte row per iteration
4230c1bc742181ded4930842b46e9507372f0b1b963James DongOutDCcase
4240c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR   predstep,predStepOnStack                    ;// Row increment for pPredTemp
4250c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR   dstStep,dstStepOnStack                      ;// Row increment for pDstTemp (overwrites DCval in r10)
4260c1bc742181ded4930842b46e9507372f0b1b963James Dong
4270c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDMIA   pDelta!,{tmpDeltaVal,DeltaVal2}             ;// Pre load row 0 residuals (two words), pDelta += 8
4280c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     ycounter,#4                                 ;// Counter for the PredPlusDeltaLoop
4290c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     PredVal,[pPredTemp]                         ;// Pre load the 4 prediction bytes of row 0
4300c1bc742181ded4930842b46e9507372f0b1b963James Dong
4310c1bc742181ded4930842b46e9507372f0b1b963James DongPredPlusDeltaLoop
4320c1bc742181ded4930842b46e9507372f0b1b963James Dong;// The SUBS below sets the flags used by LDRGT, LDMGTIA and BGT; no instruction in between updates N/Z/C/V
4330c1bc742181ded4930842b46e9507372f0b1b963James Dong
4340c1bc742181ded4930842b46e9507372f0b1b963James Dong        SUBS    ycounter,ycounter,#1                        ;// GT afterwards means more rows remain
4350c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD     pPredTemp,pPredTemp,predstep                ;// Increment pPred ptr
4360c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Regroup the residuals into even/odd pairs so they line up with the byte lanes extracted by UXTB16
4370c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   DeltaVal1,tmpDeltaVal,DeltaVal2,LSL #16     ;// DeltaVal1 = [C A]
4380c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   DeltaVal2,DeltaVal2,tmpDeltaVal,ASR #16     ;// DeltaVal2 = [D B]
4390c1bc742181ded4930842b46e9507372f0b1b963James Dong
4400c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTB16  PredVal1,PredVal                            ;// PredVal1 = [0c0a]
4410c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTB16  PredVal2,PredVal,ROR #8                     ;// PredVal2 = [0d0b]
4420c1bc742181ded4930842b46e9507372f0b1b963James Dong
4430c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDRGT   PredVal,[pPredTemp]                         ;// Pre load the next row's prediction; skipped on the last row
4440c1bc742181ded4930842b46e9507372f0b1b963James Dong
4450c1bc742181ded4930842b46e9507372f0b1b963James Dong        QADD16  sum2,DeltaVal2,PredVal2                     ;// Add and saturate to 16 bits
4460c1bc742181ded4930842b46e9507372f0b1b963James Dong        QADD16  sum1,DeltaVal1,PredVal1
4470c1bc742181ded4930842b46e9507372f0b1b963James Dong
4480c1bc742181ded4930842b46e9507372f0b1b963James Dong        USAT16  sum2,#8,sum2                                ;// armClip(0,255,sum2)
4490c1bc742181ded4930842b46e9507372f0b1b963James Dong        USAT16  sum1,#8,sum1                                ;// armClip(0,255,sum1)
4500c1bc742181ded4930842b46e9507372f0b1b963James Dong
4510c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDMGTIA   pDelta!,{tmpDeltaVal,DeltaVal2}           ;// Pre load the next row's residuals; skipped on the last row
4520c1bc742181ded4930842b46e9507372f0b1b963James Dong
4530c1bc742181ded4930842b46e9507372f0b1b963James Dong        ORR     sum1,sum1,sum2,LSL #8                       ;// sum1 = [dcba]
4540c1bc742181ded4930842b46e9507372f0b1b963James Dong        STR     sum1,[pDstTemp]                             ;// Store the four result bytes of this row
4550c1bc742181ded4930842b46e9507372f0b1b963James Dong
4560c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD     pDstTemp,pDstTemp,dstStep                   ;// Increment pDst ptr
4570c1bc742181ded4930842b46e9507372f0b1b963James Dong        BGT     PredPlusDeltaLoop                           ;// Repeat while ycounter > 0 (flags from SUBS)
4580c1bc742181ded4930842b46e9507372f0b1b963James Dong
4590c1bc742181ded4930842b46e9507372f0b1b963James Dong
4600c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Set return value
4610c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     result,#OMX_Sts_NoErr
4620c1bc742181ded4930842b46e9507372f0b1b963James Dong
4630c1bc742181ded4930842b46e9507372f0b1b963James DongEnd
4640c1bc742181ded4930842b46e9507372f0b1b963James Dong
4650c1bc742181ded4930842b46e9507372f0b1b963James Dong
4660c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Write function tail
4670c1bc742181ded4930842b46e9507372f0b1b963James Dong
4680c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_END
4690c1bc742181ded4930842b46e9507372f0b1b963James Dong
4700c1bc742181ded4930842b46e9507372f0b1b963James Dong    ENDIF                                                    ;//ARM1136JS
4710c1bc742181ded4930842b46e9507372f0b1b963James Dong
4720c1bc742181ded4930842b46e9507372f0b1b963James Dong
4730c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
4740c1bc742181ded4930842b46e9507372f0b1b963James Dong
4750c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Guarding implementation by the processor name
4760c1bc742181ded4930842b46e9507372f0b1b963James Dong
4770c1bc742181ded4930842b46e9507372f0b1b963James Dong
4780c1bc742181ded4930842b46e9507372f0b1b963James Dong
4790c1bc742181ded4930842b46e9507372f0b1b963James Dong
4800c1bc742181ded4930842b46e9507372f0b1b963James Dong    END
481