;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_TransformResidual4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//
;// Description:
;// Transform Residual 4x4 Coefficients
;//
;//


;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Build variants implemented by this file.
;// NOTE(review): M_VARIANTS comes from armCOMM_s.h (not visible here);
;// presumably it defines the CortexA8 symbol that the IF guard further
;// down in this file tests - confirm against the macro definition.
        M_VARIANTS CortexA8

;// Import symbols required from other files
;// (For example tables)
;// This file imports none.


;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;// Guarding implementation by the processor name

    IF  CortexA8

;//--------------------------------------------------------------------
;// armVCM4P10_TransformResidual4x4
;//
;// Applies a two-pass butterfly to a 4x4 block of signed 16-bit values
;// and stores each result as (value + 32) >> 6.
;// NOTE(review): from the M4P10 name this is presumably the H.264
;// residual inverse transform - confirm against the caller/spec.
;//
;// In:    r0 = pDst, destination for 16 x 16-bit results (32 bytes)
;//        r1 = pSrc, source of 16 x 16-bit coefficients (32 bytes)
;// Out:   32 bytes written at pDst; no return value is set here
;// Uses:  D0-D8 (Q0/Q1 overlay D0-D3); no ARM registers are modified
;//        by the instructions in this file
;//--------------------------------------------------------------------

;// ARM Registers

;//Input Registers
pDst                RN  0
pSrc                RN  1


;// Neon Registers
;//
;// Several aliases deliberately share one physical register; each
;// name marks the role the register plays at that point in the code:
;//   D0-D3 : dIn0-3 (input)  -> df0-3 (row pass out) -> dh0-3 (output)
;//   D5,D6 : de0,de1 (row pass)    -> dg0,dg1 (column pass)
;//   D7    : dIn1RS -> de2 (row)   ;  df1RS -> dg2 (column)
;//   D8    : dIn3RS -> de3 (row)   ;  df3RS -> dg3 (column)
;// Instruction order below relies on this sharing - do not reorder
;// without re-checking which value each register holds.

;// Packed Input pixels
dIn0                DN  D0.S16
dIn1                DN  D1.S16
dIn2                DN  D2.S16
dIn3                DN  D3.S16

;// Intermediate calculations
dZero               DN  D4.S16              ;// all-zero operand for VHADD
de0                 DN  D5.S16
de1                 DN  D6.S16
de2                 DN  D7.S16
de3                 DN  D8.S16
dIn1RS              DN  D7.S16              ;// dIn1 >> 1
dIn3RS              DN  D8.S16              ;// dIn3 >> 1
df0                 DN  D0.S16
df1                 DN  D1.S16
df2                 DN  D2.S16
df3                 DN  D3.S16
qf01                QN  Q0.32               ;// Q0 overlays D0:D1 (df0,df1)
qf23                QN  Q1.32               ;// Q1 overlays D2:D3 (df2,df3)
dg0                 DN  D5.S16
dg1                 DN  D6.S16
dg2                 DN  D7.S16
dg3                 DN  D8.S16
df1RS               DN  D7.S16              ;// df1 >> 1
df3RS               DN  D8.S16              ;// df3 >> 1

;// Output pixels
dh0                 DN  D0.S16
dh1                 DN  D1.S16
dh2                 DN  D2.S16
dh3                 DN  D3.S16


    ;// Allocate stack memory required by the function
    ;// (none is allocated here)


    ;// Write function header
    ;// NOTE(review): M_START is defined in armCOMM_s.h (not visible
    ;// here). The third argument presumably asks it to preserve NEON
    ;// registers up to D8, which this code overwrites and which is
    ;// callee-saved under the AAPCS - confirm against the macro.
        M_START armVCM4P10_TransformResidual4x4, ,d8

        ;******************************************************************
        ;// The strategy used in implementing the transform is as follows:*
        ;// Load the 4x4 block de-interleaved (transposed) into D0-D3      *
        ;// Perform the row operations (on columns) using SIMD            *
        ;// Transpose the 4x4 result matrix                               *
        ;// Perform the column operations                                 *
        ;// Round, shift and store the 4x4 block at one go                *
        ;******************************************************************

        ;// Load all the 4x4 pixels in transposed form:
        ;// VLD4.16 de-interleaves, so dInN receives source elements
        ;// N, N+4, N+8 and N+12.

        VLD4    {dIn0,dIn1,dIn2,dIn3},[pSrc]

        VMOV    dZero,#0                                    ;// Used to right shift by 1


        ;****************************************
        ;// Row Operations (Performed on columns)
        ;****************************************
        ;// d0..d3 below are dIn0..dIn3. VHADD x,0 computes (x + 0) >> 1,
        ;// i.e. an arithmetic shift right by one.

        VADD        de0,dIn0,dIn2                       ;//  e0 = d0 + d2
        VSUB        de1,dIn0,dIn2                       ;//  e1 = d0 - d2
        VHADD       dIn1RS,dIn1,dZero                   ;//  (d1>>1), dZero is a register holding 0
        VHADD       dIn3RS,dIn3,dZero                   ;//  (d3>>1)
        VSUB        de2,dIn1RS,dIn3                     ;//  e2 = (d1>>1) - d3   (D7 reused for e2)
        VADD        de3,dIn1,dIn3RS                     ;//  e3 = d1 + (d3>>1)   (D8 reused for e3)
        VADD        df0,de0,de3                         ;//  f0 = e0 + e3        (inputs dead: D0-D3 reused)
        VADD        df1,de1,de2                         ;//  f1 = e1 + e2
        VSUB        df2,de1,de2                         ;//  f2 = e1 - e2
        VSUB        df3,de0,de3                         ;//  f3 = e0 - e3



        ;*****************************************************************
        ;// Transpose the resultant matrix
        ;// Two 16-bit VTRNs on the D pairs followed by one 32-bit VTRN
        ;// on the Q pair give a full 4x4 transpose of 16-bit elements.
        ;*****************************************************************

        VTRN    df0,df1
        VTRN    df2,df3
        VTRN    qf01,qf23


        ;*******************************
        ;// Column Operations
        ;*******************************
        ;// Same butterfly as the row pass; d0..d3 below are df0..df3.

        VADD        dg0,df0,df2                         ;//  e0 = d0 + d2
        VSUB        dg1,df0,df2                         ;//  e1 = d0 - d2
        VHADD       df1RS,df1,dZero                     ;//  (d1>>1), dZero is a register holding 0
        VHADD       df3RS,df3,dZero                     ;//  (d3>>1)
        VSUB        dg2,df1RS,df3                       ;//  e2 = (d1>>1) - d3   (D7 reused for e2)
        VADD        dg3,df1,df3RS                       ;//  e3 = d1 + (d3>>1)   (D8 reused for e3)
        VADD        dh0,dg0,dg3                         ;//  f0 = e0 + e3
        VADD        dh1,dg1,dg2                         ;//  f1 = e1 + e2
        VSUB        dh2,dg1,dg2                         ;//  f2 = e1 - e2
        VSUB        dh3,dg0,dg3                         ;//  f3 = e0 - e3


        ;************************************************
        ;// Calculate final value (colOp[i][j] + 32)>>6
        ;// VRSHR adds the rounding constant before the shift.
        ;************************************************

        VRSHR       dh0,#6
        VRSHR       dh1,#6
        VRSHR       dh2,#6
        VRSHR       dh3,#6


        ;***************************
        ;// Store all the 4x4 pixels
        ;// D0-D3 are written contiguously (32 bytes) starting at pDst.
        ;***************************

        VST1   {dh0,dh1,dh2,dh3},[pDst]


        ;// Set return value
        ;// (no return value is written by this function)

        ;// Label kept from the original; nothing in this file branches to it.
End


        ;// Write function tail
        M_END

    ENDIF                                                           ;//CortexA8

    END
201