10c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
278e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// Copyright (C) 2007-2008 ARM Limited
378e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//
478e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// Licensed under the Apache License, Version 2.0 (the "License");
578e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// you may not use this file except in compliance with the License.
678e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// You may obtain a copy of the License at
778e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//
878e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//      http://www.apache.org/licenses/LICENSE-2.0
978e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//
1078e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// Unless required by applicable law or agreed to in writing, software
1178e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// distributed under the License is distributed on an "AS IS" BASIS,
1278e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1378e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// See the License for the specific language governing permissions and
1478e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// limitations under the License.
1578e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//
1678e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//
170c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
180c1bc742181ded4930842b46e9507372f0b1b963James Dong;// File Name:  armVCM4P10_TransformResidual4x4_s.s
190c1bc742181ded4930842b46e9507372f0b1b963James Dong;// OpenMAX DL: v1.0.2
200c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Revision:   9641
210c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Date:       Thursday, February 7, 2008
220c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
230c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
240c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
250c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
260c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Description:
270c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Transform Residual 4x4 Coefficients
280c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
290c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
300c1bc742181ded4930842b46e9507372f0b1b963James Dong
310c1bc742181ded4930842b46e9507372f0b1b963James Dong
320c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Include standard headers
330c1bc742181ded4930842b46e9507372f0b1b963James Dong
340c1bc742181ded4930842b46e9507372f0b1b963James Dong        INCLUDE omxtypes_s.h
350c1bc742181ded4930842b46e9507372f0b1b963James Dong        INCLUDE armCOMM_s.h
360c1bc742181ded4930842b46e9507372f0b1b963James Dong
370c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_VARIANTS ARM1136JS
380c1bc742181ded4930842b46e9507372f0b1b963James Dong
390c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Import symbols required from other files
400c1bc742181ded4930842b46e9507372f0b1b963James Dong;// (For example tables)
410c1bc742181ded4930842b46e9507372f0b1b963James Dong
420c1bc742181ded4930842b46e9507372f0b1b963James Dong
430c1bc742181ded4930842b46e9507372f0b1b963James Dong
440c1bc742181ded4930842b46e9507372f0b1b963James Dong
450c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Set debugging level
460c1bc742181ded4930842b46e9507372f0b1b963James Dong;//DEBUG_ON    SETL {TRUE}
470c1bc742181ded4930842b46e9507372f0b1b963James Dong
480c1bc742181ded4930842b46e9507372f0b1b963James Dong
490c1bc742181ded4930842b46e9507372f0b1b963James Dong
500c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Guarding implementation by the processor name
510c1bc742181ded4930842b46e9507372f0b1b963James Dong
520c1bc742181ded4930842b46e9507372f0b1b963James Dong    IF  ARM1136JS
530c1bc742181ded4930842b46e9507372f0b1b963James Dong
540c1bc742181ded4930842b46e9507372f0b1b963James Dong;//Input Registers
550c1bc742181ded4930842b46e9507372f0b1b963James DongpDst                RN  0
560c1bc742181ded4930842b46e9507372f0b1b963James DongpSrc                RN  1
570c1bc742181ded4930842b46e9507372f0b1b963James Dong
580c1bc742181ded4930842b46e9507372f0b1b963James Dong;//Output Registers
590c1bc742181ded4930842b46e9507372f0b1b963James Dong
600c1bc742181ded4930842b46e9507372f0b1b963James Dong
610c1bc742181ded4930842b46e9507372f0b1b963James Dong;//Local Scratch Registers
;// NOTE: the names below are aliases, not distinct registers. Each stage of
;// the routine (load, transpose, row op, transpose, column op, output) has
;// its own set of names, and several names map onto the same physical
;// register (e.g. in00, trRow00, rowOp00, trCol00, colOp00 and out00 are
;// all r2). A name is therefore only meaningful during its own stage.
620c1bc742181ded4930842b46e9507372f0b1b963James Dong
630c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Packed input values: each register holds two Src elements
640c1bc742181ded4930842b46e9507372f0b1b963James Dongin00                RN  2                   ;// Src[0] & Src[1]
650c1bc742181ded4930842b46e9507372f0b1b963James Dongin02                RN  3                   ;// Src[2] & Src[3]
660c1bc742181ded4930842b46e9507372f0b1b963James Dongin10                RN  4                   ;// Src[4] & Src[5]
670c1bc742181ded4930842b46e9507372f0b1b963James Dongin12                RN  5                   ;// Src[6] & Src[7]
680c1bc742181ded4930842b46e9507372f0b1b963James Dongin20                RN  6                   ;// Src[8] & Src[9]
690c1bc742181ded4930842b46e9507372f0b1b963James Dongin22                RN  7                   ;// Src[10] & Src[11]
700c1bc742181ded4930842b46e9507372f0b1b963James Dongin30                RN  8                   ;// Src[12] & Src[13]
710c1bc742181ded4930842b46e9507372f0b1b963James Dongin32                RN  9                   ;// Src[14] & Src[15]
720c1bc742181ded4930842b46e9507372f0b1b963James Dong
730c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Transpose for row operations (rows to columns); outputs of the first transpose
740c1bc742181ded4930842b46e9507372f0b1b963James DongtrRow00             RN  2
750c1bc742181ded4930842b46e9507372f0b1b963James DongtrRow10             RN  10
760c1bc742181ded4930842b46e9507372f0b1b963James DongtrRow02             RN  3
770c1bc742181ded4930842b46e9507372f0b1b963James DongtrRow12             RN  5
780c1bc742181ded4930842b46e9507372f0b1b963James DongtrRow20             RN  11
790c1bc742181ded4930842b46e9507372f0b1b963James DongtrRow30             RN  12
800c1bc742181ded4930842b46e9507372f0b1b963James DongtrRow32             RN  14
810c1bc742181ded4930842b46e9507372f0b1b963James DongtrRow22             RN  7
820c1bc742181ded4930842b46e9507372f0b1b963James Dong
830c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Intermediate calculations (row pass)
840c1bc742181ded4930842b46e9507372f0b1b963James Donge0                  RN  4
850c1bc742181ded4930842b46e9507372f0b1b963James Donge1                  RN  6
860c1bc742181ded4930842b46e9507372f0b1b963James Donge2                  RN  8
870c1bc742181ded4930842b46e9507372f0b1b963James Donge3                  RN  9
880c1bc742181ded4930842b46e9507372f0b1b963James DongconstZero           RN  1                   ;// Holds 0; shares r1 with pSrc, so only valid once the source has been loaded
890c1bc742181ded4930842b46e9507372f0b1b963James Dong
900c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Row operated pixels (results of the row pass)
910c1bc742181ded4930842b46e9507372f0b1b963James DongrowOp00             RN  2
920c1bc742181ded4930842b46e9507372f0b1b963James DongrowOp10             RN  10
930c1bc742181ded4930842b46e9507372f0b1b963James DongrowOp20             RN  11
940c1bc742181ded4930842b46e9507372f0b1b963James DongrowOp30             RN  12
950c1bc742181ded4930842b46e9507372f0b1b963James DongrowOp02             RN  3
960c1bc742181ded4930842b46e9507372f0b1b963James DongrowOp12             RN  5
970c1bc742181ded4930842b46e9507372f0b1b963James DongrowOp22             RN  7
980c1bc742181ded4930842b46e9507372f0b1b963James DongrowOp32             RN  14
990c1bc742181ded4930842b46e9507372f0b1b963James Dong
1000c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Transpose for column operations; outputs of the second transpose
1010c1bc742181ded4930842b46e9507372f0b1b963James DongtrCol00             RN  2
1020c1bc742181ded4930842b46e9507372f0b1b963James DongtrCol02             RN  3
1030c1bc742181ded4930842b46e9507372f0b1b963James DongtrCol10             RN  4
1040c1bc742181ded4930842b46e9507372f0b1b963James DongtrCol12             RN  5
1050c1bc742181ded4930842b46e9507372f0b1b963James DongtrCol20             RN  6
1060c1bc742181ded4930842b46e9507372f0b1b963James DongtrCol22             RN  7
1070c1bc742181ded4930842b46e9507372f0b1b963James DongtrCol30             RN  8
1080c1bc742181ded4930842b46e9507372f0b1b963James DongtrCol32             RN  9
1090c1bc742181ded4930842b46e9507372f0b1b963James Dong
1100c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Intermediate calculations (column pass)
1110c1bc742181ded4930842b46e9507372f0b1b963James Dongg0                  RN  10
1120c1bc742181ded4930842b46e9507372f0b1b963James Dongg1                  RN  11
1130c1bc742181ded4930842b46e9507372f0b1b963James Dongg2                  RN  12
1140c1bc742181ded4930842b46e9507372f0b1b963James Dongg3                  RN  14
1150c1bc742181ded4930842b46e9507372f0b1b963James Dong
1160c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Column operated pixels (results of the column pass)
1170c1bc742181ded4930842b46e9507372f0b1b963James DongcolOp00             RN  2
1180c1bc742181ded4930842b46e9507372f0b1b963James DongcolOp02             RN  3
1190c1bc742181ded4930842b46e9507372f0b1b963James DongcolOp10             RN  4
1200c1bc742181ded4930842b46e9507372f0b1b963James DongcolOp12             RN  5
1210c1bc742181ded4930842b46e9507372f0b1b963James DongcolOp20             RN  6
1220c1bc742181ded4930842b46e9507372f0b1b963James DongcolOp22             RN  7
1230c1bc742181ded4930842b46e9507372f0b1b963James DongcolOp30             RN  8
1240c1bc742181ded4930842b46e9507372f0b1b963James DongcolOp32             RN  9
1250c1bc742181ded4930842b46e9507372f0b1b963James Dong
1260c1bc742181ded4930842b46e9507372f0b1b963James Dong
1270c1bc742181ded4930842b46e9507372f0b1b963James Dongtemp1               RN  10                  ;// Temporary scratch variables (temp1 is not referenced by the visible function body)
1280c1bc742181ded4930842b46e9507372f0b1b963James Dongconst1              RN  11                  ;// Loaded with 0x00208020 in the final rounding stage
1290c1bc742181ded4930842b46e9507372f0b1b963James Dongconst2              RN  12                  ;// Loaded with 0x200 in the final rounding stage
1300c1bc742181ded4930842b46e9507372f0b1b963James Dongmask                RN  14                  ;// Loaded with 0xffff03ff in the final rounding stage
1310c1bc742181ded4930842b46e9507372f0b1b963James Dong
1320c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Output pixels (r2-r9, the same registers the input was loaded into)
1330c1bc742181ded4930842b46e9507372f0b1b963James Dongout00               RN  2
1340c1bc742181ded4930842b46e9507372f0b1b963James Dongout02               RN  3
1350c1bc742181ded4930842b46e9507372f0b1b963James Dongout10               RN  4
1360c1bc742181ded4930842b46e9507372f0b1b963James Dongout12               RN  5
1370c1bc742181ded4930842b46e9507372f0b1b963James Dongout20               RN  6
1380c1bc742181ded4930842b46e9507372f0b1b963James Dongout22               RN  7
1390c1bc742181ded4930842b46e9507372f0b1b963James Dongout30               RN  8
1400c1bc742181ded4930842b46e9507372f0b1b963James Dongout32               RN  9
1410c1bc742181ded4930842b46e9507372f0b1b963James Dong
1420c1bc742181ded4930842b46e9507372f0b1b963James Dong
1430c1bc742181ded4930842b46e9507372f0b1b963James Dong
1440c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Allocate stack memory required by the function
1450c1bc742181ded4930842b46e9507372f0b1b963James Dong
1460c1bc742181ded4930842b46e9507372f0b1b963James Dong
1470c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Write function header
1480c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_START armVCM4P10_TransformResidual4x4,r11
        ;// Transforms one 4x4 block of packed 16-bit values.
        ;//   pDst (r0): destination; 8 words are written by the final STMIA
        ;//   pSrc (r1): source; 8 words are read by the initial LDMIA
        ;// Each of the two passes computes, for every group d0..d3:
        ;//   e0 = d0 + d2            e1 = d0 - d2
        ;//   e2 = (d1>>1) - d3       e3 = d1 + (d3>>1)
        ;//   f0 = e0 + e3   f1 = e1 + e2   f2 = e1 - e2   f3 = e0 - e3
        ;// and every result of the second pass is rounded as (x + 32) >> 6.
        ;// All arithmetic is done two halfwords at a time with the 16-bit
        ;// SIMD instructions (SADD16/SSUB16/SHADD16).
        ;// r1-r12 and r14 are all overwritten. NOTE(review): this relies on
        ;// the M_START/M_END macros (armCOMM_s.h, not shown here) to preserve
        ;// the callee-saved registers and the return address in r14 - confirm.
        ;// NOTE(review): LDMIA/STMIA presumably require pSrc and pDst to be
        ;// word aligned - confirm against callers.
1490c1bc742181ded4930842b46e9507372f0b1b963James Dong
1500c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;******************************************************************
1510c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// The strategy used in implementing the transform is as follows:*
1520c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Load the 4x4 block into 8 registers                           *
1530c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Transpose the 4x4 matrix                                      *
1540c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Perform the row operations (on columns) using SIMD            *
1550c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Transpose the 4x4 result matrix                               *
1560c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Perform the column operations                                 *
1570c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Store the 4x4 block at one go                                 *
1580c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;******************************************************************
1590c1bc742181ded4930842b46e9507372f0b1b963James Dong
1600c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Load all the 4x4 pixels (16 halfwords = 8 words into r2-r9)
1610c1bc742181ded4930842b46e9507372f0b1b963James Dong
1620c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDMIA   pSrc,{in00,in02,in10,in12,in20,in22,in30,in32}
1630c1bc742181ded4930842b46e9507372f0b1b963James Dong
1640c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV       constZero,#0                                     ;// Zero operand for SHADD16, giving x>>1 per halfword; overwrites r1 (pSrc), which is not read again
1650c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;LDR       constZero,=0x00000000
1660c1bc742181ded4930842b46e9507372f0b1b963James Dong
1670c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;*****************************************************************
1680c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
1690c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Transpose the matrix in order to perform row ops as column ops
1700c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Input:   in[][] = original matrix
1710c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Output:  trRow[][]= transposed matrix
1720c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Step1: Obtain the LL part of the transposed matrix
1730c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Step2: Obtain the HL part
1740c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Step3: Obtain the LH part
1750c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Step4: Obtain the HH part
1760c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
        ;// Notation below: "T[a b] = Src[x y]" means the destination register
        ;// holds transposed elements a (high halfword) and b (low halfword),
        ;// taken from source elements x and y. Element numbers follow the
        ;// annotations on the in* register definitions.
        ;// The diagrams mark the positions being produced in the transposed
        ;// matrix.
1770c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;*****************************************************************
1780c1bc742181ded4930842b46e9507372f0b1b963James Dong
1790c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// LL 2x2 transposed matrix
1800c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//   d0 d1 - -
1810c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//   d4 d5 - -
1820c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//   -  -  - -
1830c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//   -  -  - -
1840c1bc742181ded4930842b46e9507372f0b1b963James Dong
1850c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   trRow10,in10,in00,ASR #16               ;// T[5 4] = Src[5 1]   (in10.hi : in00.hi); written to r10 so in00/in10 stay intact
1860c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   trRow00,in00,in10,LSL #16               ;// T[1 0] = Src[4 0]   (in10.lo : in00.lo); overwrites in00 (r2)
1870c1bc742181ded4930842b46e9507372f0b1b963James Dong
1880c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// HL 2x2 transposed matrix
1890c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//    -   -   - -
1900c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//    -   -   - -
1910c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//    d8  d9  - -
1920c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//   d12 d13  - -
1930c1bc742181ded4930842b46e9507372f0b1b963James Dong
1940c1bc742181ded4930842b46e9507372f0b1b963James Dong
1950c1bc742181ded4930842b46e9507372f0b1b963James Dong         PKHTB   trRow30,in12,in02,ASR #16              ;// T[13 12] = Src[7 3] (in12.hi : in02.hi)
1960c1bc742181ded4930842b46e9507372f0b1b963James Dong         PKHBT   trRow20,in02,in12,LSL #16              ;// T[9 8] = Src[6 2]   (in12.lo : in02.lo)
1970c1bc742181ded4930842b46e9507372f0b1b963James Dong
1980c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// LH 2x2 transposed matrix
1990c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//   - - d2 d3
2000c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//   - - d6 d7
2010c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//   - - -  -
2020c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//   - - -  -
2030c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// trRow02/trRow12 reuse r3/r5 (in02/in12), which were fully
        ;// consumed by the HL step above.
2040c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   trRow02,in20,in30,LSL #16               ;// T[3 2] = Src[12 8]  (in30.lo : in20.lo)
2050c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   trRow12,in30,in20,ASR #16               ;// T[7 6] = Src[13 9]  (in30.hi : in20.hi)
2060c1bc742181ded4930842b46e9507372f0b1b963James Dong
2070c1bc742181ded4930842b46e9507372f0b1b963James Dong
2080c1bc742181ded4930842b46e9507372f0b1b963James Dong
2090c1bc742181ded4930842b46e9507372f0b1b963James Dong
2100c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// HH 2x2 transposed matrix
2110c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//    - -   -   -
2120c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//    - -   -   -
2130c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//    - -  d10 d11
2140c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//    - -  d14 d15
2150c1bc742181ded4930842b46e9507372f0b1b963James Dong
2160c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   trRow32,in32,in22,ASR #16               ;// T[15 14] = Src[15 11] (in32.hi : in22.hi); written to r14 so in22 stays intact
2170c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   trRow22,in22,in32,LSL #16               ;// T[11 10] = Src[14 10] (in32.lo : in22.lo); overwrites in22 (r7)
2180c1bc742181ded4930842b46e9507372f0b1b963James Dong
2190c1bc742181ded4930842b46e9507372f0b1b963James Dong
2200c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;****************************************
2210c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Row Operations (Performed on columns)
2220c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;****************************************
2230c1bc742181ded4930842b46e9507372f0b1b963James Dong
2240c1bc742181ded4930842b46e9507372f0b1b963James Dong
2250c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// SIMD operations on first two columns(two rows of the original matrix)
        ;// Here d0..d3 are trRow00, trRow10, trRow20, trRow30; e0-e3 live in
        ;// r4, r6, r8, r9, whose in* contents were consumed by the transpose.
2260c1bc742181ded4930842b46e9507372f0b1b963James Dong
2270c1bc742181ded4930842b46e9507372f0b1b963James Dong
2280c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16      e0, trRow00,trRow20                   ;//  e0 = d0 + d2
2290c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16    e1, trRow00,trRow20                   ;//  e1 = d0 - d2
2300c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16   e2, trRow10,constZero                 ;//  e2 = d1>>1 (halving add with constZero, a register holding 0)
2310c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16   e3, trRow30,constZero                 ;//  e3 = d3>>1; both shifts are issued before either is used (original note: avoids pipeline stalls for e2 and e3)
2320c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16    e2, e2, trRow30                       ;//  e2 = (d1>>1) - d3
2330c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16    e3, e3, trRow10                       ;//  e3 = d1 + (d3>>1)
2340c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16    rowOp00, e0, e3                       ;//  f0 = e0 + e3
2350c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16    rowOp10, e1, e2                       ;//  f1 = e1 + e2
2360c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16    rowOp20, e1, e2                       ;//  f2 = e1 - e2
2370c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16    rowOp30, e0, e3                       ;//  f3 = e0 - e3
2380c1bc742181ded4930842b46e9507372f0b1b963James Dong
2390c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// SIMD operations on next two columns(next two rows of the original matrix)
        ;// Same butterfly with d0..d3 = trRow02, trRow12, trRow22, trRow32.
2400c1bc742181ded4930842b46e9507372f0b1b963James Dong
2410c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16      e0, trRow02,trRow22                   ;//  e0 = d0 + d2
2420c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16    e1, trRow02,trRow22                   ;//  e1 = d0 - d2
2430c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16   e2, trRow12,constZero                 ;//  e2 = d1>>1 (constZero is a register holding 0)
2440c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16   e3, trRow32,constZero                 ;//  e3 = d3>>1
2450c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16    e2, e2, trRow32                       ;//  e2 = (d1>>1) - d3
2460c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16    e3, e3, trRow12                       ;//  e3 = d1 + (d3>>1)
2470c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16    rowOp02, e0, e3                       ;//  f0 = e0 + e3
2480c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16    rowOp12, e1, e2                       ;//  f1 = e1 + e2
2490c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16    rowOp22, e1, e2                       ;//  f2 = e1 - e2
2500c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16    rowOp32, e0, e3                       ;//  f3 = e0 - e3
2510c1bc742181ded4930842b46e9507372f0b1b963James Dong
2520c1bc742181ded4930842b46e9507372f0b1b963James Dong
2530c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;*****************************************************************
2540c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Transpose the resultant matrix
2550c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Input:  rowOp[][]
2560c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Output: trCol[][]
        ;// Same PKHTB/PKHBT pattern as the first transpose; the results go
        ;// back into r2-r9.
2570c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;*****************************************************************
2580c1bc742181ded4930842b46e9507372f0b1b963James Dong
2590c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// LL 2x2 transposed matrix
2600c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//   d0 d1 - -
2610c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//   d4 d5 - -
2620c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//   -  -  - -
2630c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//   -  -  - -
2640c1bc742181ded4930842b46e9507372f0b1b963James Dong
2650c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   trCol10,rowOp10,rowOp00,ASR #16           ;// trCol10 = rowOp10.hi : rowOp00.hi
2660c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   trCol00,rowOp00,rowOp10,LSL #16           ;// trCol00 = rowOp10.lo : rowOp00.lo; overwrites rowOp00 (r2)
2670c1bc742181ded4930842b46e9507372f0b1b963James Dong
2680c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// HL 2x2 transposed matrix
2690c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//    -   -   - -
2700c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//    -   -   - -
2710c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//    d8  d9  - -
2720c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//   d12 d13  - -
2730c1bc742181ded4930842b46e9507372f0b1b963James Dong
2740c1bc742181ded4930842b46e9507372f0b1b963James Dong
2750c1bc742181ded4930842b46e9507372f0b1b963James Dong         PKHTB   trCol30,rowOp12,rowOp02,ASR #16          ;// trCol30 = rowOp12.hi : rowOp02.hi
2760c1bc742181ded4930842b46e9507372f0b1b963James Dong         PKHBT   trCol20,rowOp02,rowOp12,LSL #16          ;// trCol20 = rowOp12.lo : rowOp02.lo
2770c1bc742181ded4930842b46e9507372f0b1b963James Dong
2780c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// LH 2x2 transposed matrix
2790c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//   - - d2 d3
2800c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//   - - d6 d7
2810c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//   - - -  -
2820c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//   - - -  -
2830c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// trCol02/trCol12 reuse r3/r5 (rowOp02/rowOp12), which were fully
        ;// consumed by the HL step above.
2840c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   trCol02,rowOp20,rowOp30,LSL #16           ;// trCol02 = rowOp30.lo : rowOp20.lo
2850c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   trCol12,rowOp30,rowOp20,ASR #16           ;// trCol12 = rowOp30.hi : rowOp20.hi
2860c1bc742181ded4930842b46e9507372f0b1b963James Dong
2870c1bc742181ded4930842b46e9507372f0b1b963James Dong
2880c1bc742181ded4930842b46e9507372f0b1b963James Dong
2890c1bc742181ded4930842b46e9507372f0b1b963James Dong
2900c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// HH 2x2 transposed matrix
2910c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//    - -   -   -
2920c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//    - -   -   -
2930c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//    - -  d10 d11
2940c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//    - -  d14 d15
2950c1bc742181ded4930842b46e9507372f0b1b963James Dong
2960c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   trCol32,rowOp32,rowOp22,ASR #16            ;// trCol32 = rowOp32.hi : rowOp22.hi; written to r9 so rowOp22 stays intact
2970c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   trCol22,rowOp22,rowOp32,LSL #16            ;// trCol22 = rowOp32.lo : rowOp22.lo; overwrites rowOp22 (r7)
2980c1bc742181ded4930842b46e9507372f0b1b963James Dong
2990c1bc742181ded4930842b46e9507372f0b1b963James Dong
3000c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;*******************************
3010c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Column Operations
3020c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;*******************************
3030c1bc742181ded4930842b46e9507372f0b1b963James Dong
3040c1bc742181ded4930842b46e9507372f0b1b963James Dong
3050c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// SIMD operations on first two columns
        ;// Same butterfly as the row pass, with d0..d3 = trCol00, trCol10,
        ;// trCol20, trCol30 and the intermediates g0-g3 in r10, r11, r12, r14.
3060c1bc742181ded4930842b46e9507372f0b1b963James Dong
3070c1bc742181ded4930842b46e9507372f0b1b963James Dong
3080c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16      g0, trCol00,trCol20                     ;// g0 = d0 + d2
3090c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16    g1, trCol00,trCol20                     ;// g1 = d0 - d2
3100c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16   g2, trCol10,constZero                     ;// g2 = d1>>1 (constZero is a register holding 0)
3110c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16   g3, trCol30,constZero                     ;// g3 = d3>>1
3120c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16    g2, g2, trCol30                           ;// g2 = (d1>>1) - d3
3130c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16    g3, g3, trCol10                           ;// g3 = d1 + (d3>>1)
3140c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16    colOp00, g0, g3                           ;// row 0 = g0 + g3
3150c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16    colOp10, g1, g2                           ;// row 1 = g1 + g2
3160c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16    colOp20, g1, g2                           ;// row 2 = g1 - g2
3170c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16    colOp30, g0, g3                           ;// row 3 = g0 - g3
3180c1bc742181ded4930842b46e9507372f0b1b963James Dong
3190c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// SIMD operations on next two columns
3200c1bc742181ded4930842b46e9507372f0b1b963James Dong
3210c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16      g0, trCol02,trCol22                     ;// g0 = d0 + d2
3220c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16    g1, trCol02,trCol22                     ;// g1 = d0 - d2
3230c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16   g2, trCol12,constZero                     ;// g2 = d1>>1 (constZero is a register holding 0)
3240c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16   g3, trCol32,constZero                     ;// g3 = d3>>1
3250c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16    g2, g2, trCol32                           ;// g2 = (d1>>1) - d3
3260c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16    g3, g3, trCol12                           ;// g3 = d1 + (d3>>1)
3270c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16    colOp02, g0, g3                           ;// row 0 = g0 + g3
3280c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16    colOp12, g1, g2                           ;// row 1 = g1 + g2
3290c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16    colOp22, g1, g2                           ;// row 2 = g1 - g2
3300c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16    colOp32, g0, g3                           ;// row 3 = g0 - g3
3310c1bc742181ded4930842b46e9507372f0b1b963James Dong
3320c1bc742181ded4930842b46e9507372f0b1b963James Dong
3330c1bc742181ded4930842b46e9507372f0b1b963James Dong
3340c1bc742181ded4930842b46e9507372f0b1b963James Dong
3350c1bc742181ded4930842b46e9507372f0b1b963James Dong
3360c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;************************************************
3370c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Calculate final value (colOp[i][j] + 32)>>6
3380c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;************************************************
        ;// There is no per-halfword arithmetic shift by 6 in use here, so a
        ;// single 32-bit ASR #6 handles both halfwords:
        ;//  - the upper halfword is shifted correctly (sign bits shift in);
        ;//  - the lower halfword would receive the upper halfword's low 6
        ;//    bits in its bits [15:10], so those are masked off, and its own
        ;//    sign would be lost, so it is first biased by 32768 to make it
        ;//    non-negative and the shifted bias (32768>>6 = 512) is removed
        ;//    afterwards.
3390c1bc742181ded4930842b46e9507372f0b1b963James Dong
3400c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// const1: Serves dual purpose
3410c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// (1) Add #32 to both the lower and higher 16bits of the SIMD result
3420c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// (2) Convert the lower 16 bit value to an unsigned number (Add 32768)
3430c1bc742181ded4930842b46e9507372f0b1b963James Dong
3440c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     const1, =0x00208020                     ;// high halfword: +0x0020; low halfword: +0x8020 (32 + 32768)
3450c1bc742181ded4930842b46e9507372f0b1b963James Dong
3460c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     mask, =0xffff03ff                       ;// Clears bits [15:10], which after ASR #6 hold the low 6 bits of the upper halfword
3470c1bc742181ded4930842b46e9507372f0b1b963James Dong
3480c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// const2(#512): used to convert the lower 16bit number back to signed value
3490c1bc742181ded4930842b46e9507372f0b1b963James Dong
3500c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     const2,#0x200                           ;// const2 = 2^9 = 32768>>6; its upper halfword is 0, so SSUB16 leaves the upper result unchanged
3510c1bc742181ded4930842b46e9507372f0b1b963James Dong
3520c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// First Row
3530c1bc742181ded4930842b46e9507372f0b1b963James Dong
3540c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16    colOp00, colOp00, const1              ;// add rounding term (and bias on the low halfword)
3550c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16    colOp02, colOp02, const1
3560c1bc742181ded4930842b46e9507372f0b1b963James Dong        AND     colOp00, mask, colOp00, ASR #6          ;// shift the whole word by 6, then clear the bits that leaked into the low halfword
3570c1bc742181ded4930842b46e9507372f0b1b963James Dong        AND     colOp02, mask, colOp02, ASR #6
3580c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  out00,colOp00,const2                    ;// remove the shifted bias (512) from the low halfword
3590c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  out02,colOp02,const2
3600c1bc742181ded4930842b46e9507372f0b1b963James Dong
3610c1bc742181ded4930842b46e9507372f0b1b963James Dong
3620c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Second Row (same three steps as the first row)
3630c1bc742181ded4930842b46e9507372f0b1b963James Dong
3640c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16    colOp10, colOp10, const1
3650c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16    colOp12, colOp12, const1
3660c1bc742181ded4930842b46e9507372f0b1b963James Dong        AND     colOp10, mask, colOp10, ASR #6
3670c1bc742181ded4930842b46e9507372f0b1b963James Dong        AND     colOp12, mask, colOp12, ASR #6
3680c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  out10,colOp10,const2
3690c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  out12,colOp12,const2
3700c1bc742181ded4930842b46e9507372f0b1b963James Dong
3710c1bc742181ded4930842b46e9507372f0b1b963James Dong
3720c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Third Row (same three steps as the first row)
3730c1bc742181ded4930842b46e9507372f0b1b963James Dong
3740c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16    colOp20, colOp20, const1
3750c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16    colOp22, colOp22, const1
3760c1bc742181ded4930842b46e9507372f0b1b963James Dong        AND     colOp20, mask, colOp20, ASR #6
3770c1bc742181ded4930842b46e9507372f0b1b963James Dong        AND     colOp22, mask, colOp22, ASR #6
3780c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  out20,colOp20,const2
3790c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  out22,colOp22,const2
3800c1bc742181ded4930842b46e9507372f0b1b963James Dong
3810c1bc742181ded4930842b46e9507372f0b1b963James Dong
3820c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Fourth Row (same three steps as the first row)
3830c1bc742181ded4930842b46e9507372f0b1b963James Dong
3840c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16    colOp30, colOp30, const1
3850c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16    colOp32, colOp32, const1
3860c1bc742181ded4930842b46e9507372f0b1b963James Dong        AND     colOp30, mask, colOp30, ASR #6
3870c1bc742181ded4930842b46e9507372f0b1b963James Dong        AND     colOp32, mask, colOp32, ASR #6
3880c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  out30,colOp30,const2
3890c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  out32,colOp32,const2
3900c1bc742181ded4930842b46e9507372f0b1b963James Dong
3910c1bc742181ded4930842b46e9507372f0b1b963James Dong
3920c1bc742181ded4930842b46e9507372f0b1b963James Dong
3930c1bc742181ded4930842b46e9507372f0b1b963James Dong
3940c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;***************************
3950c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Store all the 4x4 pixels
3960c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;***************************
3970c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// out00..out32 are r2..r9, so the store writes the rows in order,
        ;// mirroring the register order used by the initial load.
3980c1bc742181ded4930842b46e9507372f0b1b963James Dong        STMIA   pDst,{out00,out02,out10,out12,out20,out22,out30,out32}
3990c1bc742181ded4930842b46e9507372f0b1b963James Dong
4000c1bc742181ded4930842b46e9507372f0b1b963James Dong
4010c1bc742181ded4930842b46e9507372f0b1b963James Dong
4020c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Set return value (no instruction here writes one; r0 still holds pDst)
4030c1bc742181ded4930842b46e9507372f0b1b963James Dong
4040c1bc742181ded4930842b46e9507372f0b1b963James DongEnd
4050c1bc742181ded4930842b46e9507372f0b1b963James Dong
4060c1bc742181ded4930842b46e9507372f0b1b963James Dong
4070c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Write function tail
4080c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_END
4090c1bc742181ded4930842b46e9507372f0b1b963James Dong
4100c1bc742181ded4930842b46e9507372f0b1b963James Dong    ENDIF                                                           ;//ARM1136JS
4110c1bc742181ded4930842b46e9507372f0b1b963James Dong
4120c1bc742181ded4930842b46e9507372f0b1b963James Dong
4130c1bc742181ded4930842b46e9507372f0b1b963James Dong
4140c1bc742181ded4930842b46e9507372f0b1b963James Dong
4150c1bc742181ded4930842b46e9507372f0b1b963James Dong
4160c1bc742181ded4930842b46e9507372f0b1b963James Dong
4170c1bc742181ded4930842b46e9507372f0b1b963James Dong
4180c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Guarding implementation by the processor name
4190c1bc742181ded4930842b46e9507372f0b1b963James Dong
4200c1bc742181ded4930842b46e9507372f0b1b963James Dong
42178e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar    END
422