;//
;//
;// File Name:  armVCM4P10_TransformResidual4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Description:
;// Transform Residual 4x4 Coefficients
;//
;//


;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS

;// Import symbols required from other files
;// (For example tables)




;// Set debugging level
;//DEBUG_ON    SETL {TRUE}



;// Guarding implementation by the processor name

    IF  ARM1136JS

;//-----------------------------------------------------------------------
;// Register aliases
;//
;// Several aliases below name the SAME physical register. Each alias is
;// only meaningful during its own stage of the routine, so the order of
;// the instructions in the function body must not be changed without
;// re-checking which alias is live in which register.
;//
;// Naming: <stage>RC = row R, starting column C (0 or 2) of the 4x4
;// matrix at that stage; each register packs two 16-bit elements.
;//-----------------------------------------------------------------------

;// Input Registers
pDst                RN  0
pSrc                RN  1

;// Output Registers


;// Local Scratch Registers

;// Packed input values (two 16-bit elements per register)
in00                RN  2                   ;// Src[0] & Src[1]
in02                RN  3                   ;// Src[2] & Src[3]
in10                RN  4                   ;// Src[4] & Src[5]
in12                RN  5                   ;// Src[6] & Src[7]
in20                RN  6                   ;// Src[8] & Src[9]
in22                RN  7                   ;// Src[10] & Src[11]
in30                RN  8                   ;// Src[12] & Src[13]
in32                RN  9                   ;// Src[14] & Src[15]

;// Transpose for row operations (rows to columns)
trRow00             RN  2
trRow10             RN  10
trRow02             RN  3
trRow12             RN  5
trRow20             RN  11
trRow30             RN  12
trRow32             RN  14
trRow22             RN  7

;// Intermediate calculations (row stage)
e0                  RN  4
e1                  RN  6
e2                  RN  8
e3                  RN  9
constZero           RN  1                   ;// shares r1 with pSrc: only valid after the load

;// Row operated values
rowOp00             RN  2
rowOp10             RN  10
rowOp20             RN  11
rowOp30             RN  12
rowOp02             RN  3
rowOp12             RN  5
rowOp22             RN  7
rowOp32             RN  14

;// Transpose for column operations
trCol00             RN  2
trCol02             RN  3
trCol10             RN  4
trCol12             RN  5
trCol20             RN  6
trCol22             RN  7
trCol30             RN  8
trCol32             RN  9

;// Intermediate calculations (column stage)
g0                  RN  10
g1                  RN  11
g2                  RN  12
g3                  RN  14

;// Column operated values
colOp00             RN  2
colOp02             RN  3
colOp10             RN  4
colOp12             RN  5
colOp20             RN  6
colOp22             RN  7
colOp30             RN  8
colOp32             RN  9


;// Temporary scratch variables and constants for the final rounding stage
temp1               RN  10                  ;// not referenced by the code in this file
const1              RN  11
const2              RN  12
mask                RN  14

;// Output values
out00               RN  2
out02               RN  3
out10               RN  4
out12               RN  5
out20               RN  6
out22               RN  7
out30               RN  8
out32               RN  9



    ;// Allocate stack memory required by the function
    ;// (none: the routine works entirely in registers)

;//-----------------------------------------------------------------------
;// armVCM4P10_TransformResidual4x4
;//
;// Applies a two-pass (rows, then columns) 4x4 butterfly transform to a
;// block of sixteen 16-bit values and rounds every result as
;// (x + 32) >> 6.
;//
;// In:    r0 = pDst   receives 8 words (16 halfwords) via one STMIA
;//        r1 = pSrc   supplies 8 words (16 halfwords) via one LDMIA
;// Out:   the code below does not set a return value and does not
;//        modify r0.
;// Uses:  r1 is overwritten with zero (constZero) once the block has
;//        been loaded; r2-r12 and r14 are used as scratch.
;//
;// All sixteen inputs are loaded before the single store at the end.
;//
;// Notation in the comments: [hi:lo] are the two 16-bit lanes of a
;// register. The lower-indexed element is taken to be in the low lane
;// (little-endian load assumed - NOTE(review): confirm for the target).
;//
;// NOTE(review): r4-r11 and r14 (lr) are clobbered here, so this routine
;// relies on M_START/M_END (armCOMM_s.h, not visible in this file) to
;// save and restore them - confirm that "r11" requests exactly that.
;// NOTE(review): LDMIA/STMIA move whole words; pSrc and pDst are
;// presumably required to be 4-byte aligned - confirm with callers.
;//-----------------------------------------------------------------------

    ;// Write function header
        M_START armVCM4P10_TransformResidual4x4,r11

        ;******************************************************************
        ;// The strategy used in implementing the transform is as follows:*
        ;// Load the 4x4 block into 8 registers                           *
        ;// Transpose the 4x4 matrix                                      *
        ;// Perform the row operations (on columns) using SIMD            *
        ;// Transpose the 4x4 result matrix                               *
        ;// Perform the column operations                                 *
        ;// Store the 4x4 block at one go                                 *
        ;******************************************************************

        ;// Load all sixteen 16-bit inputs (two per register)

        LDMIA   pSrc,{in00,in02,in10,in12,in20,in22,in30,in32}

        ;// r1 (pSrc) is no longer needed: reuse it as a zero operand.
        ;// SHADD16 x,constZero computes (x + 0) >> 1 in each 16-bit lane,
        ;// i.e. a per-lane arithmetic shift right by one.
        MOV     constZero,#0

        ;*****************************************************************
        ;//
        ;// Transpose the matrix in order to perform row ops as column ops
        ;// Input:   in[][]    = original matrix
        ;// Output:  trRow[][] = transposed matrix
        ;// Step1: Obtain the LL part of the transposed matrix
        ;// Step2: Obtain the HL part
        ;// Step3: Obtain the LH part
        ;// Step4: Obtain the HH part
        ;//
        ;// Each PKHTB is issued before the PKHBT that overwrites one of
        ;// its source registers, so the pair order must be kept.
        ;//
        ;*****************************************************************

        ;// LL 2x2 transposed matrix (positions d0 d1 / d4 d5)
        ;//   d0 d1 - -
        ;//   d4 d5 - -
        ;//   -  -  - -
        ;//   -  -  - -

        PKHTB   trRow10,in10,in00,ASR #16               ;// [d5:d4] = [Src5:Src1]
        PKHBT   trRow00,in00,in10,LSL #16               ;// [d1:d0] = [Src4:Src0]

        ;// HL 2x2 transposed matrix (positions d8 d9 / d12 d13)
        ;//    -   -   - -
        ;//    -   -   - -
        ;//    d8  d9  - -
        ;//   d12 d13  - -

        PKHTB   trRow30,in12,in02,ASR #16               ;// [d13:d12] = [Src7:Src3]
        PKHBT   trRow20,in02,in12,LSL #16               ;// [d9:d8]   = [Src6:Src2]

        ;// LH 2x2 transposed matrix (positions d2 d3 / d6 d7)
        ;//   - - d2 d3
        ;//   - - d6 d7
        ;//   - - -  -
        ;//   - - -  -

        PKHBT   trRow02,in20,in30,LSL #16               ;// [d3:d2] = [Src12:Src8]
        PKHTB   trRow12,in30,in20,ASR #16               ;// [d7:d6] = [Src13:Src9]

        ;// HH 2x2 transposed matrix (positions d10 d11 / d14 d15)
        ;//    - -   -   -
        ;//    - -   -   -
        ;//    - -  d10 d11
        ;//    - -  d14 d15

        PKHTB   trRow32,in32,in22,ASR #16               ;// [d15:d14] = [Src15:Src11]
        PKHBT   trRow22,in22,in32,LSL #16               ;// [d11:d10] = [Src14:Src10]


        ;****************************************
        ;// Row Operations (Performed on columns)
        ;****************************************

        ;// Butterfly applied to each lane, with d0..d3 = trRow00, trRow10,
        ;// trRow20, trRow30 (and the x2 registers for the second half):
        ;//   e0 = d0 + d2            f0 = e0 + e3
        ;//   e1 = d0 - d2            f1 = e1 + e2
        ;//   e2 = (d1 >> 1) - d3     f2 = e1 - e2
        ;//   e3 = d1 + (d3 >> 1)     f3 = e0 - e3

        ;// SIMD operations on first two columns (two rows of the original matrix)

        SADD16  e0, trRow00, trRow20                    ;//  e0 = d0 + d2
        SSUB16  e1, trRow00, trRow20                    ;//  e1 = d0 - d2
        SHADD16 e2, trRow10, constZero                  ;//  d1 >> 1
        SHADD16 e3, trRow30, constZero                  ;//  d3 >> 1 (both halvings first: avoids stalls on e2/e3)
        SSUB16  e2, e2, trRow30                         ;//  e2 = (d1>>1) - d3
        SADD16  e3, e3, trRow10                         ;//  e3 = d1 + (d3>>1)
        SADD16  rowOp00, e0, e3                         ;//  f0 = e0 + e3
        SADD16  rowOp10, e1, e2                         ;//  f1 = e1 + e2
        SSUB16  rowOp20, e1, e2                         ;//  f2 = e1 - e2
        SSUB16  rowOp30, e0, e3                         ;//  f3 = e0 - e3

        ;// SIMD operations on next two columns (next two rows of the original matrix)

        SADD16  e0, trRow02, trRow22                    ;//  e0 = d0 + d2
        SSUB16  e1, trRow02, trRow22                    ;//  e1 = d0 - d2
        SHADD16 e2, trRow12, constZero                  ;//  d1 >> 1
        SHADD16 e3, trRow32, constZero                  ;//  d3 >> 1
        SSUB16  e2, e2, trRow32                         ;//  e2 = (d1>>1) - d3
        SADD16  e3, e3, trRow12                         ;//  e3 = d1 + (d3>>1)
        SADD16  rowOp02, e0, e3                         ;//  f0 = e0 + e3
        SADD16  rowOp12, e1, e2                         ;//  f1 = e1 + e2
        SSUB16  rowOp22, e1, e2                         ;//  f2 = e1 - e2
        SSUB16  rowOp32, e0, e3                         ;//  f3 = e0 - e3


        ;*****************************************************************
        ;// Transpose the resultant matrix
        ;// Input:  rowOp[][]
        ;// Output: trCol[][]
        ;//
        ;// f[k] below is the k-th 16-bit element of the rowOp matrix in
        ;// register order: rowOp00 = [f1:f0], rowOp02 = [f3:f2],
        ;// rowOp10 = [f5:f4], ... rowOp32 = [f15:f14].
        ;*****************************************************************

        ;// LL 2x2 transposed matrix
        ;//   d0 d1 - -
        ;//   d4 d5 - -
        ;//   -  -  - -
        ;//   -  -  - -

        PKHTB   trCol10,rowOp10,rowOp00,ASR #16         ;// [d5:d4] = [f5:f1]
        PKHBT   trCol00,rowOp00,rowOp10,LSL #16         ;// [d1:d0] = [f4:f0]

        ;// HL 2x2 transposed matrix
        ;//    -   -   - -
        ;//    -   -   - -
        ;//    d8  d9  - -
        ;//   d12 d13  - -

        PKHTB   trCol30,rowOp12,rowOp02,ASR #16         ;// [d13:d12] = [f7:f3]
        PKHBT   trCol20,rowOp02,rowOp12,LSL #16         ;// [d9:d8]   = [f6:f2]

        ;// LH 2x2 transposed matrix
        ;//   - - d2 d3
        ;//   - - d6 d7
        ;//   - - -  -
        ;//   - - -  -

        PKHBT   trCol02,rowOp20,rowOp30,LSL #16         ;// [d3:d2] = [f12:f8]
        PKHTB   trCol12,rowOp30,rowOp20,ASR #16         ;// [d7:d6] = [f13:f9]

        ;// HH 2x2 transposed matrix
        ;//    - -   -   -
        ;//    - -   -   -
        ;//    - -  d10 d11
        ;//    - -  d14 d15

        PKHTB   trCol32,rowOp32,rowOp22,ASR #16         ;// [d15:d14] = [f15:f11]
        PKHBT   trCol22,rowOp22,rowOp32,LSL #16         ;// [d11:d10] = [f14:f10]


        ;*******************************
        ;// Column Operations
        ;*******************************

        ;// Same butterfly as the row stage, with d0..d3 = trCol00,
        ;// trCol10, trCol20, trCol30 (then the x2 registers).

        ;// SIMD operations on first two columns

        SADD16  g0, trCol00, trCol20                    ;//  g0 = d0 + d2
        SSUB16  g1, trCol00, trCol20                    ;//  g1 = d0 - d2
        SHADD16 g2, trCol10, constZero                  ;//  d1 >> 1
        SHADD16 g3, trCol30, constZero                  ;//  d3 >> 1
        SSUB16  g2, g2, trCol30                         ;//  g2 = (d1>>1) - d3
        SADD16  g3, g3, trCol10                         ;//  g3 = d1 + (d3>>1)
        SADD16  colOp00, g0, g3                         ;//  row 0 = g0 + g3
        SADD16  colOp10, g1, g2                         ;//  row 1 = g1 + g2
        SSUB16  colOp20, g1, g2                         ;//  row 2 = g1 - g2
        SSUB16  colOp30, g0, g3                         ;//  row 3 = g0 - g3

        ;// SIMD operations on next two columns

        SADD16  g0, trCol02, trCol22                    ;//  g0 = d0 + d2
        SSUB16  g1, trCol02, trCol22                    ;//  g1 = d0 - d2
        SHADD16 g2, trCol12, constZero                  ;//  d1 >> 1
        SHADD16 g3, trCol32, constZero                  ;//  d3 >> 1
        SSUB16  g2, g2, trCol32                         ;//  g2 = (d1>>1) - d3
        SADD16  g3, g3, trCol12                         ;//  g3 = d1 + (d3>>1)
        SADD16  colOp02, g0, g3                         ;//  row 0 = g0 + g3
        SADD16  colOp12, g1, g2                         ;//  row 1 = g1 + g2
        SSUB16  colOp22, g1, g2                         ;//  row 2 = g1 - g2
        SSUB16  colOp32, g0, g3                         ;//  row 3 = g0 - g3


        ;************************************************
        ;// Calculate final value (colOp[i][j] + 32)>>6
        ;************************************************

        ;// Both lanes of a register are rounded with ONE 32-bit ASR:
        ;//
        ;//  1. SADD16 with const1 = [0x0020:0x8020]
        ;//       hi lane += 32
        ;//       lo lane += 32 + 0x8000   (bias: lo lane becomes a
        ;//                                 non-negative unsigned value)
        ;//  2. "ASR #6" on the whole word
        ;//       hi lane: its sign is the word's sign, so it receives a
        ;//                correct arithmetic shift.
        ;//       lo lane: shifted as an unsigned value; the six low bits
        ;//                of the hi lane spill into bits 15..10 and are
        ;//                cleared by mask = 0xffff03ff.
        ;//  3. SSUB16 with const2 = [0x0000:0x0200]
        ;//       removes the shifted bias (0x8000 >> 6 = 0x200) from the
        ;//       lo lane only.
        ;//
        ;// The bias trick is exact as long as colOp + 32 fits in a signed
        ;// 16-bit lane.
        ;// g0-g3 are dead here, so r11, r12 and r14 are free for constants.

        LDR     const1, =0x00208020                     ;// [+32 : +32 + 0x8000]

        LDR     mask, =0xffff03ff                       ;// clears bits 15..10 of the lo lane

        MOV     const2,#0x200                           ;// 2^9 = 0x8000 >> 6, lo-lane bias after the shift

        ;// First Row

        SADD16  colOp00, colOp00, const1
        SADD16  colOp02, colOp02, const1
        AND     colOp00, mask, colOp00, ASR #6
        AND     colOp02, mask, colOp02, ASR #6
        SSUB16  out00,colOp00,const2
        SSUB16  out02,colOp02,const2


        ;// Second Row

        SADD16  colOp10, colOp10, const1
        SADD16  colOp12, colOp12, const1
        AND     colOp10, mask, colOp10, ASR #6
        AND     colOp12, mask, colOp12, ASR #6
        SSUB16  out10,colOp10,const2
        SSUB16  out12,colOp12,const2


        ;// Third Row

        SADD16  colOp20, colOp20, const1
        SADD16  colOp22, colOp22, const1
        AND     colOp20, mask, colOp20, ASR #6
        AND     colOp22, mask, colOp22, ASR #6
        SSUB16  out20,colOp20,const2
        SSUB16  out22,colOp22,const2


        ;// Fourth Row

        SADD16  colOp30, colOp30, const1
        SADD16  colOp32, colOp32, const1
        AND     colOp30, mask, colOp30, ASR #6
        AND     colOp32, mask, colOp32, ASR #6
        SSUB16  out30,colOp30,const2
        SSUB16  out32,colOp32,const2


        ;***************************
        ;// Store all sixteen results
        ;***************************

        STMIA   pDst,{out00,out02,out10,out12,out20,out22,out30,out32}


        ;// Set return value
        ;// (nothing to set: no result register is written by this routine)

End


        ;// Write function tail
        M_END

    ENDIF                                                           ;//ARM1136JS


;// Guarding implementation by the processor name


    END