;//
;//
;// File Name:  armVCM4P10_TransformResidual4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Description:
;// Transform Residual 4x4 Coefficients
;//
;//
;// Routine:  armVCM4P10_TransformResidual4x4
;//
;//   In:     r0 = pDst, address the 8 result words are stored to
;//           r1 = pSrc, address the 8 input words are loaded from
;//   Data:   each word carries two signed 16-bit values, so one call
;//           consumes and produces a 4x4 block of 16 halfwords.
;//   Method: transpose, 1-D butterfly on packed halfword pairs,
;//           transpose, 1-D butterfly again, then (x + 32) >> 6 on every
;//           element.
;//   Notes:  - The whole input block is loaded before anything is stored.
;//           - r1 is reused as a zero constant once the input is loaded.
;//           - r2-r12 and r14 are all written.  M_START is given "r11";
;//             NOTE(review): presumably that makes the macro preserve
;//             r4-r11 and the return address -- confirm in armCOMM_s.h.
;//           - Block transfers (LDMIA/STMIA) are used on pSrc and pDst,
;//             so both are expected to be word aligned -- TODO confirm
;//             against the callers.
;//


;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS

;// Import symbols required from other files
;// (For example tables)




;// Set debugging level
;//DEBUG_ON    SETL {TRUE}



;// Guarding implementation by the processor name

    IF ARM1136JS

;//Input Registers
pDst                RN  0
pSrc                RN  1

;//Output Registers


;//Local Scratch Registers

;// Packed input coefficients: two 16-bit values per register
in00                RN  2                   ;// Src[0]  & Src[1]
in02                RN  3                   ;// Src[2]  & Src[3]
in10                RN  4                   ;// Src[4]  & Src[5]
in12                RN  5                   ;// Src[6]  & Src[7]
in20                RN  6                   ;// Src[8]  & Src[9]
in22                RN  7                   ;// Src[10] & Src[11]
in30                RN  8                   ;// Src[12] & Src[13]
in32                RN  9                   ;// Src[14] & Src[15]

;// Transpose for row operations (rows to columns)
trRow00             RN  2
trRow10             RN  10
trRow02             RN  3
trRow12             RN  5
trRow20             RN  11
trRow30             RN  12
trRow32             RN  14
trRow22             RN  7

;// Intermediate calculations (row pass)
e0                  RN  4
e1                  RN  6
e2                  RN  8
e3                  RN  9
constZero           RN  1                   ;// shares r1 with pSrc

;// Row operated values
rowOp00             RN  2
rowOp10             RN  10
rowOp20             RN  11
rowOp30             RN  12
rowOp02             RN  3
rowOp12             RN  5
rowOp22             RN  7
rowOp32             RN  14

;// Transpose for column operations
trCol00             RN  2
trCol02             RN  3
trCol10             RN  4
trCol12             RN  5
trCol20             RN  6
trCol22             RN  7
trCol30             RN  8
trCol32             RN  9

;// Intermediate calculations (column pass)
g0                  RN  10
g1                  RN  11
g2                  RN  12
g3                  RN  14

;// Column operated values
colOp00             RN  2
colOp02             RN  3
colOp10             RN  4
colOp12             RN  5
colOp20             RN  6
colOp22             RN  7
colOp30             RN  8
colOp32             RN  9


temp1               RN  10                  ;// Temporary scratch variables
const1              RN  11
const2              RN  12
mask                RN  14

;// Output values
out00               RN  2
out02               RN  3
out10               RN  4
out12               RN  5
out20               RN  6
out22               RN  7
out30               RN  8
out32               RN  9



        ;// Allocate stack memory required by the function


        ;// Write function header
        M_START armVCM4P10_TransformResidual4x4,r11

        ;******************************************************************
        ;// The strategy used in implementing the transform is as follows:*
        ;// Load the 4x4 block into 8 registers                           *
        ;// Transpose the 4x4 matrix                                      *
        ;// Perform the row operations (on columns) using SIMD            *
        ;// Transpose the 4x4 result matrix                               *
        ;// Perform the column operations                                 *
        ;// Store the 4x4 block at one go                                 *
        ;******************************************************************

        ;// Load all 16 coefficients (8 words) in one transfer

        LDMIA   pSrc,{in00,in02,in10,in12,in20,in22,in30,in32}

        ;// pSrc (r1) is dead from here on; r1 becomes a zero constant.
        ;// SHADD16 x,constZero yields (x + 0) >> 1 in each halfword, i.e.
        ;// a signed per-halfword shift right by 1.
        MOV     constZero,#0
        ;LDR    constZero,=0x00000000

        ;*****************************************************************
        ;//
        ;// Transpose the matrix in order to perform row ops as column ops
        ;// Input:  in[][]    = original matrix
        ;// Output: trRow[][] = transposed matrix
        ;// The transpose is built one 2x2 quadrant at a time, two
        ;// PKHBT/PKHTB instructions per quadrant.
        ;// Packing notation in the comments below: [high halfword : low halfword]
        ;//
        ;*****************************************************************

        ;// Top-left 2x2 quadrant of the input
        ;//   d0   d1   -   -
        ;//   d4   d5   -   -
        ;//   -    -    -   -
        ;//   -    -    -   -

        PKHTB   trRow10,in10,in00,ASR #16               ;// trRow10 = [d5:d1]
        PKHBT   trRow00,in00,in10,LSL #16               ;// trRow00 = [d4:d0]

        ;// Top-right 2x2 quadrant of the input
        ;//   -    -    d2  d3
        ;//   -    -    d6  d7
        ;//   -    -    -   -
        ;//   -    -    -   -

        PKHTB   trRow30,in12,in02,ASR #16               ;// trRow30 = [d7:d3]
        PKHBT   trRow20,in02,in12,LSL #16               ;// trRow20 = [d6:d2]

        ;// Bottom-left 2x2 quadrant of the input
        ;//   -    -    -   -
        ;//   -    -    -   -
        ;//   d8   d9   -   -
        ;//   d12  d13  -   -

        PKHBT   trRow02,in20,in30,LSL #16               ;// trRow02 = [d12:d8]
        PKHTB   trRow12,in30,in20,ASR #16               ;// trRow12 = [d13:d9]

        ;// Bottom-right 2x2 quadrant of the input
        ;//   -    -    -   -
        ;//   -    -    -   -
        ;//   -    -    d10 d11
        ;//   -    -    d14 d15

        PKHTB   trRow32,in32,in22,ASR #16               ;// trRow32 = [d15:d11]
        PKHBT   trRow22,in22,in32,LSL #16               ;// trRow22 = [d14:d10]


        ;****************************************
        ;// Row Operations (Performed on columns)
        ;****************************************

        ;// Each packed register carries the same column of two different
        ;// input rows, so one SIMD instruction advances two rows at once.
        ;// Per row (d0..d3 -> f0..f3):
        ;//   e0 = d0 + d2          f0 = e0 + e3
        ;//   e1 = d0 - d2          f1 = e1 + e2
        ;//   e2 = (d1>>1) - d3     f2 = e1 - e2
        ;//   e3 = d1 + (d3>>1)     f3 = e0 - e3

        ;// Input rows 0 (low halfwords) and 1 (high halfwords)

        SADD16  e0, trRow00,trRow20                     ;// e0 = d0 + d2
        SSUB16  e1, trRow00,trRow20                     ;// e1 = d0 - d2
        SHADD16 e2, trRow10,constZero                   ;// e2 = d1>>1
        SHADD16 e3, trRow30,constZero                   ;// e3 = d3>>1 (issued early to avoid stalls on e2/e3)
        SSUB16  e2, e2, trRow30                         ;// e2 = (d1>>1) - d3
        SADD16  e3, e3, trRow10                         ;// e3 = d1 + (d3>>1)
        SADD16  rowOp00, e0, e3                         ;// f0 = e0 + e3
        SADD16  rowOp10, e1, e2                         ;// f1 = e1 + e2
        SSUB16  rowOp20, e1, e2                         ;// f2 = e1 - e2
        SSUB16  rowOp30, e0, e3                         ;// f3 = e0 - e3

        ;// Input rows 2 (low halfwords) and 3 (high halfwords)

        SADD16  e0, trRow02,trRow22                     ;// e0 = d0 + d2
        SSUB16  e1, trRow02,trRow22                     ;// e1 = d0 - d2
        SHADD16 e2, trRow12,constZero                   ;// e2 = d1>>1
        SHADD16 e3, trRow32,constZero                   ;// e3 = d3>>1
        SSUB16  e2, e2, trRow32                         ;// e2 = (d1>>1) - d3
        SADD16  e3, e3, trRow12                         ;// e3 = d1 + (d3>>1)
        SADD16  rowOp02, e0, e3                         ;// f0 = e0 + e3
        SADD16  rowOp12, e1, e2                         ;// f1 = e1 + e2
        SSUB16  rowOp22, e1, e2                         ;// f2 = e1 - e2
        SSUB16  rowOp32, e0, e3                         ;// f3 = e0 - e3


        ;*****************************************************************
        ;// Transpose the resultant matrix
        ;// Input:  rowOp[][]
        ;// Output: trCol[][]
        ;// Same quadrant-by-quadrant PKHBT/PKHTB pattern as the first
        ;// transpose, applied to the row-pass results.
        ;*****************************************************************

        ;// Quadrant 1: rowOp00 / rowOp10

        PKHTB   trCol10,rowOp10,rowOp00,ASR #16         ;// [rowOp10.hi : rowOp00.hi]
        PKHBT   trCol00,rowOp00,rowOp10,LSL #16         ;// [rowOp10.lo : rowOp00.lo]

        ;// Quadrant 2: rowOp02 / rowOp12

        PKHTB   trCol30,rowOp12,rowOp02,ASR #16         ;// [rowOp12.hi : rowOp02.hi]
        PKHBT   trCol20,rowOp02,rowOp12,LSL #16         ;// [rowOp12.lo : rowOp02.lo]

        ;// Quadrant 3: rowOp20 / rowOp30

        PKHBT   trCol02,rowOp20,rowOp30,LSL #16         ;// [rowOp30.lo : rowOp20.lo]
        PKHTB   trCol12,rowOp30,rowOp20,ASR #16         ;// [rowOp30.hi : rowOp20.hi]

        ;// Quadrant 4: rowOp22 / rowOp32

        PKHTB   trCol32,rowOp32,rowOp22,ASR #16         ;// [rowOp32.hi : rowOp22.hi]
        PKHBT   trCol22,rowOp22,rowOp32,LSL #16         ;// [rowOp32.lo : rowOp22.lo]


        ;*******************************
        ;// Column Operations
        ;*******************************

        ;// Same butterfly as the row pass, with g0..g3 in place of e0..e3.

        ;// SIMD operations on first two columns

        SADD16  g0, trCol00,trCol20                     ;// g0 = c0 + c2
        SSUB16  g1, trCol00,trCol20                     ;// g1 = c0 - c2
        SHADD16 g2, trCol10,constZero                   ;// g2 = c1>>1
        SHADD16 g3, trCol30,constZero                   ;// g3 = c3>>1
        SSUB16  g2, g2, trCol30                         ;// g2 = (c1>>1) - c3
        SADD16  g3, g3, trCol10                         ;// g3 = c1 + (c3>>1)
        SADD16  colOp00, g0, g3                         ;// row 0 = g0 + g3
        SADD16  colOp10, g1, g2                         ;// row 1 = g1 + g2
        SSUB16  colOp20, g1, g2                         ;// row 2 = g1 - g2
        SSUB16  colOp30, g0, g3                         ;// row 3 = g0 - g3

        ;// SIMD operations on next two columns

        SADD16  g0, trCol02,trCol22                     ;// g0 = c0 + c2
        SSUB16  g1, trCol02,trCol22                     ;// g1 = c0 - c2
        SHADD16 g2, trCol12,constZero                   ;// g2 = c1>>1
        SHADD16 g3, trCol32,constZero                   ;// g3 = c3>>1
        SSUB16  g2, g2, trCol32                         ;// g2 = (c1>>1) - c3
        SADD16  g3, g3, trCol12                         ;// g3 = c1 + (c3>>1)
        SADD16  colOp02, g0, g3                         ;// row 0 = g0 + g3
        SADD16  colOp12, g1, g2                         ;// row 1 = g1 + g2
        SSUB16  colOp22, g1, g2                         ;// row 2 = g1 - g2
        SSUB16  colOp32, g0, g3                         ;// row 3 = g0 - g3


        ;************************************************
        ;// Calculate final value (colOp[i][j] + 32)>>6
        ;************************************************

        ;// There is no packed 16-bit shift here, so both halfwords are
        ;// shifted with a single 32-bit ASR #6:
        ;//  - The high halfword comes out correct (the ASR sign-extends it).
        ;//  - The low halfword gets no sign extension and receives 6 stray
        ;//    bits from the high halfword in its bits 15..10.
        ;// Work-around for the low halfword:
        ;//  1. add 32768 first so it is non-negative when read as unsigned,
        ;//  2. shift, then clear bits 15..10 with the mask,
        ;//  3. subtract 32768>>6 = 512 to undo the bias.
        ;// NOTE(review): SADD16 wraps modulo 2^16, so this matches
        ;// (x + 32) >> 6 only while x + 32 fits in a signed halfword --
        ;// confirm the coefficient range guaranteed by callers.

        ;// const1: Serves dual purpose
        ;// (1) Add #32 to both the lower and higher 16 bits of the SIMD result
        ;// (2) Convert the lower 16 bit value to an unsigned number (Add 32768)

        LDR     const1, =0x00208020

        LDR     mask, =0xffff03ff                       ;// Keeps the high halfword and low 10 bits; drops the 6 bits shifted down into the low halfword

        ;// const2(#512): used to convert the lower 16 bit number back to signed value

        MOV     const2,#0x200                           ;// const2 = 2^9

        ;// First Row

        SADD16  colOp00, colOp00, const1
        SADD16  colOp02, colOp02, const1
        AND     colOp00, mask, colOp00, ASR #6
        AND     colOp02, mask, colOp02, ASR #6
        SSUB16  out00,colOp00,const2
        SSUB16  out02,colOp02,const2


        ;// Second Row

        SADD16  colOp10, colOp10, const1
        SADD16  colOp12, colOp12, const1
        AND     colOp10, mask, colOp10, ASR #6
        AND     colOp12, mask, colOp12, ASR #6
        SSUB16  out10,colOp10,const2
        SSUB16  out12,colOp12,const2


        ;// Third Row

        SADD16  colOp20, colOp20, const1
        SADD16  colOp22, colOp22, const1
        AND     colOp20, mask, colOp20, ASR #6
        AND     colOp22, mask, colOp22, ASR #6
        SSUB16  out20,colOp20,const2
        SSUB16  out22,colOp22,const2


        ;// Fourth Row

        SADD16  colOp30, colOp30, const1
        SADD16  colOp32, colOp32, const1
        AND     colOp30, mask, colOp30, ASR #6
        AND     colOp32, mask, colOp32, ASR #6
        SSUB16  out30,colOp30,const2
        SSUB16  out32,colOp32,const2


        ;***************************
        ;// Store all the 4x4 values
        ;***************************

        STMIA   pDst,{out00,out02,out10,out12,out20,out22,out30,out32}


        ;// Set return value

End

        ;// Write function tail
        M_END

    ENDIF                                               ;//ARM1136JS



;// Guarding implementation by the processor name


    END