;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_TransformResidual4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;//
;//
;//
;// Description:
;// Transform Residual 4x4 Coefficients
;//
;// Syntax:  ARM armasm (RVCT); symbols start in column 0, directives and
;//          instructions are indented.
;// Target:  ARMv6 media (SIMD) instructions, built only for the ARM1136JS
;//          variant (see the IF guard below).
;//


;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS

;// Import symbols required from other files
;// (For example tables)
;// (none are imported by this file)




;// Set debugging level
;//DEBUG_ON    SETL {TRUE}



;// Guarding implementation by the processor name

    IF  ARM1136JS

;//--------------------------------------------------------------------
;// Register allocation
;//
;// Every alias below maps a stage-specific name onto one of r1-r12/r14.
;// Several aliases share a physical register; a register is only reused
;// once the value it held for the previous stage has been consumed.
;// Naming: <stage><R><C> holds two adjacent 16-bit values packed in one
;// 32-bit register (low halfword first in memory order).
;//--------------------------------------------------------------------

;// Input Registers
pDst                RN  0       ;// destination pointer (8 words are stored)
pSrc                RN  1       ;// source pointer (8 words are loaded)

;// Output Registers
;// (none)


;// Local Scratch Registers

;// Packed input coefficients
in00                RN  2       ;// Src[0]  & Src[1]
in02                RN  3       ;// Src[2]  & Src[3]
in10                RN  4       ;// Src[4]  & Src[5]
in12                RN  5       ;// Src[6]  & Src[7]
in20                RN  6       ;// Src[8]  & Src[9]
in22                RN  7       ;// Src[10] & Src[11]
in30                RN  8       ;// Src[12] & Src[13]
in32                RN  9       ;// Src[14] & Src[15]

;// Transpose for row operations (rows to columns)
trRow00             RN  2
trRow10             RN  10
trRow02             RN  3
trRow12             RN  5
trRow20             RN  11
trRow30             RN  12
trRow32             RN  14
trRow22             RN  7

;// Intermediate calculations (row pass)
e0                  RN  4
e1                  RN  6
e2                  RN  8
e3                  RN  9
constZero           RN  1       ;// shares r1 with pSrc; set to 0 after the load

;// Row-operated values
rowOp00             RN  2
rowOp10             RN  10
rowOp20             RN  11
rowOp30             RN  12
rowOp02             RN  3
rowOp12             RN  5
rowOp22             RN  7
rowOp32             RN  14

;// Transpose for column operations
trCol00             RN  2
trCol02             RN  3
trCol10             RN  4
trCol12             RN  5
trCol20             RN  6
trCol22             RN  7
trCol30             RN  8
trCol32             RN  9

;// Intermediate calculations (column pass)
g0                  RN  10
g1                  RN  11
g2                  RN  12
g3                  RN  14

;// Column-operated values
colOp00             RN  2
colOp02             RN  3
colOp10             RN  4
colOp12             RN  5
colOp20             RN  6
colOp22             RN  7
colOp30             RN  8
colOp32             RN  9


temp1               RN  10      ;// Temporary scratch variable (not referenced below)
const1              RN  11      ;// rounding / bias constant for the final shift
const2              RN  12      ;// removes the bias from the low halfword
mask                RN  14      ;// clears bits shifted across the halfword boundary

;// Output values
out00               RN  2
out02               RN  3
out10               RN  4
out12               RN  5
out20               RN  6
out22               RN  7
out30               RN  8
out32               RN  9



        ;// Allocate stack memory required by the function
        ;// (no stack locals are declared)

        ;//------------------------------------------------------------
        ;// armVCM4P10_TransformResidual4x4
        ;//
        ;// In:    r0 = pDst, r1 = pSrc.  Sixteen 16-bit values are read
        ;//        from pSrc with one LDMIA and sixteen are written to
        ;//        pDst with one STMIA.
        ;//        NOTE(review): word alignment of both pointers is
        ;//        presumably required by LDMIA/STMIA - confirm with callers.
        ;// Out:   the 4x4 block at pDst.  This code does not write r0.
        ;// Uses:  r1-r12 and r14 as scratch.
        ;//        NOTE(review): M_START/M_END come from armCOMM_s.h; the
        ;//        "r11" argument presumably makes them save/restore r4-r11
        ;//        and the link register (r14 is overwritten below) - confirm.
        ;// Note:  every source word is loaded before any store, so the
        ;//        routine never reads data it has already overwritten.
        ;//        SADD16/SSUB16 are the non-saturating forms: each 16-bit
        ;//        lane wraps on overflow.
        ;//------------------------------------------------------------

        ;// Write function header
        M_START armVCM4P10_TransformResidual4x4,r11

        ;******************************************************************
        ;// The strategy used in implementing the transform is as follows:*
        ;// Load the 4x4 block into 8 registers                           *
        ;// Transpose the 4x4 matrix                                      *
        ;// Perform the row operations (on columns) using SIMD            *
        ;// Transpose the 4x4 result matrix                               *
        ;// Perform the column operations                                 *
        ;// Store the 4x4 block in one go                                 *
        ;******************************************************************

        ;// Load all the 4x4 values

        LDMIA   pSrc,{in00,in02,in10,in12,in20,in22,in30,in32}

        ;// pSrc is dead from here on; its register becomes constZero.
        ;// SHADD16 x,constZero computes (x + 0) >> 1 per lane, i.e. a
        ;// per-halfword arithmetic shift right by one.
        MOV     constZero,#0

        ;*****************************************************************
        ;//
        ;// Transpose the matrix in order to perform row ops as column ops
        ;// Input:  in[][]    = original matrix (two row neighbours/register)
        ;// Output: trRow[][] = transposed matrix
        ;//
        ;// Result layout, written [high halfword : low halfword]:
        ;//   trRowN0 = [Src[4+N]  : Src[N]  ]   column N of rows 0 and 1
        ;//   trRowN2 = [Src[12+N] : Src[8+N]]   column N of rows 2 and 3
        ;//
        ;*****************************************************************

        ;// Columns 0 and 1 of rows 0 and 1

        PKHTB   trRow10,in10,in00,ASR #16               ;// [Src5  : Src1 ]
        PKHBT   trRow00,in00,in10,LSL #16               ;// [Src4  : Src0 ]

        ;// Columns 2 and 3 of rows 0 and 1

        PKHTB   trRow30,in12,in02,ASR #16               ;// [Src7  : Src3 ]
        PKHBT   trRow20,in02,in12,LSL #16               ;// [Src6  : Src2 ]

        ;// Columns 0 and 1 of rows 2 and 3

        PKHBT   trRow02,in20,in30,LSL #16               ;// [Src12 : Src8 ]
        PKHTB   trRow12,in30,in20,ASR #16               ;// [Src13 : Src9 ]

        ;// Columns 2 and 3 of rows 2 and 3

        PKHTB   trRow32,in32,in22,ASR #16               ;// [Src15 : Src11]
        PKHBT   trRow22,in22,in32,LSL #16               ;// [Src14 : Src10]


        ;****************************************
        ;// Row Operations (Performed on columns)
        ;****************************************
        ;// With d0..d3 the four values of one row:
        ;//   e0 = d0 + d2          f0 = e0 + e3
        ;//   e1 = d0 - d2          f1 = e1 + e2
        ;//   e2 = (d1>>1) - d3     f2 = e1 - e2
        ;//   e3 = d1 + (d3>>1)     f3 = e0 - e3


        ;// SIMD operations on rows 0 and 1 of the original matrix

        SADD16      e0, trRow00,trRow20                 ;//  e0 = d0 + d2
        SSUB16      e1, trRow00,trRow20                 ;//  e1 = d0 - d2
        SHADD16     e2, trRow10,constZero               ;// (d1>>1)
        SHADD16     e3, trRow30,constZero               ;// (d3>>1); original note: issued early to avoid stalls on e2/e3
        SSUB16      e2, e2, trRow30                     ;//  e2 = (d1>>1) - d3
        SADD16      e3, e3, trRow10                     ;//  e3 = d1 + (d3>>1)
        SADD16      rowOp00, e0, e3                     ;//  f0 = e0 + e3
        SADD16      rowOp10, e1, e2                     ;//  f1 = e1 + e2
        SSUB16      rowOp20, e1, e2                     ;//  f2 = e1 - e2
        SSUB16      rowOp30, e0, e3                     ;//  f3 = e0 - e3

        ;// SIMD operations on rows 2 and 3 of the original matrix

        SADD16      e0, trRow02,trRow22                 ;//  e0 = d0 + d2
        SSUB16      e1, trRow02,trRow22                 ;//  e1 = d0 - d2
        SHADD16     e2, trRow12,constZero               ;// (d1>>1)
        SHADD16     e3, trRow32,constZero               ;// (d3>>1)
        SSUB16      e2, e2, trRow32                     ;//  e2 = (d1>>1) - d3
        SADD16      e3, e3, trRow12                     ;//  e3 = d1 + (d3>>1)
        SADD16      rowOp02, e0, e3                     ;//  f0 = e0 + e3
        SADD16      rowOp12, e1, e2                     ;//  f1 = e1 + e2
        SSUB16      rowOp22, e1, e2                     ;//  f2 = e1 - e2
        SSUB16      rowOp32, e0, e3                     ;//  f3 = e0 - e3


        ;*****************************************************************
        ;// Transpose the resultant matrix
        ;// Input:  rowOp[][]  (rowOpN0 = column N of rows 0,1;
        ;//                     rowOpN2 = column N of rows 2,3)
        ;// Output: trCol[][]  (trColR0 = row R, columns 0,1;
        ;//                     trColR2 = row R, columns 2,3)
        ;// Same PKH pattern as the first transpose, so the block is back
        ;// in its natural row-major packing afterwards.
        ;*****************************************************************

        ;// Rows 0 and 1, columns 0 and 1

        PKHTB   trCol10,rowOp10,rowOp00,ASR #16         ;// row 1, cols [1 : 0]
        PKHBT   trCol00,rowOp00,rowOp10,LSL #16         ;// row 0, cols [1 : 0]

        ;// Rows 2 and 3, columns 0 and 1

        PKHTB   trCol30,rowOp12,rowOp02,ASR #16         ;// row 3, cols [1 : 0]
        PKHBT   trCol20,rowOp02,rowOp12,LSL #16         ;// row 2, cols [1 : 0]

        ;// Rows 0 and 1, columns 2 and 3

        PKHBT   trCol02,rowOp20,rowOp30,LSL #16         ;// row 0, cols [3 : 2]
        PKHTB   trCol12,rowOp30,rowOp20,ASR #16         ;// row 1, cols [3 : 2]

        ;// Rows 2 and 3, columns 2 and 3

        PKHTB   trCol32,rowOp32,rowOp22,ASR #16         ;// row 3, cols [3 : 2]
        PKHBT   trCol22,rowOp22,rowOp32,LSL #16         ;// row 2, cols [3 : 2]


        ;*******************************
        ;// Column Operations
        ;*******************************
        ;// Same butterfly as the row pass, now combining rows 0..3 of
        ;// each column (two columns per register).


        ;// SIMD operations on the first two columns

        SADD16      g0, trCol00,trCol20                 ;//  g0 = r0 + r2
        SSUB16      g1, trCol00,trCol20                 ;//  g1 = r0 - r2
        SHADD16     g2, trCol10,constZero               ;// (r1>>1)
        SHADD16     g3, trCol30,constZero               ;// (r3>>1)
        SSUB16      g2, g2, trCol30                     ;//  g2 = (r1>>1) - r3
        SADD16      g3, g3, trCol10                     ;//  g3 = r1 + (r3>>1)
        SADD16      colOp00, g0, g3                     ;//  row 0 = g0 + g3
        SADD16      colOp10, g1, g2                     ;//  row 1 = g1 + g2
        SSUB16      colOp20, g1, g2                     ;//  row 2 = g1 - g2
        SSUB16      colOp30, g0, g3                     ;//  row 3 = g0 - g3

        ;// SIMD operations on the next two columns

        SADD16      g0, trCol02,trCol22                 ;//  g0 = r0 + r2
        SSUB16      g1, trCol02,trCol22                 ;//  g1 = r0 - r2
        SHADD16     g2, trCol12,constZero               ;// (r1>>1)
        SHADD16     g3, trCol32,constZero               ;// (r3>>1)
        SSUB16      g2, g2, trCol32                     ;//  g2 = (r1>>1) - r3
        SADD16      g3, g3, trCol12                     ;//  g3 = r1 + (r3>>1)
        SADD16      colOp02, g0, g3                     ;//  row 0 = g0 + g3
        SADD16      colOp12, g1, g2                     ;//  row 1 = g1 + g2
        SSUB16      colOp22, g1, g2                     ;//  row 2 = g1 - g2
        SSUB16      colOp32, g0, g3                     ;//  row 3 = g0 - g3


        ;************************************************
        ;// Calculate final value (colOp[i][j] + 32)>>6
        ;************************************************
        ;// Both halfwords are shifted with a single 32-bit ASR #6:
        ;//  - the high halfword is shifted arithmetically as-is;
        ;//  - the low halfword is first biased by 32768 so that it is
        ;//    non-negative, shifted, has the 6 bits that leaked in from
        ;//    the high halfword masked off, and is then un-biased by
        ;//    32768>>6 = 512.

        ;// const1: Serves dual purpose
        ;// (1) Add #32 to both the lower and higher 16 bits of the SIMD result
        ;// (2) Convert the lower 16-bit value to an unsigned number (Add 32768)

        LDR     const1, =0x00208020

        LDR     mask, =0xffff03ff                       ;// clears bits 15:10, where the high halfword's low 6 bits land

        ;// const2(#512): used to convert the lower 16-bit number back to a signed value

        MOV     const2,#0x200                           ;// const2 = 2^9 = 32768 >> 6

        ;// First Row

        SADD16    colOp00, colOp00, const1              ;// + rounding (+ bias on low lane)
        SADD16    colOp02, colOp02, const1
        AND       colOp00, mask, colOp00, ASR #6        ;// >> 6 on both lanes, drop leaked bits
        AND       colOp02, mask, colOp02, ASR #6
        SSUB16    out00,colOp00,const2                  ;// remove bias from low lane (high lane - 0)
        SSUB16    out02,colOp02,const2


        ;// Second Row

        SADD16    colOp10, colOp10, const1
        SADD16    colOp12, colOp12, const1
        AND       colOp10, mask, colOp10, ASR #6
        AND       colOp12, mask, colOp12, ASR #6
        SSUB16    out10,colOp10,const2
        SSUB16    out12,colOp12,const2


        ;// Third Row

        SADD16    colOp20, colOp20, const1
        SADD16    colOp22, colOp22, const1
        AND       colOp20, mask, colOp20, ASR #6
        AND       colOp22, mask, colOp22, ASR #6
        SSUB16    out20,colOp20,const2
        SSUB16    out22,colOp22,const2


        ;// Fourth Row

        SADD16    colOp30, colOp30, const1
        SADD16    colOp32, colOp32, const1
        AND       colOp30, mask, colOp30, ASR #6
        AND       colOp32, mask, colOp32, ASR #6
        SSUB16    out30,colOp30,const2
        SSUB16    out32,colOp32,const2



        ;***************************
        ;// Store all the 4x4 values
        ;***************************
        ;// out00..out32 are r2..r9, so the ascending-register store order
        ;// writes the block row by row.

        STMIA   pDst,{out00,out02,out10,out12,out20,out22,out30,out32}



        ;// No return value is set by this routine.

End


        ;// Write function tail
        M_END

    ENDIF                                                    ;//ARM1136JS







;// Guarding implementation by the processor name


    END