;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P2_MCReconBlock_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;//
;//
;//
;// Description:
;// ARM1136JS helper macros (packed-byte averaging, word-aligned row loads
;// with realignment) and the per-predict-type handler macros used for
;// motion-compensated block prediction. Each macro carries its own header.
;//

;// Include standard headers
    INCLUDE omxtypes_s.h
    INCLUDE armCOMM_s.h

;// Import symbols required from other files

    M_VARIANTS ARM1136JS

;// ***************************************************************************
;// ARM1136JS implementation
;// ***************************************************************************
    IF  ARM1136JS

;// ***************************************************************************
;// MACRO DEFINITIONS
;// ***************************************************************************
;// Description:
;//
;//   dest[j] = (x[j] + y[j] + round) >> 1,   j=0..3
;//
;// Similar to the UHADD8 instruction, but with a rounding value of 1 added
;// to each byte sum before dividing by two, if round is 1.
;//
;// Syntax:
;// M_UHADD8R   $dest, $x, $y, $round, $mask
;//
;// Inputs:
;// $x        four packed bytes,   x[3] :  x[2]  :  x[1]  :  x[0]
;// $y        four packed bytes,   y[3] :  y[2]  :  y[1]  :  y[0]
;// $round    assembly-time constant: 0 if no rounding is to be added,
;//           1 if rounding is to be done
;// $mask     a register holding 0x80808080 (only referenced when
;//           $round is 1)
;//
;// Outputs:
;// $dest     four packed bytes,   z[3] :  z[2]  :  z[1]  :  z[0]
;//
;// Notes:
;// Rounded case, per byte lane:
;//   p - (255 - q) = p + q + 1 - 256, so UHSUB8 of one operand and the
;//   complement of the other gives ((p + q + 1) >> 1) - 128 modulo 256;
;//   the EOR with 0x80 adds the 128 back.
;// The operand that gets complemented into $dest is chosen so that $dest
;// may be the same register as $y.

    MACRO
    M_UHADD8R   $dest, $x, $y, $round, $mask
    IF $round = 1
        IF  $dest /= $y
            MVN         $dest, $x               ;// dest = ~x
            UHSUB8      $dest, $y, $dest        ;// dest = (y - ~x) >> 1
            EOR         $dest, $dest, $mask     ;// undo the -128 bias
        ELSE
            ;// $dest aliases $y: complement y instead, so x is still
            ;// intact when it is read
            MVN         $dest, $y               ;// dest = ~y
            UHSUB8      $dest, $x, $dest        ;// dest = (x - ~y) >> 1
            EOR         $dest, $dest, $mask     ;// undo the -128 bias
        ENDIF
    ELSE
        UHADD8      $dest, $x, $y               ;// dest = (x + y) >> 1
    ENDIF
    MEND
;// ***************************************************************************
;// Description:
;// Load 8 bytes from $pSrc (aligned or unaligned locations)
;//
;// Syntax:
;// M_LOAD_X    $pSrc, $srcStep, $out0, $out1, $scratch, $offset
;//
;// Inputs:
;// $pSrc       4 byte aligned source pointer to an address just less than
;//             or equal to the data location
;// $srcStep    The stride on source
;// $scratch    A scratch register, used internally for temp calculations
;// $offset     Difference of source data location to the source pointer
;//             Use when $offset != 0 (unaligned load)
;//
;// Outputs:
;// $pSrc       Incremented by $srcStep
;// $out0       four packed bytes,   z[3] :  z[2]  :  z[1]  :  z[0]
;// $out1       four packed bytes,   z[7] :  z[6]  :  z[5]  :  z[4]
;//
;// Note: {$out0, $out1, $scratch} should be registers with ascending
;// register numbering. In case offset is 0, $scratch is not modified.
;// M_LOAD_X: load one 8-byte row through a word-aligned pointer, realign it
;// by $offset bytes (little-endian byte order) and step $pSrc by $srcStep.

    MACRO
    M_LOAD_X    $pSrc, $srcStep, $out0, $out1, $scratch, $offset
        IF $offset = 0
            ;// Aligned: the two words are the row as-is
            LDM         $pSrc, {$out0, $out1}
            ADD         $pSrc, $pSrc, $srcStep
        ELSE
            ;// Unaligned: the row spans three words; the third goes to
            ;// $scratch
            LDM         $pSrc, {$out0, $out1, $scratch}
            ADD         $pSrc, $pSrc, $srcStep

            ;// Drop the $offset low bytes of each word and refill the top
            ;// from the low bytes of the following word
            MOV         $out0, $out0, LSR #(8 * ($offset))
            ORR         $out0, $out0, $out1, LSL #(32 - 8 * ($offset))
            MOV         $out1, $out1, LSR #(8 * ($offset))
            ORR         $out1, $out1, $scratch, LSL #(32 - 8 * ($offset))
        ENDIF
    MEND

;// ***************************************************************************
;// Description:
;// Loads three words for X interpolation, update pointer to next row. For
;// X interpolation, given a truncated-4byteAligned source pointer,
;// invariably three contiguous words are required from there to get the
;// nine bytes from the source pointer for filtering.
;//
;// Syntax:
;// M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
;//
;// Inputs:
;// $pSrc       4 byte aligned source pointer to an address just less than
;//             or equal to the data location
;//
;// $srcStep    The stride on source
;//
;// $offset     Difference of source data location to the source pointer
;//             Use when $offset != 0 (unaligned load)
;//
;// Outputs:
;// $pSrc       Incremented by $srcStep
;//
;// $word0, $word1, $word2, $word3
;//             Three of these are outputs based on the $offset parameter.
;//             The outputs are specifically generated to be processed by
;//             the M_EXT_XINT macro. Following is the illustration to show
;//             how the nine bytes are spanned for different offsets from
;//             notTruncatedForAlignmentSourcePointer.
;//
;//              ------------------------------------------------------
;//             | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
;//             |------------------------------------------------------|
;//             |    0   |       0     | 0123  | 4567  | 8xxx  |       |
;//             |    1   |      -1     | x012  | 3456  | 78xx  |       |
;//             |    2   |      -2     | xx01  | 2345  | 678x  |       |
;//             |    3   |      -3     | xxx0  |       | 1234  | 5678  |
;//              ------------------------------------------------------
;//
;//             where the numbering (0-8) designates the 9 bytes from the
;//             start of a particular row. The illustration does not take
;//             into account the positioning of bytes within the word, and
;//             the macro combination with M_EXT_XINT will work only in
;//             little endian environments.
;//
;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending
;// register numbering

    MACRO
    M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
        IF $offset /= 3
            ;// Offsets 0..2: three consecutive words into word0..word2
            LDM         $pSrc, {$word0, $word1, $word2}
        ELSE
            ;// Offset 3: the 2nd and 3rd words are loaded straight into
            ;// $word2/$word3; $word1 is not written by this macro
            LDM         $pSrc, {$word0, $word2, $word3}
        ENDIF
        ADD         $pSrc, $pSrc, $srcStep      ;// advance to the next row
    MEND

;// ***************************************************************************
;// Description:
;// Extract four registers of four pixels for X interpolation
;//
;// Syntax:
;// M_EXT_XINT $offset, $word0, $word1, $word2, $word3
;//
;// Inputs:
;// $offset     Difference of source data location to the source pointer
;//             Use when $offset != 0 (unaligned load)
;//
;// $word0, $word1, $word2, $word3
;//             Three of these are inputs based on the $offset parameter,
;//             laid out as produced by the M_LOAD_XINT macro
;//             (yyyy = not an input for that offset).
;//
;//              ------------------------------------------------------
;//             | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
;//             |------------------------------------------------------|
;//             |    0   |       0     | 0123  | 4567  | 8xxx  | yyyy  |
;//             |    1   |      -1     | x012  | 3456  | 78xx  | yyyy  |
;//             |    2   |      -2     | xx01  | 2345  | 678x  | yyyy  |
;//             |    3   |      -3     | xxx0  | yyyy  | 1234  | 5678  |
;//              ------------------------------------------------------
;//
;// Outputs:
;// $word0, $word1, $word2, $word3
;//             Bytes from the original source pointer (not truncated for
;//             4 byte alignment) as shown in the table.
;//              -------------------------------
;//             | word0 | word1 | word2 | word3 |
;//             |-------------------------------|
;//             | 0123  | 4567  | 1234  | 5678  |
;//              -------------------------------
;//
;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending
;// register numbering

    MACRO
    M_EXT_XINT $offset, $word0, $word1, $word2, $word3
        IF $offset = 0
            ;// $word0 and $word1 are ok
            ;// $word2, $word3 are just 8 shifted versions
            MOV         $word3, $word1, LSR #8              ;// 567.
            ORR         $word3, $word3, $word2, LSL #24     ;// 5678
            MOV         $word2, $word0, LSR #8              ;// 123.
            ORR         $word2, $word2, $word1, LSL #24     ;// 1234
        ELIF $offset = 3
            ;// $word2 and $word3 are ok (taken care while loading itself)
            ;// set $word0 & $word1
            MOV         $word0, $word0, LSR #24             ;// 0...
            ORR         $word0, $word0, $word2, LSL #8      ;// 0123
            MOV         $word1, $word2, LSR #24             ;// 4...
            ORR         $word1, $word1, $word3, LSL #8      ;// 4567
        ELSE
            ;// Offsets 1 and 2: first realign $word0/$word1 ...
            MOV         $word0, $word0, LSR #(8 * ($offset))
            ORR         $word0, $word0, $word1, LSL #(32 - 8 * ($offset))  ;// 0123
            MOV         $word1, $word1, LSR #(8 * ($offset))
            ORR         $word1, $word1, $word2, LSL #(32 - 8 * ($offset))  ;// 4567

            ;// ... then build the one-pixel-shifted copies. $word3 must be
            ;// formed first, while $word2 still holds the loaded word that
            ;// contains byte 8.
            MOV         $word3, $word1, LSR #8              ;// 567.
            ORR         $word3, $word3, $word2, LSL #(32 - 8 * (($offset)+1))  ;// 5678
            MOV         $word2, $word0, LSR #8              ;// 123.
            ORR         $word2, $word2, $word1, LSL #24     ;// 1234
        ENDIF
    MEND

;// ***************************************************************************
;// Description:
;// Computes half-sum and xor of two inputs and puts them in the input
;// registers in that order
;//
;// Syntax:
;// M_HSUM_XOR      $v0, $v1, $tmp
;//
;// Inputs:
;// $v0         a, first input
;// $v1         b, second input
;// $tmp        scratch register
;//
;// Outputs:
;// $v0         (a + b)/2   (per byte lane, truncating)
;// $v1         a ^ b

    MACRO
    M_HSUM_XOR      $v0, $v1, $tmp
        UHADD8      $tmp, $v0, $v1     ;// s0 = (a + b) >> 1, per byte
        EOR         $v1, $v0, $v1      ;// l0 = a ^ b
        MOV         $v0, $tmp          ;// s0
    MEND
;// ***************************************************************************
;// Description:
;// Calculates average of 4 values (a,b,c,d) for HalfPixelXY predict type in
;// mcReconBlock module. Very specific to the implementation of
;// M_MCRECONBLOCK_HalfPixelXY done here. Uses "tmp" as scratch register and
;// "yMask" for mask variable "0x1010101x" set in it. In yMask the 4 lsbs are
;// not significant and are used by the invoking function for the row
;// counter (y).
;//
;// Some points to note are:
;// 1. Input is pair of pair-averages and Xors
;// 2. $sum1 and $lsb1 are not modified and hence can be reused in another
;//    running average
;// 3. Output is in the first argument
;// 4. $lsb0 and the register named "tmp" are overwritten
;//
;// Syntax:
;// M_AVG4          $sum0, $lsb0, $sum1, $lsb1, $rndVal
;//
;// Inputs:
;// $sum0       (a + b) >> 1, where a and b are 1st and 2nd inputs to be averaged
;// $lsb0       (a ^ b)       Destroyed
;// $sum1       (c + d) >> 1. Not modified
;// $lsb1       (c ^ d)       Not modified
;// $rndVal     Assembler Variable. 0 for rounding, 1 for no rounding
;//
;// Outputs:
;// $sum0       (a + b + c + d + 1) / 4 : If no rounding
;//             (a + b + c + d + 2) / 4 : If rounding
;//
;// Method (per byte lane; only bit 0 of the e-terms survives the mask):
;//   s2  = (s0 + s1) >> 1
;//   rounding:     result = s2 + (((s0 ^ s1) | (e0 & e1)) & 1)
;//   no rounding:  result = s2 + (((s0 ^ s1) & (e0 | e1)) & 1)

    MACRO
    M_AVG4          $sum0, $lsb0, $sum1, $lsb1, $rndVal
        LCLS OP1
        LCLS OP2
        IF $rndVal = 0 ;// rounding case
OP1     SETS "AND"
OP2     SETS "ORR"
        ELSE           ;// Not rounding case
OP1     SETS "ORR"
OP2     SETS "AND"
        ENDIF

        LCLS lsb2
        LCLS sum2
        LCLS dest

lsb2    SETS "tmp"                              ;// scratch: correction bits
sum2    SETS "$lsb0"                            ;// $lsb0 is reused for s2
dest    SETS "$sum0"

        $OP1        $lsb0, $lsb0, $lsb1         ;// e0 = e0 OP1 e1
        EOR         $lsb2, $sum0, $sum1         ;// e2 = s0 ^ s1
        $OP2        $lsb2, $lsb2, $lsb0         ;// e2 = e2 OP2 e0
        AND         $lsb2, $lsb2, yMask, LSR #4 ;// e2 = e2 & 0x01010101
        UHADD8      $sum2, $sum0, $sum1         ;// s2 = (s0 + s1)/2
        UADD8       $dest, $sum2, $lsb2         ;// dest =  s2 + e2
    MEND
;// ***************************************************************************
;// Motion compensation handler macros
;// ***************************************************************************
;// Description:
;// Implement motion compensation routines using the named registers of the
;// invoking function. Each of the following 4 macros implements one of the
;// 4 predict types. Each handles 8 cases, i.e. all the combinations of 4
;// types of source alignment offsets and 2 types of rounding flag.
;//
;// Syntax:
;// M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
;// M_MCRECONBLOCK_HalfPixelX   $rndVal, $offset
;// M_MCRECONBLOCK_HalfPixelY   $rndVal, $offset
;// M_MCRECONBLOCK_HalfPixelXY  $rndVal, $offset
;//
;// Inputs:
;// $rndVal     Assembler Variable. 0 for rounding, 1 for no rounding
;// $offset     $pSrc MOD 4 value. Offset from 4 byte aligned location.
;//
;// Outputs:
;// Outputs come in the named registers of the invoking function.
;// The macro loads the data from the source pointer, processes it and
;// stores it through the destination pointer. Does the whole prediction
;// cycle of the Motion Compensation routine for a particular predictType.
;// After this only residue addition to the predicted values remains.

    MACRO
    M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
    ;// Algorithmic Description:
    ;// This handles motion compensation for IntegerPixel predictType. Both
    ;// rounding cases are handled by the same code base. It is just a copy
    ;// from source to destination. Two lines are done per loop to reduce
    ;// stalls. Loop has been software pipelined as well for that purpose.
    ;//
    ;// M_LOAD_X loads a whole row in two registers and then they are stored
    ;//
    ;// Both the Rnd0 and Rnd1 case labels are defined here, so $rndVal is
    ;// not used in the body.
    ;// NOTE(review): because the loads for the next pass are issued before
    ;// the loop test, the final pass still reads two rows beyond the last
    ;// rows stored; the values are discarded.

CaseIntegerPixelRnd0Offset$offset
CaseIntegerPixelRnd1Offset$offset
    ;// Prologue: fetch rows 0 and 1 (tmp3 is scratch for the first load and
    ;// is then overwritten by the second)
    M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp3, $offset
    M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
YloopIntegerPixelOffset$offset
    SUBS        y, y, #2                        ;// two rows per pass
    STRD        tmp1, tmp2, [pDst], dstStep
    STRD        tmp3, tmp4, [pDst], dstStep
    ;// M_LOAD_X uses no flag-setting instructions, so the SUBS result is
    ;// still valid at the BGT below
    M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp3, $offset
    M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
    BGT         YloopIntegerPixelOffset$offset

    B           SwitchPredictTypeEnd
    MEND
;// ***************************************************************************
    MACRO
    M_MCRECONBLOCK_HalfPixelX $rndVal, $offset
    ;// Algorithmic Description:
    ;// This handles motion compensation for HalfPixelX predictType. The two
    ;// rounding cases are handled by different code bases and spanned by
    ;// different macro calls. Loop has been software pipelined to reduce
    ;// stalls.
    ;//
    ;// Filtering involves averaging a pixel with the next horizontal pixel.
    ;// M_LOAD_XINT and M_EXT_XINT combination generate 4 registers, 2 with
    ;// all pixels in a row with 4 pixel in each register and another 2
    ;// registers with pixels corresponding to one horizontally shifted pixel
    ;// corresponding to the initial row pixels. These are set of packed
    ;// registers appropriate to do 4 lane SIMD.
    ;// After that M_UHADD8R macro does the averaging taking care of the
    ;// rounding as required
    ;//
    ;// NOTE(review): the load for the next row is issued before the loop
    ;// test, so the final pass still reads one row beyond the last row
    ;// stored; the values are discarded.

CaseHalfPixelXRnd$rndVal.Offset$offset
    IF $rndVal = 0
        ;// Only the rounding variant of M_UHADD8R references the mask
        LDR         mask, =0x80808080
    ENDIF

    M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4
YloopHalfPixelXRnd$rndVal.Offset$offset
    SUBS        y, y, #1
    M_EXT_XINT  $offset, tmp1, tmp2, tmp3, tmp4
    ;// tmp1:tmp2 = pixels 0..7, tmp3:tmp4 = pixels 1..8; $rndVal = 0 means
    ;// "round", hence the (1-$rndVal) passed as M_UHADD8R's $round
    M_UHADD8R   tmp5, tmp1, tmp3, (1-$rndVal), mask
    M_UHADD8R   tmp6, tmp2, tmp4, (1-$rndVal), mask
    STRD        tmp5, tmp6, [pDst], dstStep
    M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4
    BGT         YloopHalfPixelXRnd$rndVal.Offset$offset

    B           SwitchPredictTypeEnd
    MEND
;// ***************************************************************************
    MACRO
    M_MCRECONBLOCK_HalfPixelY $rndVal, $offset
4110c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// Algorithmic Description: 4120c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// This handles motion compensation for HalfPixelY predictType. The two 4130c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// rounding cases are handled by the different code base and spanned by 4140c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// different macro calls. PreLoading is used to avoid reload of same data. 4150c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// 4160c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// Filtering involves averaging a pixel with the next vertical pixel. 4170c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// M_LOAD_X generates 2 registers with all pixels in a row with 4 pixel in 4180c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// each register. These are set of packed registers appropriate to do 4190c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// 4 lane SIMD. After that M_UHADD8R macro does the averaging taking care 4200c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// of the rounding as required 4210c1bc742181ded4930842b46e9507372f0b1b963James Dong 4220c1bc742181ded4930842b46e9507372f0b1b963James DongCaseHalfPixelYRnd$rndVal.Offset$offset 4230c1bc742181ded4930842b46e9507372f0b1b963James Dong IF $rndVal = 0 4240c1bc742181ded4930842b46e9507372f0b1b963James Dong LDR mask, =0x80808080 4250c1bc742181ded4930842b46e9507372f0b1b963James Dong ENDIF 4260c1bc742181ded4930842b46e9507372f0b1b963James Dong 4270c1bc742181ded4930842b46e9507372f0b1b963James Dong M_LOAD_X pSrc, srcStep, tmp1, tmp2, tmp5, $offset ;// Pre-load 4280c1bc742181ded4930842b46e9507372f0b1b963James DongYloopHalfPixelYRnd$rndVal.Offset$offset 4290c1bc742181ded4930842b46e9507372f0b1b963James Dong SUBS y, y, #2 4300c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// Processing one line 4310c1bc742181ded4930842b46e9507372f0b1b963James Dong M_LOAD_X pSrc, srcStep, tmp3, tmp4, tmp5, $offset 
4320c1bc742181ded4930842b46e9507372f0b1b963James Dong M_UHADD8R tmp1, tmp1, tmp3, (1-$rndVal), mask 4330c1bc742181ded4930842b46e9507372f0b1b963James Dong M_UHADD8R tmp2, tmp2, tmp4, (1-$rndVal), mask 4340c1bc742181ded4930842b46e9507372f0b1b963James Dong STRD tmp1, tmp2, [pDst], dstStep 4350c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// Processing another line 4360c1bc742181ded4930842b46e9507372f0b1b963James Dong M_LOAD_X pSrc, srcStep, tmp1, tmp2, tmp5, $offset 4370c1bc742181ded4930842b46e9507372f0b1b963James Dong M_UHADD8R tmp3, tmp3, tmp1, (1-$rndVal), mask 4380c1bc742181ded4930842b46e9507372f0b1b963James Dong M_UHADD8R tmp4, tmp4, tmp2, (1-$rndVal), mask 4390c1bc742181ded4930842b46e9507372f0b1b963James Dong STRD tmp3, tmp4, [pDst], dstStep 4400c1bc742181ded4930842b46e9507372f0b1b963James Dong 4410c1bc742181ded4930842b46e9507372f0b1b963James Dong BGT YloopHalfPixelYRnd$rndVal.Offset$offset 4420c1bc742181ded4930842b46e9507372f0b1b963James Dong 4430c1bc742181ded4930842b46e9507372f0b1b963James Dong B SwitchPredictTypeEnd 4440c1bc742181ded4930842b46e9507372f0b1b963James Dong MEND 4450c1bc742181ded4930842b46e9507372f0b1b963James Dong;// *************************************************************************** 4460c1bc742181ded4930842b46e9507372f0b1b963James Dong MACRO 4470c1bc742181ded4930842b46e9507372f0b1b963James Dong M_MCRECONBLOCK_HalfPixelXY $rndVal, $offset 4480c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// Algorithmic Description: 4490c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// This handles motion compensation for HalfPixelXY predictType. The two 4500c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// rounding cases are handled by the different code base and spanned by 4510c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// different macro calls. PreLoading is used to avoid reload of same data. 
4520c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// 4530c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// Filtering involves averaging a pixel with the next vertical, horizontal 4540c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// and right-down diagonal pixels. Just as in HalfPixelX case, M_LOAD_XINT 4550c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// and M_EXT_XINT combination generates 4 registers with a row and its 4560c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// 1 pixel right shifted version, with 4 pixels in one register. Another 4570c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// call of that macro-combination gets another row. Then M_HSUM_XOR is 4580c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// called to get mutual half-sum and xor combinations of a row with its 4590c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// shifted version as they are inputs to the M_AVG4 macro which computes 4600c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// the 4 element average with rounding. Note that it is the half-sum/xor 4610c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// values that are preserved for next row as they can be re-used in the 4620c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// next call to the M_AVG4 and saves recomputation. 
4630c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// Due to lack of register, the row counter and a masking value required 4640c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// in M_AVG4 are packed into a single register yMask where the last nibble 4650c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// holds the row counter values and rest holds the masking variable left 4660c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// shifted by 4 4670c1bc742181ded4930842b46e9507372f0b1b963James Dong 4680c1bc742181ded4930842b46e9507372f0b1b963James DongCaseHalfPixelXYRnd$rndVal.Offset$offset 4690c1bc742181ded4930842b46e9507372f0b1b963James Dong LDR yMask, =((0x01010101 << 4) + 8) 4700c1bc742181ded4930842b46e9507372f0b1b963James Dong 4710c1bc742181ded4930842b46e9507372f0b1b963James Dong M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b' 4720c1bc742181ded4930842b46e9507372f0b1b963James Dong M_EXT_XINT $offset, t00, t01, t10, t11 4730c1bc742181ded4930842b46e9507372f0b1b963James Dong M_HSUM_XOR t00, t10, tmp ;// s0, l0 4740c1bc742181ded4930842b46e9507372f0b1b963James Dong M_HSUM_XOR t01, t11, tmp ;// s0', l0' 4750c1bc742181ded4930842b46e9507372f0b1b963James Dong 4760c1bc742181ded4930842b46e9507372f0b1b963James DongYloopHalfPixelXYRnd$rndVal.Offset$offset 4770c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// Processsing one line 4780c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// t00, t01, t10, t11 required from previous loop 4790c1bc742181ded4930842b46e9507372f0b1b963James Dong M_LOAD_XINT pSrc, srcStep, $offset, t20, t21, t30, t31 ;// Load c, c', d, d' 4800c1bc742181ded4930842b46e9507372f0b1b963James Dong SUB yMask, yMask, #2 4810c1bc742181ded4930842b46e9507372f0b1b963James Dong M_EXT_XINT $offset, t20, t21, t30, t31 4820c1bc742181ded4930842b46e9507372f0b1b963James Dong M_HSUM_XOR t20, t30, tmp ;// s1, l1 4830c1bc742181ded4930842b46e9507372f0b1b963James Dong M_HSUM_XOR t21, t31, tmp ;// s1', l1' 
4840c1bc742181ded4930842b46e9507372f0b1b963James Dong M_AVG4 t00, t10, t20, t30, $rndVal ;// s0, l0, s1, l1 4850c1bc742181ded4930842b46e9507372f0b1b963James Dong M_AVG4 t01, t11, t21, t31, $rndVal ;// s0', l0', s1', l1' 4860c1bc742181ded4930842b46e9507372f0b1b963James Dong STRD t00, t01, [pDst], dstStep ;// store the average 4870c1bc742181ded4930842b46e9507372f0b1b963James Dong 4880c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// Processsing another line 4890c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// t20, t21, t30, t31 required from above 4900c1bc742181ded4930842b46e9507372f0b1b963James Dong M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b' 4910c1bc742181ded4930842b46e9507372f0b1b963James Dong TST yMask, #7 4920c1bc742181ded4930842b46e9507372f0b1b963James Dong M_EXT_XINT $offset, t00, t01, t10, t11 4930c1bc742181ded4930842b46e9507372f0b1b963James Dong M_HSUM_XOR t00, t10, tmp 4940c1bc742181ded4930842b46e9507372f0b1b963James Dong M_HSUM_XOR t01, t11, tmp 4950c1bc742181ded4930842b46e9507372f0b1b963James Dong M_AVG4 t20, t30, t00, t10, $rndVal 4960c1bc742181ded4930842b46e9507372f0b1b963James Dong M_AVG4 t21, t31, t01, t11, $rndVal 4970c1bc742181ded4930842b46e9507372f0b1b963James Dong STRD t20, t21, [pDst], dstStep 4980c1bc742181ded4930842b46e9507372f0b1b963James Dong 4990c1bc742181ded4930842b46e9507372f0b1b963James Dong BGT YloopHalfPixelXYRnd$rndVal.Offset$offset 5000c1bc742181ded4930842b46e9507372f0b1b963James Dong 5010c1bc742181ded4930842b46e9507372f0b1b963James Dong IF $offset/=3 :LOR: $rndVal/=1 5020c1bc742181ded4930842b46e9507372f0b1b963James Dong B SwitchPredictTypeEnd 5030c1bc742181ded4930842b46e9507372f0b1b963James Dong ENDIF 5040c1bc742181ded4930842b46e9507372f0b1b963James Dong MEND 5050c1bc742181ded4930842b46e9507372f0b1b963James Dong;// *************************************************************************** 5060c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Motion compensation handler macros end here 
;// ***************************************************************************
    ;// Description:
    ;// Emits the four source-offset "case" entries (Offset0 .. Offset3) of the
    ;// prediction "switch" for one predictType / rounding combination.
    ;//
    ;// Syntax:
    ;// M_CASE_OFFSET $rnd, $predictType
    ;//
    ;// Inputs:
    ;// $rnd          0 for rounding, 1 for no rounding
    ;// $predictType  The prediction mode
    ;//
    ;// Outputs:
    ;// Four "M_CASE" entries for the "M_SWITCH" macro, naming the labels
    ;// Case<predictType>Rnd<rnd>Offset0 .. Case<predictType>Rnd<rnd>Offset3

    MACRO
    M_CASE_OFFSET $rnd, $predictType
        M_CASE  Case$predictType.Rnd$rnd.Offset0
        M_CASE  Case$predictType.Rnd$rnd.Offset1
        M_CASE  Case$predictType.Rnd$rnd.Offset2
        M_CASE  Case$predictType.Rnd$rnd.Offset3
    MEND
;// ***************************************************************************
5300c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// Description: 5310c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// Populates all 2 kinds of rounding "cases" for each predictType in the 5320c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// "switch" to prediction processing code segment 5330c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// 5340c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// Syntax: 5350c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// M_CASE_OFFSET $predictType 5360c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// 5370c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// Inputs: 5380c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// $predictType The prediction mode 5390c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// 5400c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// Outputs: 5410c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// Populated list of "M_CASE_OFFSET" macros 5420c1bc742181ded4930842b46e9507372f0b1b963James Dong 5430c1bc742181ded4930842b46e9507372f0b1b963James Dong MACRO 5440c1bc742181ded4930842b46e9507372f0b1b963James Dong M_CASE_MCRECONBLOCK $predictType 5450c1bc742181ded4930842b46e9507372f0b1b963James Dong M_CASE_OFFSET 0, $predictType ;// 0 for rounding 5460c1bc742181ded4930842b46e9507372f0b1b963James Dong M_CASE_OFFSET 1, $predictType ;// 1 for no rounding 5470c1bc742181ded4930842b46e9507372f0b1b963James Dong MEND 5480c1bc742181ded4930842b46e9507372f0b1b963James Dong;// *************************************************************************** 5490c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// Description: 5500c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// Populates all 8 kinds of rounding and offset combinations handling macros 5510c1bc742181ded4930842b46e9507372f0b1b963James Dong ;// for the specified predictType. 
;// In case of "IntegerPixel" predictType,
    ;// rounding is not required so same code segment handles both cases
    ;//
    ;// Syntax:
    ;// M_MCRECONBLOCK $predictType
    ;//
    ;// Inputs:
    ;// $predictType The prediction mode
    ;//
    ;// Outputs:
    ;// Populated list of "M_MCRECONBLOCK_<predictType>" macros for specified
    ;// predictType. Each
    ;// M_MCRECONBLOCK_<predictType> $rnd, $offset
    ;// is a code segment (starting with a label indicating the predictType,
    ;// rounding and offset combination)
    ;// Four calls of this macro with the 4 prediction modes populate all 32
    ;// case labels (IntegerPixel expands only its four rounding-0 instances,
    ;// which is why its conditional block below is skipped)

    MACRO
    M_MCRECONBLOCK $predictType
        ;// Rounding value 0, source offsets 0..3
        M_MCRECONBLOCK_$predictType 0, 0
        M_MCRECONBLOCK_$predictType 0, 1
        M_MCRECONBLOCK_$predictType 0, 2
        M_MCRECONBLOCK_$predictType 0, 3
    IF "$predictType" /= "IntegerPixel" ;// If not IntegerPixel then rounding makes a difference
        ;// Rounding value 1, source offsets 0..3
        M_MCRECONBLOCK_$predictType 1, 0
        M_MCRECONBLOCK_$predictType 1, 1
        M_MCRECONBLOCK_$predictType 1, 2
        M_MCRECONBLOCK_$predictType 1, 3
    ENDIF
    MEND
;// ***************************************************************************
;// Input/Output Registers
pSrc                RN 0
srcStep             RN 1
arg_pSrcResidue     RN 2
pSrcResidue         RN 12
pDst                RN 3
dstStep             RN 2
predictType         RN 10
rndVal              RN 11
mask                RN 11

;// Local Scratch Registers
zero                RN 12
y                   RN 14

tmp1                RN 4
tmp2                RN 5
tmp3                RN 6
tmp4                RN 7
tmp5                RN 8
tmp6                RN 9
tmp7                RN 10
tmp8                RN 11
tmp9                RN 12

;// Register names used by the HalfPixelXY handler (they alias tmp1..tmp9)
t00                 RN 4
t01                 RN 5
t10                 RN 6
t11                 RN 7
t20                 RN 8
t21                 RN 9
t30                 RN 10
t31                 RN 11
tmp                 RN 12

yMask               RN 14

dst                 RN 1
return              RN 0

    ;// Stack slots for the two register arguments that get overwritten
    M_ALLOC4    Stk_pDst, 4
    M_ALLOC4    Stk_pSrcResidue, 4
    ;// Function header
    M_START     omxVCM4P2_MCReconBlock, r11
    ;// Arguments passed on the stack
    M_ARG       Arg_dstStep, 4
    M_ARG       Arg_predictType, 4
    M_ARG       Arg_rndVal, 4
    ;// pDst is advanced by the prediction loops and arg_pSrcResidue shares
    ;// its register with dstStep, so both are saved before being clobbered
    M_STR       pDst, Stk_pDst
    M_STR       arg_pSrcResidue, Stk_pSrcResidue
    ;// Fetch the stack arguments
    M_LDR       dstStep, Arg_dstStep
    M_LDR       predictType, Arg_predictType
    M_LDR       rndVal, Arg_rndVal

    MOV         y, #8                               ;// row counter for the handlers

    ;// Build the switch index:
    ;//   (predictType << 3) | (rndVal << 2) | (pSrc & 3)
    ;// which matches the order in which the cases are listed below
    AND         tmp1, pSrc, #3
    ORR         predictType, tmp1, predictType, LSL #3
    ORR         predictType, predictType, rndVal, LSL #2
    ;// Truncating source pointer to align to 4 byte location
    BIC         pSrc, pSrc, #3

    ;// Every combination of predictType, rounding case and source pointer
    ;// offset (relative to 4 byte alignment) gets its own code base, except
    ;// where one of these parameters makes no difference to the
    ;// implementation. Each M_CASE_MCRECONBLOCK below expands into 8 M_CASE
    ;// entries: 2 rounding cases times 4 offsets of pSrc from 4 byte
    ;// alignment.
    M_SWITCH    predictType
        M_CASE_MCRECONBLOCK IntegerPixel
        M_CASE_MCRECONBLOCK HalfPixelX
        M_CASE_MCRECONBLOCK HalfPixelY
        M_CASE_MCRECONBLOCK HalfPixelXY
    M_ENDSWITCH

    ;// The M_MCRECONBLOCK macros generate the code bases by expanding all 8
    ;// handler instances (4 in case of IntegerPixel as rounding makes no
    ;// difference there) covering every rounding and offset case. LTORG is
    ;// used to segment the code as code size bloated beyond 4KB.
    M_MCRECONBLOCK IntegerPixel
    M_MCRECONBLOCK HalfPixelX
    LTORG
    M_MCRECONBLOCK HalfPixelY
    M_MCRECONBLOCK HalfPixelXY
SwitchPredictTypeEnd

    ;// Residue Addition
    ;// Skipped entirely when the residue pointer is NULL. Otherwise each of
    ;// the 8 rows is handled as two groups of 4 pixels: 4 destination bytes
    ;// are loaded with a single LDR, 4 residues with one LDMIA, and the
    ;// addition itself is done in 2 lane SIMD.
    M_LDR       pSrcResidue, Stk_pSrcResidue
    CMP         pSrcResidue, #0
    BEQ         pSrcResidueConditionEnd
pSrcResidueNotNull
    M_LDR       pDst, Stk_pDst
    MOV         y, #8
    SUB         dstStep, dstStep, #4                ;// first STR of a row already advances pDst by 4
Yloop_pSrcResidueNotNull
    SUBS        y, y, #1                            ;// flags consumed by BGT at the loop end
    ;// Pixels 0..3 of the row
    LDR         dst, [pDst]                         ;// dst = [dcba]
    LDMIA       pSrcResidue!, {tmp1, tmp2}          ;// tmp1=[BA] tmp2=[DC]
    PKHBT       tmp3, tmp1, tmp2, LSL #16           ;// Deltaval1 = [C A]
    PKHTB       tmp4, tmp2, tmp1, ASR #16           ;// DeltaVal2 = [D B]
    UXTB16      tmp1, dst                           ;// tmp1 = [0c0a]
    UXTB16      tmp2, dst, ROR #8                   ;// tmp2 = [0d0b]
    QADD16      tmp1, tmp1, tmp3                    ;// Add and saturate to 16 bits
    QADD16      tmp2, tmp2, tmp4
    USAT16      tmp1, #8, tmp1                      ;// armClip(0, 255, tmp1)
    USAT16      tmp2, #8, tmp2                      ;// armClip(0, 255, tmp2)
    ORR         tmp1, tmp1, tmp2, LSL #8            ;// tmp1 = [dcba]
    STR         tmp1, [pDst], #4

    ;// Pixels 4..7 of the row, same sequence as above
    LDR         dst, [pDst]
    LDMIA       pSrcResidue!, {tmp1, tmp2}
    PKHBT       tmp3, tmp1, tmp2, LSL #16
    PKHTB       tmp4, tmp2, tmp1, ASR #16
    UXTB16      tmp1, dst
    UXTB16      tmp2, dst, ROR #8
    QADD16      tmp1, tmp1, tmp3
    QADD16      tmp2, tmp2, tmp4
    USAT16      tmp1, #8, tmp1
    USAT16      tmp2, #8, tmp2
    ORR         tmp1, tmp1, tmp2, LSL #8
    STR         tmp1, [pDst], dstStep               ;// step to the start of the next row

    BGT         Yloop_pSrcResidueNotNull
pSrcResidueConditionEnd

    MOV         return, #OMX_Sts_NoErr

    M_END
    ENDIF ;// ARM1136JS

;// ***************************************************************************
;// CortexA8 implementation
;// ***************************************************************************
    END
;// ***************************************************************************
;// omxVCM4P2_MCReconBlock ends
;// ***************************************************************************