;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P2_MCReconBlock_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;//
;//
;//
;// Description:
;//
;//

;// Include standard headers
    INCLUDE omxtypes_s.h
    INCLUDE armCOMM_s.h

;// Import symbols required from other files

;// Declare the CPU variant this file provides an implementation for.
;// NOTE(review): M_VARIANTS is not defined in this file (presumably it comes
;// from armCOMM_s.h and defines the ARM1136JS condition tested below) -
;// confirm against that header.
    M_VARIANTS ARM1136JS

;// ***************************************************************************
;// ARM1136JS implementation
;// ***************************************************************************
;// Everything that follows is assembled only when ARM1136JS is true. The
;// matching ENDIF lies beyond the part of the file reviewed here.
    IF  ARM1136JS

;// ***************************************************************************
;// MACRO DEFINITIONS
;// ***************************************************************************
    ;// Description:
    ;// Byte-wise halving add of two packed words with optional rounding:
    ;//
    ;//   dest[j] = (x[j] + y[j] + round) >> 1,   j=0..3
    ;//
    ;// Similar to the UHADD8 instruction, but with a rounding value of 1 added
    ;// to each sum before dividing by two, if round is 1.
    ;//
    ;// How the rounding variant works (per byte):
    ;//   y - (NOT x) = y - (255 - x) = (x + y + 1) - 256
    ;// UHSUB8 halves that difference, giving ((x + y + 1) >> 1) - 128 modulo
    ;// 256, and the EOR with 0x80 puts the 128 back.
    ;//
    ;// Syntax:
    ;// M_UHADD8R   $dest, $x, $y, $round, $mask
    ;//
    ;// Inputs:
    ;// $x        four packed bytes,   x[3] :  x[2]  :  x[1]  :  x[0]
    ;// $y        four packed bytes,   y[3] :  y[2]  :  y[1]  :  y[0]
    ;// $round    assembly-time value: 0 if no rounding to be added, 1 if
    ;//           rounding to be done
    ;// $mask     some register set to 0x80808080 (only read when $round is 1)
    ;//
    ;// Outputs:
    ;// $dest     four packed bytes,   z[3] :  z[2]  :  z[1]  :  z[0]
    ;//
    ;// Note: $dest may be the same register as $x or as $y. With $round = 1
    ;// it must not be the same register as both of them at once, because the
    ;// MVN overwrites the only copy of the inputs; it must not be $mask either.

    MACRO
    M_UHADD8R   $dest, $x, $y, $round, $mask
    IF $round /= 1
        ;// Truncating average: the hardware instruction does it directly
        UHADD8      $dest, $x, $y
    ELSE
        ;// Complement whichever input $dest is allowed to overwrite first,
        ;// then subtract the complement from the other input and halve
        IF  $dest /= $y
            MVN         $dest, $x
            UHSUB8      $dest, $y, $dest
        ELSE
            MVN         $dest, $y
            UHSUB8      $dest, $x, $dest
        ENDIF
        ;// Restore the 128 bias removed by the subtraction in every byte
        EOR         $dest, $dest, $mask
    ENDIF
    MEND
;// ***************************************************************************
    ;// Description:
    ;// Load 8 bytes from $pSrc (aligned or unaligned locations) and advance
    ;// $pSrc to the next row.
    ;//
    ;// Syntax:
    ;// M_LOAD_X    $pSrc, $srcStep, $out0, $out1, $scratch, $offset
    ;//
    ;// Inputs:
    ;// $pSrc       4 byte aligned source pointer to an address just less than
    ;//             or equal to the data location
    ;// $srcStep    The stride on source
    ;// $scratch    A scratch register, used internally for temp calculations
    ;// $offset     Assembly-time value: difference of source data location to
    ;//             the source pointer, in bytes. Non-zero selects the
    ;//             unaligned path, whose shift amounts are valid for 1..3
    ;//
    ;// Outputs:
    ;// $pSrc       Incremented by $srcStep
    ;// $out0       four packed bytes,   z[3] :  z[2]  :  z[1]  :  z[0]
    ;// $out1       four packed bytes,   z[7] :  z[6]  :  z[5]  :  z[4]
    ;//
    ;// Note: {$out0, $out1, $scratch} should be registers with ascending
    ;// register numbering. In case offset is 0, $scratch is not modified.
    ;// In the unaligned case three words (12 bytes) are read from $pSrc and
    ;// $scratch is overwritten with the third one.

    MACRO
    M_LOAD_X    $pSrc, $srcStep, $out0, $out1, $scratch, $offset
        ;// Fetch the aligned words that cover the eight wanted bytes
        IF $offset = 0
            LDM         $pSrc, {$out0, $out1}
        ELSE
            LDM         $pSrc, {$out0, $out1, $scratch}
        ENDIF

        ;// Step to the next row
        ADD         $pSrc, $pSrc, $srcStep

        ;// Unaligned source: drop the $offset leading bytes of each word and
        ;// pull the missing bytes in from the following word
        IF $offset /= 0
            MOV         $out0, $out0, LSR #8 * $offset
            ORR         $out0, $out0, $out1, LSL #(32 - 8 * ($offset))
            MOV         $out1, $out1, LSR #8 * $offset
            ORR         $out1, $out1, $scratch, LSL #(32 - 8 * ($offset))
        ENDIF
    MEND

;// ***************************************************************************
    ;// Description:
    ;// Loads three words for X interpolation and updates the pointer to the
    ;// next row. For X interpolation, given a source pointer truncated to 4
    ;// byte alignment, three contiguous words are always required from there
    ;// to get the nine bytes from the source pointer for filtering.
    ;//
    ;// Syntax:
    ;// M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
    ;//
    ;// Inputs:
    ;// $pSrc       4 byte aligned source pointer to an address just less than
    ;//             or equal to the data location
    ;//
    ;// $srcStep    The stride on source
    ;//
    ;// $offset     Assembly-time value: difference of source data location to
    ;//             the source pointer, in bytes
    ;//
    ;// Outputs:
    ;// $pSrc       Incremented by $srcStep
    ;//
    ;// $word0, $word1, $word2, $word3
    ;//             Three of these are outputs based on the $offset parameter.
    ;//             The outputs are specifically generated to be processed by
    ;//             the M_EXT_XINT macro. The following illustration shows
    ;//             how the nine bytes are spread for the different offsets of
    ;//             the original (not truncated for alignment) source pointer.
    ;//
    ;//              ------------------------------------------------------
    ;//             | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
    ;//             |------------------------------------------------------|
    ;//             |    0   |       0     | 0123  | 4567  | 8xxx  |       |
    ;//             |    1   |      -1     | x012  | 3456  | 78xx  |       |
    ;//             |    2   |      -2     | xx01  | 2345  | 678x  |       |
    ;//             |    3   |      -3     | xxx0  |       | 1234  | 5678  |
    ;//              ------------------------------------------------------
    ;//
    ;//             where the numbering (0-8) designates the 9 bytes from the
    ;//             start of a particular row. The illustration does not take
    ;//             into account the positioning of bytes within the word, and
    ;//             the combination of this macro with M_EXT_XINT will work
    ;//             only in little endian environments.
    ;//
    ;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending
    ;// register numbering

    MACRO
    M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
        IF $offset = 3
            ;// Second and third words go straight into word2/word3, where
            ;// they already are the "1234" / "5678" outputs of M_EXT_XINT
            LDM         $pSrc, {$word0, $word2, $word3}
        ELSE
            LDM         $pSrc, {$word0, $word1, $word2}
        ENDIF
        ;// Step to the next row
        ADD         $pSrc, $pSrc, $srcStep
    MEND

;// ***************************************************************************
    ;// Description:
    ;// Extract four registers of four pixels for X interpolation
    ;//
    ;// Syntax:
    ;// M_EXT_XINT $offset, $word0, $word1, $word2, $word3
    ;//
    ;// Inputs:
    ;// $offset     Assembly-time value: difference of source data location to
    ;//             the source pointer, in bytes
    ;//
    ;// $word0, $word1, $word2, $word3
    ;//             Three of these are inputs based on the $offset parameter,
    ;//             laid out as M_LOAD_XINT leaves them ("yyyy" marks the
    ;//             register that is not an input and gets overwritten).
    ;//
    ;//              ------------------------------------------------------
    ;//             | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
    ;//             |------------------------------------------------------|
    ;//             |    0   |       0     | 0123  | 4567  | 8xxx  | yyyy  |
    ;//             |    1   |      -1     | x012  | 3456  | 78xx  | yyyy  |
    ;//             |    2   |      -2     | xx01  | 2345  | 678x  | yyyy  |
    ;//             |    3   |      -3     | xxx0  | yyyy  | 1234  | 5678  |
    ;//              ------------------------------------------------------
    ;//
    ;// Outputs:
    ;// $word0, $word1, $word2, $word3
    ;//             Bytes from the original source pointer (not truncated for
    ;//             4 byte alignment) as shown in the table.
    ;//              -------------------------------
    ;//             | word0 | word1 | word2 | word3 |
    ;//             |-------------------------------|
    ;//             | 0123  | 4567  | 1234  | 5678  |
    ;//              -------------------------------
    ;//
    ;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending
    ;// register numbering

    MACRO
    M_EXT_XINT $offset, $word0, $word1, $word2, $word3
        IF $offset = 3
            ;// $word2 and $word3 are ok (taken care of while loading);
            ;// build $word0 and $word1 from the top byte of the word before
            MOV         $word0, $word0, LSR #24
            ORR         $word0, $word0, $word2, LSL #8
            MOV         $word1, $word2, LSR #24
            ORR         $word1, $word1, $word3, LSL #8
        ELSE
            IF $offset /= 0
                ;// Offsets 1 and 2: realign to get "0123" and "4567".
                ;// For offset 0 $word0 and $word1 are already ok.
                MOV         $word0, $word0, LSR #8 * $offset
                ORR         $word0, $word0, $word1, LSL #(32 - 8 * ($offset))
                MOV         $word1, $word1, LSR #8 * $offset
                ORR         $word1, $word1, $word2, LSL #(32 - 8 * ($offset))
            ENDIF

            ;// $word3 = "5678": "4567" moved down a byte, byte 8 taken from
            ;// the still untouched $word2 (the shift is 24 for offset 0)
            MOV         $word3, $word1, LSR #8
            ORR         $word3, $word3, $word2, LSL #(32 - 8 * (($offset)+1))
            ;// $word2 = "1234": "0123" moved down a byte, byte 4 from "4567"
            MOV         $word2, $word0, LSR #8
            ORR         $word2, $word2, $word1, LSL #24
        ENDIF
    MEND

;// ***************************************************************************
    ;// Description:
    ;// Computes the byte-wise half-sum and the xor of two inputs and puts
    ;// them in the input registers in that order
    ;//
    ;// Syntax:
    ;// M_HSUM_XOR      $v0, $v1, $tmp
    ;//
    ;// Inputs:
    ;// $v0         a, first input
    ;// $v1         b, second input
    ;// $tmp        scratch register
    ;//
    ;// Outputs:
    ;// $v0         (a + b)/2
    ;// $v1         a ^ b
    ;// $tmp        overwritten (left holding the same value as $v0)

    MACRO
    M_HSUM_XOR      $v0, $v1, $tmp
        ;// Both results need both original inputs, so the half-sum is
        ;// parked in $tmp until the xor has been taken
        UHADD8      $tmp, $v0, $v1     ;// tmp = (a + b)/2
        EOR         $v1, $v0, $v1      ;// v1  = a ^ b
        MOV         $v0, $tmp          ;// v0  = (a + b)/2
    MEND
;// ***************************************************************************
    ;// Description:
    ;// Calculates average of 4 values (a,b,c,d) for HalfPixelXY predict type in
    ;// mcReconBlock module. Very specific to the implementation of
    ;// M_MCRECONBLOCK_HalfPixelXY done here. Uses "tmp" as scratch register and
    ;// "yMask" for mask variable "0x1010101x" set in it. In yMask 4 lsbs are
    ;// not significant and are used by the callee for row counter (y)
    ;//
    ;// Both "tmp" and "yMask" are referenced by name rather than passed as
    ;// arguments, so they must be defined where the macro is used.
    ;//
    ;// Some points to note are:
    ;// 1. Input is pair of pair-averages and Xors
    ;// 2. $sum1 and $lsb1 are not modified and hence can be reused in another
    ;//    running average
    ;// 3. Output is in the first argument
    ;// 4. $lsb0 and tmp are overwritten
    ;//
    ;// How it works (per byte, with s0, s1 the pair averages and e0, e1 the
    ;// pair xors, only bit 0 of each xor being of interest):
    ;//   a + b + c + d = 4*((s0 + s1) >> 1) + 2*((s0 ^ s1) & 1)
    ;//                   + (e0 & 1) + (e1 & 1)
    ;// so the result is ((s0 + s1) >> 1) plus a one bit correction:
    ;//   rounding    (+2): (s0 ^ s1) | (e0 & e1)
    ;//   no rounding (+1): (s0 ^ s1) & (e0 | e1)
    ;//
    ;// Syntax:
    ;// M_AVG4         $sum0, $lsb0, $sum1, $lsb1, $rndVal
    ;//
    ;// Inputs:
    ;// $sum0       (a + b) >> 1, where a and b are 1st and 2nd inputs to be averaged
    ;// $lsb0       (a ^ b)
    ;// $sum1       (c + d) >> 1. Not modified
    ;// $lsb1       (c ^ d)       Not modified
    ;// $rndVal     Assembler Variable. 0 for rounding, 1 for no rounding
    ;//
    ;// Outputs:
    ;// $sum0       (a + b + c + d + 1) / 4 : If no rounding
    ;//             (a + b + c + d + 2) / 4 : If rounding
    ;// $lsb0       overwritten with (s0 + s1) >> 1
    ;// tmp         overwritten with the correction bits

    MACRO
    M_AVG4          $sum0, $lsb0, $sum1, $lsb1, $rndVal
        ;// Combine the two pair xors
        IF $rndVal = 0 ;// rounding case
            AND         $lsb0, $lsb0, $lsb1          ;// e0 = e0 & e1
        ELSE           ;// Not rounding case
            ORR         $lsb0, $lsb0, $lsb1          ;// e0 = e0 | e1
        ENDIF

        EOR         tmp, $sum0, $sum1            ;// e2 = s0 ^ s1

        ;// Fold the combined xors into the carry of the half-sums
        IF $rndVal = 0 ;// rounding case
            ORR         tmp, tmp, $lsb0              ;// e2 = e2 | e0
        ELSE           ;// Not rounding case
            AND         tmp, tmp, $lsb0              ;// e2 = e2 & e0
        ENDIF

        ;// yMask >> 4 leaves one set bit at the bottom of every byte
        AND         tmp, tmp, yMask, LSR # 4     ;// e2 = e2 & mask
        UHADD8      $lsb0, $sum0, $sum1          ;// s2 = (s0 + s1)/2
        UADD8       $sum0, $lsb0, tmp            ;// dest = s2 + e2
    MEND
;// ***************************************************************************
;// Motion compensation handler macros
;// ***************************************************************************
    ;// Description:
    ;// Implement motion compensation routines using the named registers in
    ;// the callee function. Each of the following four macros implements one
    ;// of the four predict types. Each of them handles eight cases, i.e. all
    ;// the combinations of the four source alignment offsets and the two
    ;// values of the rounding flag.
    ;//
    ;// Syntax:
    ;// M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
    ;// M_MCRECONBLOCK_HalfPixelX   $rndVal, $offset
    ;// M_MCRECONBLOCK_HalfPixelY   $rndVal, $offset
    ;// M_MCRECONBLOCK_HalfPixelXY  $rndVal, $offset
    ;//
    ;// Inputs:
    ;// $rndVal     Assembler Variable. 0 for rounding, 1 for no rounding
    ;// $offset     $pSrc MOD 4 value. Offset from 4 byte aligned location.
    ;//
    ;// Outputs:
    ;// Outputs come in the named registers of the callee functions.
    ;// The macro loads the data from the source pointer, processes it and
    ;// stores it through the destination pointer. It does the whole
    ;// prediction cycle of the Motion Compensation routine for a particular
    ;// predictType. After this, only the addition of the residue to the
    ;// predicted values remains.
3480c1bc742181ded4930842b46e9507372f0b1b963James Dong
3490c1bc742181ded4930842b46e9507372f0b1b963James Dong    MACRO
3500c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
3510c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Algorithmic Description:
3520c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// This handles motion compensation for IntegerPixel predictType. Both
3530c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// rounding cases are handled by the same code base. It is just a copy
3540c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// from source to destination. Two lines are done per loop to reduce
3550c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// stalls. Loop has been software pipelined as well for that purpose.
3560c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;//
3570c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// M_LOAD_X loads a whole row in two registers and then they are stored
3580c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// with STRD. The rndVal parameter is not referenced in the body.
3590c1bc742181ded4930842b46e9507372f0b1b963James DongCaseIntegerPixelRnd0Offset$offset                ;// Both rounding labels are defined here,
3600c1bc742181ded4930842b46e9507372f0b1b963James DongCaseIntegerPixelRnd1Offset$offset                ;// so they share one entry point
3610c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp3, $offset ;// Pre-load first row into tmp1:tmp2
3620c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset ;// Pre-load second row into tmp3:tmp4
3630c1bc742181ded4930842b46e9507372f0b1b963James DongYloopIntegerPixelOffset$offset
3640c1bc742181ded4930842b46e9507372f0b1b963James Dong    SUBS        y, y, #2                    ;// Two rows per pass; flags are consumed by BGT below
3650c1bc742181ded4930842b46e9507372f0b1b963James Dong    STRD        tmp1, tmp2, [pDst], dstStep ;// Store 8 bytes, then pDst += dstStep
3660c1bc742181ded4930842b46e9507372f0b1b963James Dong    STRD        tmp3, tmp4, [pDst], dstStep
3670c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp3, $offset ;// Load the rows for the next pass.
3680c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset ;// NOTE(review): also executed on the final pass, where the data is never stored
3690c1bc742181ded4930842b46e9507372f0b1b963James Dong    BGT         YloopIntegerPixelOffset$offset ;// Uses flags from SUBS; assumes M_LOAD_X leaves flags intact - TODO confirm
3700c1bc742181ded4930842b46e9507372f0b1b963James Dong
3710c1bc742181ded4930842b46e9507372f0b1b963James Dong    B           SwitchPredictTypeEnd        ;// Prediction done; continue at the common exit label
3720c1bc742181ded4930842b46e9507372f0b1b963James Dong    MEND
3730c1bc742181ded4930842b46e9507372f0b1b963James Dong;// ***************************************************************************
3740c1bc742181ded4930842b46e9507372f0b1b963James Dong    MACRO
3750c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_MCRECONBLOCK_HalfPixelX $rndVal, $offset
3760c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Algorithmic Description:
3770c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// This handles motion compensation for HalfPixelX predictType. The two
3780c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// rounding cases are handled by different code bases, generated by
3790c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// different macro calls. Loop has been software pipelined to reduce
3800c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// stalls.
3810c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;//
3820c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Filtering involves averaging a pixel with the next horizontal pixel.
3830c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// M_LOAD_XINT and M_EXT_XINT combination generate 4 registers, 2 with
3840c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// all pixels in a row with 4 pixel in each register and another 2
3850c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// registers with pixels corresponding to one horizontally shifted pixel
3860c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// corresponding to the initial row pixels. These are set of packed
3870c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// registers appropriate to do 4 lane SIMD.
3880c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// After that M_UHADD8R macro does the averaging taking care of the
3890c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// rounding as required
3900c1bc742181ded4930842b46e9507372f0b1b963James Dong
3910c1bc742181ded4930842b46e9507372f0b1b963James DongCaseHalfPixelXRnd$rndVal.Offset$offset
3920c1bc742181ded4930842b46e9507372f0b1b963James Dong    IF $rndVal = 0                          ;// Constant is loaded only in the rounding variant
3930c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR mask, =0x80808080               ;// mask shares r11 with the run-time rndVal register
3940c1bc742181ded4930842b46e9507372f0b1b963James Dong    ENDIF
3950c1bc742181ded4930842b46e9507372f0b1b963James Dong
3960c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4 ;// Pre-load the first row
3970c1bc742181ded4930842b46e9507372f0b1b963James DongYloopHalfPixelXRnd$rndVal.Offset$offset
3980c1bc742181ded4930842b46e9507372f0b1b963James Dong    SUBS        y, y, #1                    ;// One row per pass; flags are consumed by BGT below
3990c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_EXT_XINT  $offset, tmp1, tmp2, tmp3, tmp4 ;// Build row (tmp1,tmp2) and shifted row (tmp3,tmp4)
4000c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_UHADD8R   tmp5, tmp1, tmp3, (1-$rndVal), mask ;// tmp5 = average of row and shifted row, low word
4010c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_UHADD8R   tmp6, tmp2, tmp4, (1-$rndVal), mask ;// tmp6 = same for the high word
4020c1bc742181ded4930842b46e9507372f0b1b963James Dong    STRD        tmp5, tmp6, [pDst], dstStep ;// Store 8 bytes, then pDst += dstStep
4030c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4 ;// Load next row (also executed on the final pass)
4040c1bc742181ded4930842b46e9507372f0b1b963James Dong    BGT         YloopHalfPixelXRnd$rndVal.Offset$offset ;// Uses flags from SUBS; assumes the helper macros leave flags intact - TODO confirm
4050c1bc742181ded4930842b46e9507372f0b1b963James Dong
4060c1bc742181ded4930842b46e9507372f0b1b963James Dong    B           SwitchPredictTypeEnd        ;// Prediction done; continue at the common exit label
4070c1bc742181ded4930842b46e9507372f0b1b963James Dong    MEND
4080c1bc742181ded4930842b46e9507372f0b1b963James Dong;// ***************************************************************************
4090c1bc742181ded4930842b46e9507372f0b1b963James Dong    MACRO
4100c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_MCRECONBLOCK_HalfPixelY $rndVal, $offset
4110c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Algorithmic Description:
4120c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// This handles motion compensation for HalfPixelY predictType. The two
4130c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// rounding cases are handled by different code bases, generated by
4140c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// different macro calls. PreLoading is used to avoid reload of same data.
4150c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;//
4160c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Filtering involves averaging a pixel with the next vertical pixel.
4170c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// M_LOAD_X generates 2 registers with all pixels in a row with 4 pixel in
4180c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// each register. These are set of packed registers appropriate to do
4190c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// 4 lane SIMD. After that M_UHADD8R macro does the averaging taking care
4200c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// of the rounding as required
4210c1bc742181ded4930842b46e9507372f0b1b963James Dong
4220c1bc742181ded4930842b46e9507372f0b1b963James DongCaseHalfPixelYRnd$rndVal.Offset$offset
4230c1bc742181ded4930842b46e9507372f0b1b963James Dong    IF $rndVal = 0                          ;// Constant is loaded only in the rounding variant
4240c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR mask, =0x80808080               ;// mask shares r11 with the run-time rndVal register
4250c1bc742181ded4930842b46e9507372f0b1b963James Dong    ENDIF
4260c1bc742181ded4930842b46e9507372f0b1b963James Dong
4270c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp5, $offset ;// Pre-load
4280c1bc742181ded4930842b46e9507372f0b1b963James DongYloopHalfPixelYRnd$rndVal.Offset$offset
4290c1bc742181ded4930842b46e9507372f0b1b963James Dong    SUBS        y, y, #2                    ;// Two rows per pass; flags are consumed by BGT below
4300c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Processing one line: average the held row (tmp1,tmp2) with the new row (tmp3,tmp4)
4310c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
4320c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_UHADD8R   tmp1, tmp1, tmp3, (1-$rndVal), mask
4330c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_UHADD8R   tmp2, tmp2, tmp4, (1-$rndVal), mask
4340c1bc742181ded4930842b46e9507372f0b1b963James Dong    STRD        tmp1, tmp2, [pDst], dstStep ;// Store 8 bytes, then pDst += dstStep
4350c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Processing another line: (tmp3,tmp4) is now the held row; the new row goes to (tmp1,tmp2)
4360c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp5, $offset ;// This row is kept in tmp1,tmp2 for the next pass
4370c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_UHADD8R   tmp3, tmp3, tmp1, (1-$rndVal), mask
4380c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_UHADD8R   tmp4, tmp4, tmp2, (1-$rndVal), mask
4390c1bc742181ded4930842b46e9507372f0b1b963James Dong    STRD        tmp3, tmp4, [pDst], dstStep
4400c1bc742181ded4930842b46e9507372f0b1b963James Dong
4410c1bc742181ded4930842b46e9507372f0b1b963James Dong    BGT         YloopHalfPixelYRnd$rndVal.Offset$offset ;// Uses flags from SUBS; assumes the helper macros leave flags intact - TODO confirm
4420c1bc742181ded4930842b46e9507372f0b1b963James Dong
4430c1bc742181ded4930842b46e9507372f0b1b963James Dong    B           SwitchPredictTypeEnd        ;// Prediction done; continue at the common exit label
4440c1bc742181ded4930842b46e9507372f0b1b963James Dong    MEND
4450c1bc742181ded4930842b46e9507372f0b1b963James Dong;// ***************************************************************************
4460c1bc742181ded4930842b46e9507372f0b1b963James Dong    MACRO
4470c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_MCRECONBLOCK_HalfPixelXY $rndVal, $offset
4480c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Algorithmic Description:
4490c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// This handles motion compensation for HalfPixelXY predictType. The two
4500c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// rounding cases are handled by different code bases, generated by
4510c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// different macro calls. PreLoading is used to avoid reload of same data.
4520c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;//
4530c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Filtering involves averaging a pixel with the next vertical, horizontal
4540c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// and right-down diagonal pixels. Just as in HalfPixelX case, M_LOAD_XINT
4550c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// and M_EXT_XINT combination generates 4 registers with a row and its
4560c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// 1 pixel right shifted version, with 4 pixels in one register. Another
4570c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// call of that macro-combination gets another row. Then M_HSUM_XOR is
4580c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// called to get mutual half-sum and xor combinations of a row with its
4590c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// shifted version as they are inputs to the M_AVG4 macro which computes
4600c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// the 4 element average with rounding. Note that it is the half-sum/xor
4610c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// values that are preserved for next row as they can be re-used in the
4620c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// next call to the M_AVG4 and saves recomputation.
4630c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Due to lack of registers, the row counter and a masking value required
4640c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// in M_AVG4 are packed into a single register yMask where the lowest nibble
4650c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// holds the row counter value and the rest holds the masking variable left
4660c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// shifted by 4. The counter starts at 8 and is decremented by 2 per pass.
4670c1bc742181ded4930842b46e9507372f0b1b963James Dong
4680c1bc742181ded4930842b46e9507372f0b1b963James DongCaseHalfPixelXYRnd$rndVal.Offset$offset
4690c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR         yMask, =((0x01010101 << 4) + 8) ;// mask in bits 31:4, row counter 8 in bits 3:0
4700c1bc742181ded4930842b46e9507372f0b1b963James Dong
4710c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
4720c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_EXT_XINT  $offset, t00, t01, t10, t11
4730c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_HSUM_XOR  t00, t10, tmp               ;// s0, l0
4740c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_HSUM_XOR  t01, t11, tmp               ;// s0', l0'
4750c1bc742181ded4930842b46e9507372f0b1b963James Dong
4760c1bc742181ded4930842b46e9507372f0b1b963James DongYloopHalfPixelXYRnd$rndVal.Offset$offset
4770c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Processing one line
4780c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// t00, t01, t10, t11 required from previous loop
4790c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_LOAD_XINT pSrc, srcStep, $offset, t20, t21, t30, t31 ;// Load c, c', d, d'
4800c1bc742181ded4930842b46e9507372f0b1b963James Dong    SUB         yMask, yMask, #2            ;// Row counter -= 2; no flag update here
4810c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_EXT_XINT  $offset, t20, t21, t30, t31
4820c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_HSUM_XOR  t20, t30, tmp               ;// s1, l1
4830c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_HSUM_XOR  t21, t31, tmp               ;// s1', l1'
4840c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_AVG4      t00, t10, t20, t30, $rndVal ;// s0, l0, s1, l1
4850c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_AVG4      t01, t11, t21, t31, $rndVal ;// s0', l0', s1', l1'
4860c1bc742181ded4930842b46e9507372f0b1b963James Dong    STRD        t00, t01, [pDst], dstStep   ;// store the average
4870c1bc742181ded4930842b46e9507372f0b1b963James Dong
4880c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Processing another line
4890c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// t20, t21, t30, t31 required from above
4900c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
4910c1bc742181ded4930842b46e9507372f0b1b963James Dong    TST         yMask, #7                   ;// Z = 1 once the row counter reaches 0; consumed by BNE below
4920c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_EXT_XINT  $offset, t00, t01, t10, t11
4930c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_HSUM_XOR  t00, t10, tmp               ;// s0, l0 for the next pass
4940c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_HSUM_XOR  t01, t11, tmp               ;// s0', l0' for the next pass
4950c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_AVG4      t20, t30, t00, t10, $rndVal
4960c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_AVG4      t21, t31, t01, t11, $rndVal
4970c1bc742181ded4930842b46e9507372f0b1b963James Dong    STRD        t20, t21, [pDst], dstStep   ;// store the average
4980c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// TST writes N and Z but not V, so the loop tests NE (Z only); a signed condition would read a stale V
4990c1bc742181ded4930842b46e9507372f0b1b963James Dong    BNE         YloopHalfPixelXYRnd$rndVal.Offset$offset ;// Assumes the helper macros leave flags intact - TODO confirm
5000c1bc742181ded4930842b46e9507372f0b1b963James Dong
5010c1bc742181ded4930842b46e9507372f0b1b963James Dong    IF $offset/=3 :LOR: $rndVal/=1          ;// The (rounding 1, offset 3) expansion is emitted last and falls through
5020c1bc742181ded4930842b46e9507372f0b1b963James Dong        B           SwitchPredictTypeEnd
5030c1bc742181ded4930842b46e9507372f0b1b963James Dong    ENDIF
5040c1bc742181ded4930842b46e9507372f0b1b963James Dong    MEND
5050c1bc742181ded4930842b46e9507372f0b1b963James Dong;// ***************************************************************************
5060c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Motion compensation handler macros end here
5070c1bc742181ded4930842b46e9507372f0b1b963James Dong;// ***************************************************************************
5080c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Description:
5090c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Populates all 4 kinds of offsets "cases" for each predictType and rndVal
5100c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// combination in the "switch" to prediction processing code segment
5110c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;//
5120c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Syntax:
5130c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// M_CASE_OFFSET $rnd, $predictType
5140c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;//
5150c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Inputs:
5160c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// $rnd            0 for rounding, 1 for no rounding
5170c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// $predictType    The prediction mode
5180c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;//
5190c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Outputs:
5200c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Populated list of "M_CASE"s for the "M_SWITCH" macro
5210c1bc742181ded4930842b46e9507372f0b1b963James Dong
5220c1bc742181ded4930842b46e9507372f0b1b963James Dong    MACRO
5230c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_CASE_OFFSET $rnd, $predictType
5240c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_CASE      Case$predictType.Rnd$rnd.Offset0 ;// Entry order follows the low two bits of the switch index (pSrc AND 3)
5250c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_CASE      Case$predictType.Rnd$rnd.Offset1
5260c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_CASE      Case$predictType.Rnd$rnd.Offset2
5270c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_CASE      Case$predictType.Rnd$rnd.Offset3 ;// Target labels are defined by the M_MCRECONBLOCK_<predictType> macros
5280c1bc742181ded4930842b46e9507372f0b1b963James Dong    MEND
5290c1bc742181ded4930842b46e9507372f0b1b963James Dong;// ***************************************************************************
5300c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Description:
5310c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Populates all 2 kinds of rounding "cases" for each predictType in the
5320c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// "switch" to prediction processing code segment
5330c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;//
5340c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Syntax:
5350c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// M_CASE_MCRECONBLOCK $predictType
5360c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;//
5370c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Inputs:
5380c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// $predictType    The prediction mode
5390c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;//
5400c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Outputs:
5410c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Populated list of "M_CASE_OFFSET" macros
5420c1bc742181ded4930842b46e9507372f0b1b963James Dong
5430c1bc742181ded4930842b46e9507372f0b1b963James Dong    MACRO
5440c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_CASE_MCRECONBLOCK $predictType
5450c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_CASE_OFFSET  0, $predictType ;// 0 for rounding; these four entries come first (index bit 2 clear)
5460c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_CASE_OFFSET  1, $predictType ;// 1 for no rounding; next four entries (index bit 2 set)
5470c1bc742181ded4930842b46e9507372f0b1b963James Dong    MEND
5480c1bc742181ded4930842b46e9507372f0b1b963James Dong;// ***************************************************************************
5490c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Description:
5500c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Populates all 8 kinds of rounding and offset combinations handling macros
5510c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// for the specified predictType. In case of "IntegerPixel" predictType,
5520c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// rounding is not required so same code segment handles both cases
5530c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;//
5540c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Syntax:
5550c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// M_MCRECONBLOCK    $predictType
5560c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;//
5570c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Inputs:
5580c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// $predictType    The prediction mode
5590c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;//
5600c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Outputs:
5610c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Populated list of "M_MCRECONBLOCK_<predictType>" macros for specified
5620c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// predictType. Each
5630c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;//                 M_MCRECONBLOCK_<predictType> $rnd, $offset
5640c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// is a code segment (starting with a label indicating the predictType,
5650c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// rounding and offset combination)
5660c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Four calls of this macro with the 4 prediction modes populate all the 32
5670c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// handlers
5680c1bc742181ded4930842b46e9507372f0b1b963James Dong
5690c1bc742181ded4930842b46e9507372f0b1b963James Dong    MACRO
5700c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_MCRECONBLOCK $predictType
5710c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_MCRECONBLOCK_$predictType 0, 0 ;// Arguments are (rounding, offset); rounding 0 variants first
5720c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_MCRECONBLOCK_$predictType 0, 1
5730c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_MCRECONBLOCK_$predictType 0, 2
5740c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_MCRECONBLOCK_$predictType 0, 3
5750c1bc742181ded4930842b46e9507372f0b1b963James Dong    IF "$predictType" /= "IntegerPixel" ;// If not IntegerPixel then rounding makes a difference; IntegerPixel already defined both rounding labels above
5760c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_MCRECONBLOCK_$predictType 1, 0
5770c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_MCRECONBLOCK_$predictType 1, 1
5780c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_MCRECONBLOCK_$predictType 1, 2
5790c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_MCRECONBLOCK_$predictType 1, 3 ;// Must stay last: the HalfPixelXY (1, 3) expansion has no exit branch and falls through
5800c1bc742181ded4930842b46e9507372f0b1b963James Dong    ENDIF
5810c1bc742181ded4930842b46e9507372f0b1b963James Dong    MEND
5820c1bc742181ded4930842b46e9507372f0b1b963James Dong;// ***************************************************************************
5830c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Input/Output Registers (several names share one physical register; see the notes on each line)
5840c1bc742181ded4930842b46e9507372f0b1b963James DongpSrc                  RN 0      ;// Source pointer; truncated to a 4 byte boundary before the switch
5850c1bc742181ded4930842b46e9507372f0b1b963James DongsrcStep               RN 1      ;// Source stride passed to the load macros
5860c1bc742181ded4930842b46e9507372f0b1b963James Dongarg_pSrcResidue       RN 2      ;// Incoming residue pointer; saved to the stack before r2 is reused
5870c1bc742181ded4930842b46e9507372f0b1b963James DongpSrcResidue           RN 12     ;// Residue pointer reloaded from the stack for the residue addition
5880c1bc742181ded4930842b46e9507372f0b1b963James DongpDst                  RN 3      ;// Destination pointer; saved to the stack and reloaded for the residue addition
5890c1bc742181ded4930842b46e9507372f0b1b963James DongdstStep               RN 2      ;// Destination stride loaded from the stack arguments (shares r2 with arg_pSrcResidue)
5900c1bc742181ded4930842b46e9507372f0b1b963James DongpredictType           RN 10     ;// Becomes the switch index: predictType<<3 | rndVal<<2 | (pSrc AND 3)
5910c1bc742181ded4930842b46e9507372f0b1b963James DongrndVal                RN 11     ;// Run-time rounding flag; folded into the switch index
5920c1bc742181ded4930842b46e9507372f0b1b963James Dongmask                  RN 11     ;// Shares r11 with rndVal; loaded with 0x80808080 in the rounding variants
5930c1bc742181ded4930842b46e9507372f0b1b963James Dong
5940c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Local Scratch Registers
5950c1bc742181ded4930842b46e9507372f0b1b963James Dongzero                  RN 12
5960c1bc742181ded4930842b46e9507372f0b1b963James Dongy                     RN 14     ;// Row counter, initialised to 8 (r14 is the link register)
5970c1bc742181ded4930842b46e9507372f0b1b963James Dong
5980c1bc742181ded4930842b46e9507372f0b1b963James Dongtmp1                  RN 4      ;// tmp1..tmp9 map to r4..r12 in order
5990c1bc742181ded4930842b46e9507372f0b1b963James Dongtmp2                  RN 5
6000c1bc742181ded4930842b46e9507372f0b1b963James Dongtmp3                  RN 6
6010c1bc742181ded4930842b46e9507372f0b1b963James Dongtmp4                  RN 7
6020c1bc742181ded4930842b46e9507372f0b1b963James Dongtmp5                  RN 8
6030c1bc742181ded4930842b46e9507372f0b1b963James Dongtmp6                  RN 9
6040c1bc742181ded4930842b46e9507372f0b1b963James Dongtmp7                  RN 10     ;// Shares r10 with predictType
6050c1bc742181ded4930842b46e9507372f0b1b963James Dongtmp8                  RN 11     ;// Shares r11 with rndVal and mask
6060c1bc742181ded4930842b46e9507372f0b1b963James Dongtmp9                  RN 12     ;// Shares r12 with pSrcResidue, zero and tmp
6070c1bc742181ded4930842b46e9507372f0b1b963James Dong
6080c1bc742181ded4930842b46e9507372f0b1b963James Dongt00                   RN 4      ;// t00..t31 overlay r4..r11 (same registers as tmp1..tmp8); used by the HalfPixelXY macro
6090c1bc742181ded4930842b46e9507372f0b1b963James Dongt01                   RN 5
6100c1bc742181ded4930842b46e9507372f0b1b963James Dongt10                   RN 6
6110c1bc742181ded4930842b46e9507372f0b1b963James Dongt11                   RN 7
6120c1bc742181ded4930842b46e9507372f0b1b963James Dongt20                   RN 8
6130c1bc742181ded4930842b46e9507372f0b1b963James Dongt21                   RN 9
6140c1bc742181ded4930842b46e9507372f0b1b963James Dongt30                   RN 10
6150c1bc742181ded4930842b46e9507372f0b1b963James Dongt31                   RN 11
6160c1bc742181ded4930842b46e9507372f0b1b963James Dongtmp                   RN 12     ;// Register passed as the third argument of M_HSUM_XOR
6170c1bc742181ded4930842b46e9507372f0b1b963James Dong
6180c1bc742181ded4930842b46e9507372f0b1b963James DongyMask                 RN 14     ;// Shares r14 with y; packed row counter and mask for HalfPixelXY
6190c1bc742181ded4930842b46e9507372f0b1b963James Dong
6200c1bc742181ded4930842b46e9507372f0b1b963James Dongdst                   RN 1      ;// Shares r1 with srcStep; holds destination pixels during the residue addition
6210c1bc742181ded4930842b46e9507372f0b1b963James Dongreturn                RN 0      ;// Shares r0 with pSrc
6220c1bc742181ded4930842b46e9507372f0b1b963James Dong
6230c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Allocate memory on stack
6240c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_ALLOC4    Stk_pDst,           4
6250c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_ALLOC4    Stk_pSrcResidue,    4
6260c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Function header
6270c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_START     omxVCM4P2_MCReconBlock, r11
6280c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Define stack arguments
6290c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_ARG       Arg_dstStep,        4
6300c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_ARG       Arg_predictType,    4
6310c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_ARG       Arg_rndVal,         4
6320c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Save on stack
6330c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_STR       pDst, Stk_pDst
6340c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_STR       arg_pSrcResidue, Stk_pSrcResidue
6350c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Load argument from the stack
6360c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_LDR       dstStep, Arg_dstStep
6370c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_LDR       predictType, Arg_predictType
6380c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_LDR       rndVal, Arg_rndVal
6390c1bc742181ded4930842b46e9507372f0b1b963James Dong
6400c1bc742181ded4930842b46e9507372f0b1b963James Dong    MOV         y, #8
6410c1bc742181ded4930842b46e9507372f0b1b963James Dong
6420c1bc742181ded4930842b46e9507372f0b1b963James Dong    AND         tmp1, pSrc, #3
6430c1bc742181ded4930842b46e9507372f0b1b963James Dong    ORR         predictType, tmp1, predictType, LSL #3
6440c1bc742181ded4930842b46e9507372f0b1b963James Dong    ORR         predictType, predictType, rndVal, LSL #2
6450c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Truncating source pointer to align to 4 byte location
6460c1bc742181ded4930842b46e9507372f0b1b963James Dong    BIC         pSrc, pSrc, #3
6470c1bc742181ded4930842b46e9507372f0b1b963James Dong
6480c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Implementation takes care of all combinations of different
6490c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// predictTypes, rounding cases and source pointer offsets to alignment
6500c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// of 4 bytes in different code bases unless one of these parameter wasn't
6510c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// making any difference to the implementation. Below M_CASE_MCRECONBLOCK
6520c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// macros branch into 8 M_CASE macros for all combinations of the 2
6530c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// rounding cases and 4 offsets of the pSrc pointer to the 4 byte
6540c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// alignment.
6550c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_SWITCH    predictType
6560c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_CASE_MCRECONBLOCK IntegerPixel
6570c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_CASE_MCRECONBLOCK HalfPixelX
6580c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_CASE_MCRECONBLOCK HalfPixelY
6590c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_CASE_MCRECONBLOCK HalfPixelXY
6600c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_ENDSWITCH
6610c1bc742181ded4930842b46e9507372f0b1b963James Dong
6620c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// The M_MCRECONBLOCK macros populate the code bases by calling all 8
6630c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// particular macros (4 in case of IntegerPixel as rounding makes no
6640c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// difference there) to generate the code for all cases of rounding and
6650c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// offsets. LTORG is used to segment the code as code size bloated beyond
6660c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// 4KB.
6670c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_MCRECONBLOCK IntegerPixel
6680c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_MCRECONBLOCK HalfPixelX
6690c1bc742181ded4930842b46e9507372f0b1b963James Dong    LTORG
6700c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_MCRECONBLOCK HalfPixelY
6710c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_MCRECONBLOCK HalfPixelXY
6720c1bc742181ded4930842b46e9507372f0b1b963James DongSwitchPredictTypeEnd
6730c1bc742181ded4930842b46e9507372f0b1b963James Dong
6740c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Residue Addition (skipped entirely when pSrcResidue is NULL)
6750c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// Each of the 8 rows is handled as two groups of 4 pixels: one word load
6760c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// fetches 4 destination bytes, which are widened into two 2-lane halfword
6770c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// vectors, added to the 16-bit residues and clipped to 0..255. Details inline.
6780c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_LDR       pSrcResidue, Stk_pSrcResidue   ;// pSrcResidue = value saved at Stk_pSrcResidue
6790c1bc742181ded4930842b46e9507372f0b1b963James Dong    CMP         pSrcResidue, #0                ;// NULL residue pointer => nothing to add
6800c1bc742181ded4930842b46e9507372f0b1b963James Dong    BEQ         pSrcResidueConditionEnd
6810c1bc742181ded4930842b46e9507372f0b1b963James DongpSrcResidueNotNull
6820c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_LDR       pDst, Stk_pDst             ;// pDst = value saved at Stk_pDst
6830c1bc742181ded4930842b46e9507372f0b1b963James Dong    MOV         y, #8                      ;// row counter: 8 rows
6840c1bc742181ded4930842b46e9507372f0b1b963James Dong    SUB         dstStep, dstStep, #4       ;// the first store of each row already advances pDst by 4
6850c1bc742181ded4930842b46e9507372f0b1b963James DongYloop_pSrcResidueNotNull
6860c1bc742181ded4930842b46e9507372f0b1b963James Dong    SUBS        y, y, #1                   ;// flags set here are consumed by BGT at the loop end
6870c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR         dst, [pDst]                ;// dst = [dcba]
6880c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDMIA       pSrcResidue!, {tmp1, tmp2} ;// tmp1=[B A] tmp2=[D C] (16-bit residues; assumes tmp1 is the lower-numbered register)
6890c1bc742181ded4930842b46e9507372f0b1b963James Dong    PKHBT       tmp3, tmp1, tmp2, LSL #16  ;// Deltaval1 = [C A]
6900c1bc742181ded4930842b46e9507372f0b1b963James Dong    PKHTB       tmp4, tmp2, tmp1, ASR #16  ;// DeltaVal2 = [D B]
6910c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16      tmp1, dst                  ;// tmp1 = [0c0a]
6920c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16      tmp2, dst, ROR #8          ;// tmp2 = [0d0b]
6930c1bc742181ded4930842b46e9507372f0b1b963James Dong    QADD16      tmp1, tmp1, tmp3           ;// per-lane signed saturating add: [c+C a+A]
6940c1bc742181ded4930842b46e9507372f0b1b963James Dong    QADD16      tmp2, tmp2, tmp4           ;// per-lane signed saturating add: [d+D b+B]
6950c1bc742181ded4930842b46e9507372f0b1b963James Dong    USAT16      tmp1, #8, tmp1             ;// armClip(0, 255, tmp1)
6960c1bc742181ded4930842b46e9507372f0b1b963James Dong    USAT16      tmp2, #8, tmp2             ;// armClip(0, 255, tmp2)
6970c1bc742181ded4930842b46e9507372f0b1b963James Dong    ORR         tmp1, tmp1, tmp2, LSL #8   ;// tmp1 = [dcba]
6980c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR         tmp1, [pDst], #4           ;// write back the first 4 pixels of the row, pDst += 4
6990c1bc742181ded4930842b46e9507372f0b1b963James Dong
7000c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR         dst, [pDst]                ;// next 4 destination pixels of the same row
7010c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDMIA       pSrcResidue!, {tmp1, tmp2} ;// next 4 residues, same layout as above
7020c1bc742181ded4930842b46e9507372f0b1b963James Dong    PKHBT       tmp3, tmp1, tmp2, LSL #16  ;// even-position residues
7030c1bc742181ded4930842b46e9507372f0b1b963James Dong    PKHTB       tmp4, tmp2, tmp1, ASR #16  ;// odd-position residues
7040c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16      tmp1, dst                  ;// even-position pixels, zero-extended to 16 bits
7050c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16      tmp2, dst, ROR #8          ;// odd-position pixels, zero-extended to 16 bits
7060c1bc742181ded4930842b46e9507372f0b1b963James Dong    QADD16      tmp1, tmp1, tmp3           ;// pixel + residue, saturated to 16 bits
7070c1bc742181ded4930842b46e9507372f0b1b963James Dong    QADD16      tmp2, tmp2, tmp4
7080c1bc742181ded4930842b46e9507372f0b1b963James Dong    USAT16      tmp1, #8, tmp1             ;// clip each lane to 0..255
7090c1bc742181ded4930842b46e9507372f0b1b963James Dong    USAT16      tmp2, #8, tmp2
7100c1bc742181ded4930842b46e9507372f0b1b963James Dong    ORR         tmp1, tmp1, tmp2, LSL #8   ;// re-interleave the lanes into 4 bytes
7110c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR         tmp1, [pDst], dstStep      ;// pDst += (original dstStep - 4): start of the next row
7120c1bc742181ded4930842b46e9507372f0b1b963James Dong
7130c1bc742181ded4930842b46e9507372f0b1b963James Dong    BGT         Yloop_pSrcResidueNotNull   ;// repeat while y > 0 (no instruction since SUBS changed N/Z/C/V)
7140c1bc742181ded4930842b46e9507372f0b1b963James DongpSrcResidueConditionEnd
7150c1bc742181ded4930842b46e9507372f0b1b963James Dong
7160c1bc742181ded4930842b46e9507372f0b1b963James Dong    MOV         return, #OMX_Sts_NoErr     ;// both the NULL-residue path and the residue-added path report success
7170c1bc742181ded4930842b46e9507372f0b1b963James Dong
7180c1bc742181ded4930842b46e9507372f0b1b963James Dong    M_END                                  ;// NOTE(review): presumably emits the function epilogue; macro is defined outside this file section
7190c1bc742181ded4930842b46e9507372f0b1b963James Dong    ENDIF ;// ARM1136JS
7200c1bc742181ded4930842b46e9507372f0b1b963James Dong
7210c1bc742181ded4930842b46e9507372f0b1b963James Dong;// ***************************************************************************
7220c1bc742181ded4930842b46e9507372f0b1b963James Dong;// CortexA8 implementation
7230c1bc742181ded4930842b46e9507372f0b1b963James Dong;// ***************************************************************************
7240c1bc742181ded4930842b46e9507372f0b1b963James Dong    END
7250c1bc742181ded4930842b46e9507372f0b1b963James Dong;// ***************************************************************************
7260c1bc742181ded4930842b46e9507372f0b1b963James Dong;// omxVCM4P2_MCReconBlock ends
7270c1bc742181ded4930842b46e9507372f0b1b963James Dong;// ***************************************************************************
728