10c1bc742181ded4930842b46e9507372f0b1b963James Dong; Copyright (C) 2009 The Android Open Source Project
20c1bc742181ded4930842b46e9507372f0b1b963James Dong;
30c1bc742181ded4930842b46e9507372f0b1b963James Dong; Licensed under the Apache License, Version 2.0 (the "License");
40c1bc742181ded4930842b46e9507372f0b1b963James Dong; you may not use this file except in compliance with the License.
50c1bc742181ded4930842b46e9507372f0b1b963James Dong; You may obtain a copy of the License at
60c1bc742181ded4930842b46e9507372f0b1b963James Dong;
70c1bc742181ded4930842b46e9507372f0b1b963James Dong;      http://www.apache.org/licenses/LICENSE-2.0
80c1bc742181ded4930842b46e9507372f0b1b963James Dong;
90c1bc742181ded4930842b46e9507372f0b1b963James Dong; Unless required by applicable law or agreed to in writing, software
100c1bc742181ded4930842b46e9507372f0b1b963James Dong; distributed under the License is distributed on an "AS IS" BASIS,
110c1bc742181ded4930842b46e9507372f0b1b963James Dong; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
120c1bc742181ded4930842b46e9507372f0b1b963James Dong; See the License for the specific language governing permissions and
130c1bc742181ded4930842b46e9507372f0b1b963James Dong; limitations under the License.
140c1bc742181ded4930842b46e9507372f0b1b963James Dong
150c1bc742181ded4930842b46e9507372f0b1b963James Dong;-------------------------------------------------------------------------------
160c1bc742181ded4930842b46e9507372f0b1b963James Dong;--
170c1bc742181ded4930842b46e9507372f0b1b963James Dong;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorVerQuarter
180c1bc742181ded4930842b46e9507372f0b1b963James Dong;--            function
190c1bc742181ded4930842b46e9507372f0b1b963James Dong;--
200c1bc742181ded4930842b46e9507372f0b1b963James Dong;-------------------------------------------------------------------------------
210c1bc742181ded4930842b46e9507372f0b1b963James Dong
220c1bc742181ded4930842b46e9507372f0b1b963James Dong
230c1bc742181ded4930842b46e9507372f0b1b963James Dong    IF :DEF: H264DEC_WINASM
240c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
250c1bc742181ded4930842b46e9507372f0b1b963James Dong    ELSE
260c1bc742181ded4930842b46e9507372f0b1b963James Dong        REQUIRE8
270c1bc742181ded4930842b46e9507372f0b1b963James Dong        PRESERVE8
280c1bc742181ded4930842b46e9507372f0b1b963James Dong    ENDIF
290c1bc742181ded4930842b46e9507372f0b1b963James Dong
300c1bc742181ded4930842b46e9507372f0b1b963James Dong    AREA    |.text|, CODE
310c1bc742181ded4930842b46e9507372f0b1b963James Dong
320c1bc742181ded4930842b46e9507372f0b1b963James Dong;// h264bsdInterpolateHorVerQuarter register allocation
330c1bc742181ded4930842b46e9507372f0b1b963James Dong
340c1bc742181ded4930842b46e9507372f0b1b963James Dongref     RN 0        ;// source pixel pointer; advanced by y0*width+x0 before filtering
350c1bc742181ded4930842b46e9507372f0b1b963James Dong
360c1bc742181ded4930842b46e9507372f0b1b963James Dongmb      RN 1        ;// output pointer; filtered words are stored (and later re-read) through it
370c1bc742181ded4930842b46e9507372f0b1b963James Dongbuff    RN 1        ;// same register: sp+0x28 buffer address set just before the h264bsdFillBlock call
380c1bc742181ded4930842b46e9507372f0b1b963James Dong
390c1bc742181ded4930842b46e9507372f0b1b963James Dongcount   RN 2        ;// packed loop counters: loop_x, loop_y, partWidth-1, partHeight-1, width
400c1bc742181ded4930842b46e9507372f0b1b963James Dongx0      RN 2        ;// same register: x coordinate, range-checked on entry
410c1bc742181ded4930842b46e9507372f0b1b963James Dong
420c1bc742181ded4930842b46e9507372f0b1b963James Dongy0      RN 3        ;// y coordinate, range-checked on entry
430c1bc742181ded4930842b46e9507372f0b1b963James Dongx_2_0   RN 3        ;// horizontal pass: bytes 2 and 0 of a loaded word, unpacked to halfwords
440c1bc742181ded4930842b46e9507372f0b1b963James Dongres     RN 3        ;// vertical pass: packed 4-pixel result word
450c1bc742181ded4930842b46e9507372f0b1b963James Dong
460c1bc742181ded4930842b46e9507372f0b1b963James Dongx_3_1   RN 4        ;// horizontal pass: raw loaded word, then its bytes 3 and 1 as halfwords
470c1bc742181ded4930842b46e9507372f0b1b963James Dongtmp1    RN 4        ;// vertical pass: loaded row word; also holds the word read back from mb
480c1bc742181ded4930842b46e9507372f0b1b963James Dong
490c1bc742181ded4930842b46e9507372f0b1b963James Dongheight  RN 5        ;// height, loaded from the stack arguments
500c1bc742181ded4930842b46e9507372f0b1b963James Dongx_6_4   RN 5        ;// horizontal pass: bytes 6 and 4 (bytes 2 and 0 of the second word)
510c1bc742181ded4930842b46e9507372f0b1b963James Dongtmp2    RN 5        ;// vertical pass: loaded row word
520c1bc742181ded4930842b46e9507372f0b1b963James Dong
530c1bc742181ded4930842b46e9507372f0b1b963James DongpartW   RN 6        ;// partWidth, loaded from the stack arguments
540c1bc742181ded4930842b46e9507372f0b1b963James Dongx_7_5   RN 6        ;// horizontal pass: raw second word, then its bytes 3 and 1 as halfwords
550c1bc742181ded4930842b46e9507372f0b1b963James Dongtmp3    RN 6        ;// vertical pass: loaded row word
560c1bc742181ded4930842b46e9507372f0b1b963James Dong
570c1bc742181ded4930842b46e9507372f0b1b963James DongpartH   RN 7        ;// partHeight, loaded from the stack arguments
580c1bc742181ded4930842b46e9507372f0b1b963James Dongtmp4    RN 7        ;// horizontal pass: accumulator / packed output; vertical pass: row word
590c1bc742181ded4930842b46e9507372f0b1b963James Dong
600c1bc742181ded4930842b46e9507372f0b1b963James Dongtmp5    RN 8        ;// scratch: bounds check, ref offset, accumulator; vertical pass: row word
610c1bc742181ded4930842b46e9507372f0b1b963James Dong
620c1bc742181ded4930842b46e9507372f0b1b963James Dongtmp6    RN 9        ;// scratch: horVerOffset, partWidth-1 field, accumulator; vertical pass: row word
630c1bc742181ded4930842b46e9507372f0b1b963James Dong
640c1bc742181ded4930842b46e9507372f0b1b963James Dongtmpa    RN 10       ;// general scratch / accumulator in both passes
650c1bc742181ded4930842b46e9507372f0b1b963James Dong
660c1bc742181ded4930842b46e9507372f0b1b963James Dongmult_20_01  RN 11   ;// horizontal pass: 0x00140001, i.e. halfword multipliers 20 and 1
670c1bc742181ded4930842b46e9507372f0b1b963James Dongtmpb        RN 11   ;// vertical pass: scratch (reuses r11 once the multiplier is no longer needed)
680c1bc742181ded4930842b46e9507372f0b1b963James Dong
690c1bc742181ded4930842b46e9507372f0b1b963James Dongmult_20_m5  RN 12   ;// horizontal pass: 0x0014FFFB, i.e. halfword multipliers 20 and -5
700c1bc742181ded4930842b46e9507372f0b1b963James Dongwidth       RN 12   ;// same register: width loaded from the stack; row stride in the vertical pass
710c1bc742181ded4930842b46e9507372f0b1b963James Dong
720c1bc742181ded4930842b46e9507372f0b1b963James Dongplus16  RN 14       ;// rounding constant (#16, later 0x00100010); r14 is lr, which is saved on entry
730c1bc742181ded4930842b46e9507372f0b1b963James Dong
740c1bc742181ded4930842b46e9507372f0b1b963James Dong
750c1bc742181ded4930842b46e9507372f0b1b963James Dong;// function exports and imports
760c1bc742181ded4930842b46e9507372f0b1b963James Dong
770c1bc742181ded4930842b46e9507372f0b1b963James Dong    IMPORT  h264bsdFillBlock
780c1bc742181ded4930842b46e9507372f0b1b963James Dong
790c1bc742181ded4930842b46e9507372f0b1b963James Dong    EXPORT  h264bsdInterpolateHorVerQuarter
800c1bc742181ded4930842b46e9507372f0b1b963James Dong
810c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Horizontal filter approach
820c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
830c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Basic idea in horizontal filtering is to adjust coefficients
840c1bc742181ded4930842b46e9507372f0b1b963James Dong;// like below. Calculation is done with 16-bit maths.
850c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
860c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Reg     x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
870c1bc742181ded4930842b46e9507372f0b1b963James Dong;//       [  2  0 ] [  3  1 ] [  6  4 ] [  7  5 ] [ 10  8 ] ...
880c1bc742181ded4930842b46e9507372f0b1b963James Dong;// y_0 =   20  1     20 -5        -5         1
890c1bc742181ded4930842b46e9507372f0b1b963James Dong;// y_1 =   -5        20  1      1 20        -5
900c1bc742181ded4930842b46e9507372f0b1b963James Dong;// y_2 =    1        -5        -5 20      1 20
910c1bc742181ded4930842b46e9507372f0b1b963James Dong;// y_3 =              1        20 -5     -5 20         1
920c1bc742181ded4930842b46e9507372f0b1b963James Dong
930c1bc742181ded4930842b46e9507372f0b1b963James Dong
940c1bc742181ded4930842b46e9507372f0b1b963James Dongh264bsdInterpolateHorVerQuarter
950c1bc742181ded4930842b46e9507372f0b1b963James Dong    STMFD   sp!, {r0-r11, lr}
960c1bc742181ded4930842b46e9507372f0b1b963James Dong    SUB     sp, sp, #0x1e4
970c1bc742181ded4930842b46e9507372f0b1b963James Dong
980c1bc742181ded4930842b46e9507372f0b1b963James Dong    CMP     x0, #0
990c1bc742181ded4930842b46e9507372f0b1b963James Dong    BLT     do_fill                 ;// (x0 < 0)
1000c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     partW, [sp,#0x220]      ;// partWidth
1010c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     width, [sp,#0x218]      ;// width
1020c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpa, x0, partW         ;// (x0+partWidth)
1030c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpa, tmpa, #5          ;// (x0+partW+5)
1040c1bc742181ded4930842b46e9507372f0b1b963James Dong    CMP     tmpa, width
1050c1bc742181ded4930842b46e9507372f0b1b963James Dong    BHI     do_fill                 ;// (x0+partW)>width
1060c1bc742181ded4930842b46e9507372f0b1b963James Dong
1070c1bc742181ded4930842b46e9507372f0b1b963James Dong    CMP     y0, #0
1080c1bc742181ded4930842b46e9507372f0b1b963James Dong    BLT     do_fill                 ;// (y0 < 0)
1090c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     partH, [sp,#0x224]      ;// partHeight
1100c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     height, [sp,#0x21c]     ;// height
1110c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmp5, y0, partH         ;// (y0+partHeight)
1120c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmp5, tmp5, #5          ;// (y0+partH+5)
1130c1bc742181ded4930842b46e9507372f0b1b963James Dong    CMP     tmp5, height
1140c1bc742181ded4930842b46e9507372f0b1b963James Dong    BLS     skip_fill               ;// no overfill needed
1150c1bc742181ded4930842b46e9507372f0b1b963James Dong
1160c1bc742181ded4930842b46e9507372f0b1b963James Dong
1170c1bc742181ded4930842b46e9507372f0b1b963James Dongdo_fill
1180c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     partH, [sp,#0x224]      ;// partHeight
1190c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     partW, [sp,#0x220]      ;// partWidth
1200c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     height, [sp,#0x21c]     ;// height
1210c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmp5, partH, #5         ;// tmp5 = partH + 5
1220c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpa, partW, #5         ;// tmpa = partW + 5
1230c1bc742181ded4930842b46e9507372f0b1b963James Dong    STMIB   sp, {height, tmpa}      ;// sp+4 = height, sp+8 = partWidth+5
1240c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     width, [sp,#0x218]      ;// width
1250c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     tmp5, [sp,#0xc]         ;// sp+c = partHeight+5
1260c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     tmpa, [sp,#0x10]        ;// sp+10 = partWidth+5
1270c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     width, [sp,#0]          ;// sp+0 = width
1280c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     buff, sp, #0x28         ;// buff = p1[21*21/4+1]
1290c1bc742181ded4930842b46e9507372f0b1b963James Dong    BL      h264bsdFillBlock
1300c1bc742181ded4930842b46e9507372f0b1b963James Dong
1310c1bc742181ded4930842b46e9507372f0b1b963James Dong    MOV     x0, #0
1320c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     x0,[sp,#0x1ec]          ;// x0 = 0
1330c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     x0,[sp,#0x1f0]          ;// y0 = 0
1340c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     ref,sp,#0x28            ;// ref = p1
1350c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     tmpa, [sp,#0x218]       ;// width = partWidth+5
1360c1bc742181ded4930842b46e9507372f0b1b963James Dong
1370c1bc742181ded4930842b46e9507372f0b1b963James Dong
1380c1bc742181ded4930842b46e9507372f0b1b963James Dongskip_fill
1390c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     x0 ,[sp,#0x1ec]         ;// x0
1400c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     y0 ,[sp,#0x1f0]         ;// y0
1410c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     width, [sp,#0x218]      ;// width
1420c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmp6, [sp,#0x228]       ;// horVerOffset
1430c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     mb, [sp, #0x1e8]        ;// mb
1440c1bc742181ded4930842b46e9507372f0b1b963James Dong    MLA     tmp5, width, y0, x0     ;// y0*width+x0
1450c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     ref, ref, tmp5          ;// ref += y0*width+x0
1460c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     ref, [sp, #0x1e4]       ;// store "ref" for vertical filtering
1470c1bc742181ded4930842b46e9507372f0b1b963James Dong    AND     tmp6, tmp6, #2          ;// calculate ref for horizontal filter
1480c1bc742181ded4930842b46e9507372f0b1b963James Dong    MOV     tmpa, #2
1490c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmp6, tmpa, tmp6, LSR #1
1500c1bc742181ded4930842b46e9507372f0b1b963James Dong    MLA     ref, tmp6, width, ref
1510c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     ref, ref, #8            ;// ref = ref+8
1520c1bc742181ded4930842b46e9507372f0b1b963James Dong
1530c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// pack values to count register
1540c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// [31:28] loop_x (partWidth-1)
1550c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// [27:24] loop_y (partHeight-1)
1560c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// [23:20] partWidth-1
1570c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// [19:16] partHeight-1
1580c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// [15:00] width
1590c1bc742181ded4930842b46e9507372f0b1b963James Dong    MOV     count, width
1600c1bc742181ded4930842b46e9507372f0b1b963James Dong    SUB     partW, partW, #1;
1610c1bc742181ded4930842b46e9507372f0b1b963James Dong    SUB     partH, partH, #1;
1620c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmp5, partH, partW, LSL #4
1630c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     count, count, tmp5, LSL #16
1640c1bc742181ded4930842b46e9507372f0b1b963James Dong
1650c1bc742181ded4930842b46e9507372f0b1b963James Dong
1660c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     mult_20_01, = 0x00140001    ;// constant multipliers
1670c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     mult_20_m5, = 0x0014FFFB    ;// constant multipliers
1680c1bc742181ded4930842b46e9507372f0b1b963James Dong    MOV     plus16, #16                 ;// constant for add
1690c1bc742181ded4930842b46e9507372f0b1b963James Dong    AND     tmp4, count, #0x000F0000    ;// partHeight-1
1700c1bc742181ded4930842b46e9507372f0b1b963James Dong    AND     tmp6, count, #0x00F00000    ;// partWidth-1
1710c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     count, count, tmp4, LSL #8  ;// partH-1 to lower part of top byte
1720c1bc742181ded4930842b46e9507372f0b1b963James Dong
1730c1bc742181ded4930842b46e9507372f0b1b963James Dong;// HORIZONTAL PART
1740c1bc742181ded4930842b46e9507372f0b1b963James Dong
1750c1bc742181ded4930842b46e9507372f0b1b963James Dongloop_y_hor
1760c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     x_3_1, [ref, #-8]
1770c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     count, count, tmp6, LSL #8   ;// partW-1 to upper part of top byte
1780c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     x_7_5, [ref, #-4]
1790c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  x_2_0, x_3_1
1800c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  x_3_1, x_3_1, ROR #8
1810c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  x_6_4, x_7_5
1820c1bc742181ded4930842b46e9507372f0b1b963James Dong
1830c1bc742181ded4930842b46e9507372f0b1b963James Dongloop_x_hor
1840c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  x_7_5, x_7_5, ROR #8
1850c1bc742181ded4930842b46e9507372f0b1b963James Dong
1860c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLAD   tmp4, x_2_0, mult_20_01, plus16
1870c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLATB  tmp6, x_2_0, mult_20_01, plus16
1880c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLATB  tmp5, x_2_0, mult_20_m5, plus16
1890c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLATB  tmpa, x_3_1, mult_20_01, plus16
1900c1bc742181ded4930842b46e9507372f0b1b963James Dong
1910c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLAD   tmp4, x_3_1, mult_20_m5, tmp4
1920c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLATB  tmp6, x_3_1, mult_20_m5, tmp6
1930c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLAD   tmp5, x_3_1, mult_20_01, tmp5
1940c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     x_3_1, [ref], #4
1950c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLAD   tmpa, x_6_4, mult_20_m5, tmpa
1960c1bc742181ded4930842b46e9507372f0b1b963James Dong
1970c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLABB  tmp4, x_6_4, mult_20_m5, tmp4
1980c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLADX  tmp6, x_6_4, mult_20_m5, tmp6
1990c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLADX  tmp5, x_6_4, mult_20_01, tmp5
2000c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLADX  tmpa, x_7_5, mult_20_m5, tmpa
2010c1bc742181ded4930842b46e9507372f0b1b963James Dong
2020c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLABB  tmp4, x_7_5, mult_20_01, tmp4
2030c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  x_2_0, x_3_1
2040c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLABB  tmp5, x_7_5, mult_20_m5, tmp5
2050c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLADX  tmp6, x_7_5, mult_20_01, tmp6
2060c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLABB  tmpa, x_2_0, mult_20_01, tmpa
2070c1bc742181ded4930842b46e9507372f0b1b963James Dong
2080c1bc742181ded4930842b46e9507372f0b1b963James Dong    MOV     tmp5, tmp5, ASR #5
2090c1bc742181ded4930842b46e9507372f0b1b963James Dong    MOV     tmp4, tmp4, ASR #5
2100c1bc742181ded4930842b46e9507372f0b1b963James Dong    PKHBT   tmp5, tmp5, tmpa, LSL #(16-5)
2110c1bc742181ded4930842b46e9507372f0b1b963James Dong    PKHBT   tmp4, tmp4, tmp6, LSL #(16-5)
2120c1bc742181ded4930842b46e9507372f0b1b963James Dong    USAT16  tmp5, #8, tmp5
2130c1bc742181ded4930842b46e9507372f0b1b963James Dong    USAT16  tmp4, #8, tmp4
2140c1bc742181ded4930842b46e9507372f0b1b963James Dong
2150c1bc742181ded4930842b46e9507372f0b1b963James Dong    SUBS    count, count, #4<<28
2160c1bc742181ded4930842b46e9507372f0b1b963James Dong    ORR     tmp4, tmp4, tmp5, LSL #8
2170c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     tmp4, [mb], #4
2180c1bc742181ded4930842b46e9507372f0b1b963James Dong    BCC     next_y_hor
2190c1bc742181ded4930842b46e9507372f0b1b963James Dong
2200c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  x_3_1, x_3_1, ROR #8
2210c1bc742181ded4930842b46e9507372f0b1b963James Dong
2220c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLAD   tmp4, x_6_4, mult_20_01, plus16
2230c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLATB  tmp6, x_6_4, mult_20_01, plus16
2240c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLATB  tmp5, x_6_4, mult_20_m5, plus16
2250c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLATB  tmpa, x_7_5, mult_20_01, plus16
2260c1bc742181ded4930842b46e9507372f0b1b963James Dong
2270c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLAD   tmp4, x_7_5, mult_20_m5, tmp4
2280c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLATB  tmp6, x_7_5, mult_20_m5, tmp6
2290c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLAD   tmp5, x_7_5, mult_20_01, tmp5
2300c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     x_7_5, [ref], #4
2310c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLAD   tmpa, x_2_0, mult_20_m5, tmpa
2320c1bc742181ded4930842b46e9507372f0b1b963James Dong
2330c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLABB  tmp4, x_2_0, mult_20_m5, tmp4
2340c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLADX  tmp6, x_2_0, mult_20_m5, tmp6
2350c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLADX  tmp5, x_2_0, mult_20_01, tmp5
2360c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLADX  tmpa, x_3_1, mult_20_m5, tmpa
2370c1bc742181ded4930842b46e9507372f0b1b963James Dong
2380c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLABB  tmp4, x_3_1, mult_20_01, tmp4
2390c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  x_6_4, x_7_5
2400c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLABB  tmp5, x_3_1, mult_20_m5, tmp5
2410c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLADX  tmp6, x_3_1, mult_20_01, tmp6
2420c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLABB  tmpa, x_6_4, mult_20_01, tmpa
2430c1bc742181ded4930842b46e9507372f0b1b963James Dong
2440c1bc742181ded4930842b46e9507372f0b1b963James Dong    MOV     tmp5, tmp5, ASR #5
2450c1bc742181ded4930842b46e9507372f0b1b963James Dong    MOV     tmp4, tmp4, ASR #5
2460c1bc742181ded4930842b46e9507372f0b1b963James Dong    PKHBT   tmp5, tmp5, tmpa, LSL #(16-5)
2470c1bc742181ded4930842b46e9507372f0b1b963James Dong    PKHBT   tmp4, tmp4, tmp6, LSL #(16-5)
2480c1bc742181ded4930842b46e9507372f0b1b963James Dong    USAT16  tmp5, #8, tmp5
2490c1bc742181ded4930842b46e9507372f0b1b963James Dong    USAT16  tmp4, #8, tmp4
2500c1bc742181ded4930842b46e9507372f0b1b963James Dong
2510c1bc742181ded4930842b46e9507372f0b1b963James Dong    SUBS    count, count, #4<<28
2520c1bc742181ded4930842b46e9507372f0b1b963James Dong    ORR     tmp4, tmp4, tmp5, LSL #8
2530c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     tmp4, [mb], #4
2540c1bc742181ded4930842b46e9507372f0b1b963James Dong    BCS     loop_x_hor
2550c1bc742181ded4930842b46e9507372f0b1b963James Dong
2560c1bc742181ded4930842b46e9507372f0b1b963James Dongnext_y_hor
2570c1bc742181ded4930842b46e9507372f0b1b963James Dong    AND     tmp6, count, #0x00F00000        ;// partWidth-1
2580c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLABB  ref, count, mult_20_01, ref     ;// +width
2590c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADDS    mb, mb, #16                     ;// +16, Carry=0
2600c1bc742181ded4930842b46e9507372f0b1b963James Dong    SBC     mb, mb, tmp6, LSR #20           ;// -(partWidth-1)-1
2610c1bc742181ded4930842b46e9507372f0b1b963James Dong    SBC     ref, ref, tmp6, LSR #20         ;// -(partWidth-1)-1
2620c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADDS    count, count, #(1<<28)-(1<<24)  ;// decrement counter (partW)
2630c1bc742181ded4930842b46e9507372f0b1b963James Dong    BGE     loop_y_hor
2640c1bc742181ded4930842b46e9507372f0b1b963James Dong
2650c1bc742181ded4930842b46e9507372f0b1b963James Dong
2660c1bc742181ded4930842b46e9507372f0b1b963James Dong
2670c1bc742181ded4930842b46e9507372f0b1b963James Dong;// VERTICAL PART
2680c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
2690c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Approach to vertical interpolation
2700c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
2710c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Interpolation is done by using 32-bit loads and stores
2720c1bc742181ded4930842b46e9507372f0b1b963James Dong;// and by using 16 bit arithmetic. 4x4 block is processed
2730c1bc742181ded4930842b46e9507372f0b1b963James Dong;// in each round.
2740c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
2750c1bc742181ded4930842b46e9507372f0b1b963James Dong;// |a_11|a_11|a_11|a_11|...|a_1n|a_1n|a_1n|a_1n|
2760c1bc742181ded4930842b46e9507372f0b1b963James Dong;// |b_11|b_11|b_11|b_11|...|b_1n|b_1n|b_1n|b_1n|
2770c1bc742181ded4930842b46e9507372f0b1b963James Dong;// |c_11|c_11|c_11|c_11|...|c_1n|c_1n|c_1n|c_1n|
2780c1bc742181ded4930842b46e9507372f0b1b963James Dong;// |d_11|d_11|d_11|d_11|...|d_1n|d_1n|d_1n|d_1n|
2790c1bc742181ded4930842b46e9507372f0b1b963James Dong;//           ..
2800c1bc742181ded4930842b46e9507372f0b1b963James Dong;//           ..
2810c1bc742181ded4930842b46e9507372f0b1b963James Dong;// |a_m1|a_m1|a_m1|a_m1|...
2820c1bc742181ded4930842b46e9507372f0b1b963James Dong;// |b_m1|b_m1|b_m1|b_m1|...
2830c1bc742181ded4930842b46e9507372f0b1b963James Dong;// |c_m1|c_m1|c_m1|c_m1|...
2840c1bc742181ded4930842b46e9507372f0b1b963James Dong;// |d_m1|d_m1|d_m1|d_m1|...
2850c1bc742181ded4930842b46e9507372f0b1b963James Dong
2860c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Approach to bilinear interpolation to quarter pel position.
2870c1bc742181ded4930842b46e9507372f0b1b963James Dong;// 4 bytes are processed parallel
2880c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
2890c1bc742181ded4930842b46e9507372f0b1b963James Dong;// algorithm (a+b+1)/2. Rounding upwards (+1) can be achieved by
2900c1bc742181ded4930842b46e9507372f0b1b963James Dong;// negating the second operand to get its one's complement (instead of
2910c1bc742181ded4930842b46e9507372f0b1b963James Dong;// two's complement) and using subtraction; EOR is used to correct the sign.
2920c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
2930c1bc742181ded4930842b46e9507372f0b1b963James Dong;// MVN     b, b
2940c1bc742181ded4930842b46e9507372f0b1b963James Dong;// UHSUB8  a, a, b
2950c1bc742181ded4930842b46e9507372f0b1b963James Dong;// EOR     a, a, 0x80808080
2960c1bc742181ded4930842b46e9507372f0b1b963James Dong
2970c1bc742181ded4930842b46e9507372f0b1b963James Dong
2980c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     ref, [sp, #0x1e4]           ;// ref
2990c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmpa, [sp, #0x228]          ;// horVerOffset
3000c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     mb, [sp, #0x1e8]            ;// mb
3010c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     width, [sp, #0x218]         ;// width
3020c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     ref, ref, #2                ;// calculate correct position
3030c1bc742181ded4930842b46e9507372f0b1b963James Dong    AND     tmpa, tmpa, #1
3040c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     ref, ref, tmpa
3050c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     plus16, = 0x00100010        ;// +16 to lower and upperf halfwords
3060c1bc742181ded4930842b46e9507372f0b1b963James Dong    AND     count, count, #0x00FFFFFF   ;// partWidth-1
3070c1bc742181ded4930842b46e9507372f0b1b963James Dong
3080c1bc742181ded4930842b46e9507372f0b1b963James Dong    AND     tmpa, count, #0x000F0000    ;// partHeight-1
3090c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     count, count, tmpa, LSL #8
3100c1bc742181ded4930842b46e9507372f0b1b963James Dong
3110c1bc742181ded4930842b46e9507372f0b1b963James Dongloop_y
3120c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     count, count, tmp6, LSL #8  ;// partWidth-1
3130c1bc742181ded4930842b46e9507372f0b1b963James Dong
3140c1bc742181ded4930842b46e9507372f0b1b963James Dongloop_x
3150c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmp1, [ref], width     ;// |a4|a3|a2|a1|
3160c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmp2, [ref], width     ;// |c4|c3|c2|c1|
3170c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmp3, [ref], width     ;// |g4|g3|g2|g1|
3180c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmp4, [ref], width     ;// |m4|m3|m2|m1|
3190c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmp5, [ref], width     ;// |r4|r3|r2|r1|
3200c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmp6, [ref], width     ;// |t4|t3|t2|t1|
3210c1bc742181ded4930842b46e9507372f0b1b963James Dong
3220c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// first four pixels
3230c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  tmpa, tmp3                  ;// |g3|g1|
3240c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp4            ;// |g3+m3|g1+m1|
3250c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  tmpb, tmp2                  ;// |c3|c1|
3260c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
3270c1bc742181ded4930842b46e9507372f0b1b963James Dong
3280c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpb, tmpb, tmp5            ;// |c3+r3|c1+r1|
3290c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
3300c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A
3310c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp6            ;// 16+20(G+M)+A+T
3320c1bc742181ded4930842b46e9507372f0b1b963James Dong
3330c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
3340c1bc742181ded4930842b46e9507372f0b1b963James Dong    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
3350c1bc742181ded4930842b46e9507372f0b1b963James Dong
3360c1bc742181ded4930842b46e9507372f0b1b963James Dong    USAT16  tmpb, #13, tmpa             ;// saturate
3370c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     res, = 0x00FF00FF
3380c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  tmpa, tmp3, ROR #8          ;// |g4|g2|
3390c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// |g4+m4|g2+m2|
3400c1bc742181ded4930842b46e9507372f0b1b963James Dong    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32
3410c1bc742181ded4930842b46e9507372f0b1b963James Dong
3420c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
3430c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  tmpb, tmp2, ROR #8          ;// |c4|c2|
3440c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
3450c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpb, tmpb, tmp5, ROR #8    ;// |c4+r4|c2+r2|
3460c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A
3470c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// 16+20(G+M)+A+T
3480c1bc742181ded4930842b46e9507372f0b1b963James Dong
3490c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
3500c1bc742181ded4930842b46e9507372f0b1b963James Dong    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
3510c1bc742181ded4930842b46e9507372f0b1b963James Dong
3520c1bc742181ded4930842b46e9507372f0b1b963James Dong    USAT16  tmpb, #13, tmpa             ;// saturate
3530c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmp1, [mb]
3540c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmpa, = 0xFF00FF00
3550c1bc742181ded4930842b46e9507372f0b1b963James Dong    MVN     tmp1, tmp1
3560c1bc742181ded4930842b46e9507372f0b1b963James Dong    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divede by 32
3570c1bc742181ded4930842b46e9507372f0b1b963James Dong    ORR     res, res, tmpa
3580c1bc742181ded4930842b46e9507372f0b1b963James Dong
3590c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmpa, = 0x80808080
3600c1bc742181ded4930842b46e9507372f0b1b963James Dong    UHSUB8  res, res, tmp1              ;// bilinear interpolation
3610c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmp1, [ref], width          ;// load next row
3620c1bc742181ded4930842b46e9507372f0b1b963James Dong    EOR     res, res, tmpa              ;// correct sign
3630c1bc742181ded4930842b46e9507372f0b1b963James Dong
3640c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     res, [mb], #16              ;// next row (mb)
3650c1bc742181ded4930842b46e9507372f0b1b963James Dong
3660c1bc742181ded4930842b46e9507372f0b1b963James Dong
3670c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// tmp2 = |a4|a3|a2|a1|
3680c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// tmp3 = |c4|c3|c2|c1|
3690c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// tmp4 = |g4|g3|g2|g1|
3700c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// tmp5 = |m4|m3|m2|m1|
3710c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// tmp6 = |r4|r3|r2|r1|
3720c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// tmp1 = |t4|t3|t2|t1|
3730c1bc742181ded4930842b46e9507372f0b1b963James Dong
3740c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// second four pixels
3750c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  tmpa, tmp4                  ;// |g3|g1|
3760c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp5            ;// |g3+m3|g1+m1|
3770c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  tmpb, tmp3                  ;// |c3|c1|
3780c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
3790c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpb, tmpb, tmp6            ;// |c3+r3|c1+r1|
3800c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
3810c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A
3820c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A+T
3830c1bc742181ded4930842b46e9507372f0b1b963James Dong
3840c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
3850c1bc742181ded4930842b46e9507372f0b1b963James Dong    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
3860c1bc742181ded4930842b46e9507372f0b1b963James Dong
3870c1bc742181ded4930842b46e9507372f0b1b963James Dong    USAT16  tmpb, #13, tmpa             ;// saturate
3880c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     res, = 0x00FF00FF
3890c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  tmpa, tmp4, ROR #8          ;// |g4|g2|
3900c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp5, ROR #8    ;// |g4+m4|g2+m2|
3910c1bc742181ded4930842b46e9507372f0b1b963James Dong    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32
3920c1bc742181ded4930842b46e9507372f0b1b963James Dong
3930c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
3940c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  tmpb, tmp3, ROR #8          ;// |c4|c2|
3950c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
3960c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpb, tmpb, tmp6, ROR #8    ;// |c4+r4|c2+r2|
3970c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A
3980c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A+T
3990c1bc742181ded4930842b46e9507372f0b1b963James Dong
4000c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
4010c1bc742181ded4930842b46e9507372f0b1b963James Dong    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
4020c1bc742181ded4930842b46e9507372f0b1b963James Dong
4030c1bc742181ded4930842b46e9507372f0b1b963James Dong    USAT16  tmpb, #13, tmpa             ;// saturate
4040c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmp2, [mb]
4050c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmpa, = 0xFF00FF00
4060c1bc742181ded4930842b46e9507372f0b1b963James Dong    MVN     tmp2, tmp2
4070c1bc742181ded4930842b46e9507372f0b1b963James Dong
4080c1bc742181ded4930842b46e9507372f0b1b963James Dong    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
4090c1bc742181ded4930842b46e9507372f0b1b963James Dong    ORR     res, res, tmpa
4100c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmpa, = 0x80808080
4110c1bc742181ded4930842b46e9507372f0b1b963James Dong    UHSUB8  res, res, tmp2              ;// bilinear interpolation
4120c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmp2, [ref], width          ;// load next row
4130c1bc742181ded4930842b46e9507372f0b1b963James Dong    EOR     res, res, tmpa              ;// correct sign
4140c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     res, [mb], #16              ;// next row
4150c1bc742181ded4930842b46e9507372f0b1b963James Dong
4160c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// tmp3 = |a4|a3|a2|a1|
4170c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// tmp4 = |c4|c3|c2|c1|
4180c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// tmp5 = |g4|g3|g2|g1|
4190c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// tmp6 = |m4|m3|m2|m1|
4200c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// tmp1 = |r4|r3|r2|r1|
4210c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// tmp2 = |t4|t3|t2|t1|
4220c1bc742181ded4930842b46e9507372f0b1b963James Dong
4230c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// third four pixels
4240c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  tmpa, tmp5                  ;// |g3|g1|
4250c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp6            ;// |g3+m3|g1+m1|
4260c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  tmpb, tmp4                  ;// |c3|c1|
4270c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
4280c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpb, tmpb, tmp1            ;// |c3+r3|c1+r1|
4290c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
4300c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A
4310c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A+T
4320c1bc742181ded4930842b46e9507372f0b1b963James Dong
4330c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
4340c1bc742181ded4930842b46e9507372f0b1b963James Dong    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
4350c1bc742181ded4930842b46e9507372f0b1b963James Dong
4360c1bc742181ded4930842b46e9507372f0b1b963James Dong    USAT16  tmpb, #13, tmpa             ;// saturate
4370c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     res, = 0x00FF00FF
4380c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  tmpa, tmp5, ROR #8          ;// |g4|g2|
4390c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// |g4+m4|g2+m2|
4400c1bc742181ded4930842b46e9507372f0b1b963James Dong    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32
4410c1bc742181ded4930842b46e9507372f0b1b963James Dong
4420c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
4430c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  tmpb, tmp4, ROR #8          ;// |c4|c2|
4440c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
4450c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpb, tmpb, tmp1, ROR #8    ;// |c4+r4|c2+r2|
4460c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A
4470c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A+T
4480c1bc742181ded4930842b46e9507372f0b1b963James Dong
4490c1bc742181ded4930842b46e9507372f0b1b963James Dong
4500c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
4510c1bc742181ded4930842b46e9507372f0b1b963James Dong    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
4520c1bc742181ded4930842b46e9507372f0b1b963James Dong
4530c1bc742181ded4930842b46e9507372f0b1b963James Dong    USAT16  tmpb, #13, tmpa             ;// saturate
4540c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmp3, [mb]
4550c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmpa, = 0xFF00FF00
4560c1bc742181ded4930842b46e9507372f0b1b963James Dong    MVN     tmp3, tmp3
4570c1bc742181ded4930842b46e9507372f0b1b963James Dong
4580c1bc742181ded4930842b46e9507372f0b1b963James Dong    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
4590c1bc742181ded4930842b46e9507372f0b1b963James Dong    ORR     res, res, tmpa
4600c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmpa, = 0x80808080
4610c1bc742181ded4930842b46e9507372f0b1b963James Dong    UHSUB8  res, res, tmp3              ;// bilinear interpolation
4620c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmp3, [ref]                 ;// load next row
4630c1bc742181ded4930842b46e9507372f0b1b963James Dong    EOR     res, res, tmpa              ;// correct sign
4640c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     res, [mb], #16              ;// next row
4650c1bc742181ded4930842b46e9507372f0b1b963James Dong
4660c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// tmp4 = |a4|a3|a2|a1|
4670c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// tmp5 = |c4|c3|c2|c1|
4680c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// tmp6 = |g4|g3|g2|g1|
4690c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// tmp1 = |m4|m3|m2|m1|
4700c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// tmp2 = |r4|r3|r2|r1|
4710c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// tmp3 = |t4|t3|t2|t1|
4720c1bc742181ded4930842b46e9507372f0b1b963James Dong
4730c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// fourth four pixels
4740c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  tmpa, tmp6                  ;// |g3|g1|
4750c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp1            ;// |g3+m3|g1+m1|
4760c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  tmpb, tmp5                  ;// |c3|c1|
4770c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
4780c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpb, tmpb, tmp2            ;// |c3+r3|c1+r1|
4790c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
4800c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp4            ;// 16+20(G+M)+A
4810c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A+T
4820c1bc742181ded4930842b46e9507372f0b1b963James Dong
4830c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
4840c1bc742181ded4930842b46e9507372f0b1b963James Dong    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
4850c1bc742181ded4930842b46e9507372f0b1b963James Dong
4860c1bc742181ded4930842b46e9507372f0b1b963James Dong    USAT16  tmpb, #13, tmpa             ;// saturate
4870c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     res, = 0x00FF00FF
4880c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  tmpa, tmp6, ROR #8          ;// |g4|g2|
4890c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// |g4+m4|g2+m2|
4900c1bc742181ded4930842b46e9507372f0b1b963James Dong    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32
4910c1bc742181ded4930842b46e9507372f0b1b963James Dong
4920c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
4930c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  tmpb, tmp5, ROR #8          ;// |c4|c2|
4940c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
4950c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpb, tmpb, tmp2, ROR #8    ;// |c4+r4|c2+r2|
4960c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// 16+20(G+M)+A
4970c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A+T
4980c1bc742181ded4930842b46e9507372f0b1b963James Dong
4990c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
5000c1bc742181ded4930842b46e9507372f0b1b963James Dong    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
5010c1bc742181ded4930842b46e9507372f0b1b963James Dong
5020c1bc742181ded4930842b46e9507372f0b1b963James Dong    USAT16  tmpb, #13, tmpa             ;// saturate
5030c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmp5, [mb]
5040c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmp4, = 0xFF00FF00
5050c1bc742181ded4930842b46e9507372f0b1b963James Dong    MVN     tmp5, tmp5
5060c1bc742181ded4930842b46e9507372f0b1b963James Dong
5070c1bc742181ded4930842b46e9507372f0b1b963James Dong    AND     tmpa, tmp4, tmpb, LSL #3    ;// mask and divide by 32
5080c1bc742181ded4930842b46e9507372f0b1b963James Dong    ORR     res, res, tmpa
5090c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     tmpa, = 0x80808080
5100c1bc742181ded4930842b46e9507372f0b1b963James Dong    UHSUB8  res, res, tmp5              ;// bilinear interpolation
5110c1bc742181ded4930842b46e9507372f0b1b963James Dong
5120c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// decrement loop_x counter
5130c1bc742181ded4930842b46e9507372f0b1b963James Dong    SUBS    count, count, #4<<28        ;// decrement x loop counter
5140c1bc742181ded4930842b46e9507372f0b1b963James Dong
5150c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// calculate "ref" address for next round
5160c1bc742181ded4930842b46e9507372f0b1b963James Dong    SUB     ref, ref, width, LSL #3     ;// ref -= 8*width;
5170c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     ref, ref, #4                ;// next column (4 pixels)
5180c1bc742181ded4930842b46e9507372f0b1b963James Dong
5190c1bc742181ded4930842b46e9507372f0b1b963James Dong    EOR     res, res, tmpa              ;// correct sign
5200c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     res, [mb], #-44
5210c1bc742181ded4930842b46e9507372f0b1b963James Dong
5220c1bc742181ded4930842b46e9507372f0b1b963James Dong    BCS     loop_x
5230c1bc742181ded4930842b46e9507372f0b1b963James Dong
5240c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADDS    mb, mb, #64                 ;// set Carry=0
5250c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     ref, ref, width, LSL #2     ;// ref += 4*width
5260c1bc742181ded4930842b46e9507372f0b1b963James Dong    AND     tmp6, count, #0x00F00000    ;// partWidth-1
5270c1bc742181ded4930842b46e9507372f0b1b963James Dong    SBC     ref, ref, tmp6, LSR #20     ;// -(partWidth-1)-1
5280c1bc742181ded4930842b46e9507372f0b1b963James Dong    SBC     mb, mb, tmp6, LSR #20       ;// -(partWidth-1)-1
5290c1bc742181ded4930842b46e9507372f0b1b963James Dong
5300c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADDS    count, count, #0xC << 24    ;// decrement y loop counter
5310c1bc742181ded4930842b46e9507372f0b1b963James Dong    BGE     loop_y
5320c1bc742181ded4930842b46e9507372f0b1b963James Dong
5330c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     sp, sp, #0x1f4
5340c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDMFD   sp!, {r4-r11, pc}
5350c1bc742181ded4930842b46e9507372f0b1b963James Dong
5360c1bc742181ded4930842b46e9507372f0b1b963James Dong    END
537