10c1bc742181ded4930842b46e9507372f0b1b963James Dong; Copyright (C) 2009 The Android Open Source Project
20c1bc742181ded4930842b46e9507372f0b1b963James Dong;
30c1bc742181ded4930842b46e9507372f0b1b963James Dong; Licensed under the Apache License, Version 2.0 (the "License");
40c1bc742181ded4930842b46e9507372f0b1b963James Dong; you may not use this file except in compliance with the License.
50c1bc742181ded4930842b46e9507372f0b1b963James Dong; You may obtain a copy of the License at
60c1bc742181ded4930842b46e9507372f0b1b963James Dong;
70c1bc742181ded4930842b46e9507372f0b1b963James Dong;      http://www.apache.org/licenses/LICENSE-2.0
80c1bc742181ded4930842b46e9507372f0b1b963James Dong;
90c1bc742181ded4930842b46e9507372f0b1b963James Dong; Unless required by applicable law or agreed to in writing, software
100c1bc742181ded4930842b46e9507372f0b1b963James Dong; distributed under the License is distributed on an "AS IS" BASIS,
110c1bc742181ded4930842b46e9507372f0b1b963James Dong; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
120c1bc742181ded4930842b46e9507372f0b1b963James Dong; See the License for the specific language governing permissions and
130c1bc742181ded4930842b46e9507372f0b1b963James Dong; limitations under the License.
140c1bc742181ded4930842b46e9507372f0b1b963James Dong
150c1bc742181ded4930842b46e9507372f0b1b963James Dong;-------------------------------------------------------------------------------
160c1bc742181ded4930842b46e9507372f0b1b963James Dong;--
170c1bc742181ded4930842b46e9507372f0b1b963James Dong;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorHalf function
180c1bc742181ded4930842b46e9507372f0b1b963James Dong;--
190c1bc742181ded4930842b46e9507372f0b1b963James Dong;-------------------------------------------------------------------------------
200c1bc742181ded4930842b46e9507372f0b1b963James Dong
210c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// Build attributes and output section.
;// When H264DEC_WINASM is defined the 8-byte stack alignment attributes are
;// left out; otherwise they are declared for this object.
220c1bc742181ded4930842b46e9507372f0b1b963James Dong    IF :DEF: H264DEC_WINASM
230c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// REQUIRE8 and PRESERVE8 are not used for winasm
240c1bc742181ded4930842b46e9507372f0b1b963James Dong    ELSE
250c1bc742181ded4930842b46e9507372f0b1b963James Dong        REQUIRE8                ;// code requires an 8-byte aligned stack
260c1bc742181ded4930842b46e9507372f0b1b963James Dong        PRESERVE8               ;// code keeps the stack 8-byte aligned
270c1bc742181ded4930842b46e9507372f0b1b963James Dong    ENDIF
280c1bc742181ded4930842b46e9507372f0b1b963James Dong
290c1bc742181ded4930842b46e9507372f0b1b963James Dong    AREA    |.text|, CODE       ;// everything below is placed in the code section
300c1bc742181ded4930842b46e9507372f0b1b963James Dong
310c1bc742181ded4930842b46e9507372f0b1b963James Dong;// h264bsdInterpolateHorHalf register allocation
;//
;// Most registers carry two names. The first name is used while the arguments
;// are range-checked and the loop counter is packed; the second name is used
;// inside the filter loops, after the first value is no longer needed.
;// The x_A_B names hold two source bytes unpacked into two halfwords
;// (byte A in the top halfword, byte B in the bottom halfword).
320c1bc742181ded4930842b46e9507372f0b1b963James Dong
330c1bc742181ded4930842b46e9507372f0b1b963James Dongref     RN 0            ;// source pointer, post-incremented while filtering
340c1bc742181ded4930842b46e9507372f0b1b963James Dong
350c1bc742181ded4930842b46e9507372f0b1b963James Dongmb      RN 1            ;// destination pointer, post-incremented by each store
360c1bc742181ded4930842b46e9507372f0b1b963James Dongbuff    RN 1            ;// same register as the 2nd argument of h264bsdFillBlock
370c1bc742181ded4930842b46e9507372f0b1b963James Dong
380c1bc742181ded4930842b46e9507372f0b1b963James Dongcount   RN 2            ;// packed loop counters + width (layout described in the function)
390c1bc742181ded4930842b46e9507372f0b1b963James Dongx0      RN 2            ;// incoming x0 argument
400c1bc742181ded4930842b46e9507372f0b1b963James Dong
410c1bc742181ded4930842b46e9507372f0b1b963James Dongy0      RN 3            ;// incoming y0 argument
420c1bc742181ded4930842b46e9507372f0b1b963James Dongx_2_0   RN 3            ;// source bytes 2 and 0 of the current group
430c1bc742181ded4930842b46e9507372f0b1b963James Dong
440c1bc742181ded4930842b46e9507372f0b1b963James Dongwidth   RN 4            ;// width argument (loaded from the stack)
450c1bc742181ded4930842b46e9507372f0b1b963James Dongx_3_1   RN 4            ;// source bytes 3 and 1; also holds the raw loaded word before unpacking
460c1bc742181ded4930842b46e9507372f0b1b963James Dong
470c1bc742181ded4930842b46e9507372f0b1b963James Dongheight  RN 5            ;// height argument (loaded from the stack)
480c1bc742181ded4930842b46e9507372f0b1b963James Dongx_6_4   RN 5            ;// source bytes 6 and 4
490c1bc742181ded4930842b46e9507372f0b1b963James Dong
500c1bc742181ded4930842b46e9507372f0b1b963James DongpartW   RN 6            ;// partWidth argument (loaded from the stack)
510c1bc742181ded4930842b46e9507372f0b1b963James Dongx_7_5   RN 6            ;// source bytes 7 and 5; also holds the raw loaded word before unpacking
520c1bc742181ded4930842b46e9507372f0b1b963James Dong
530c1bc742181ded4930842b46e9507372f0b1b963James DongpartH   RN 7            ;// partHeight argument (loaded from the stack)
540c1bc742181ded4930842b46e9507372f0b1b963James Dongtmp1    RN 7            ;// accumulator / scratch
550c1bc742181ded4930842b46e9507372f0b1b963James Dong
560c1bc742181ded4930842b46e9507372f0b1b963James Dongtmp2    RN 8            ;// accumulator / scratch
570c1bc742181ded4930842b46e9507372f0b1b963James Dong
580c1bc742181ded4930842b46e9507372f0b1b963James Dongtmp3    RN 9            ;// accumulator / scratch
590c1bc742181ded4930842b46e9507372f0b1b963James Dong
600c1bc742181ded4930842b46e9507372f0b1b963James Dongtmp4    RN 10           ;// accumulator / scratch; carries partWidth+5 across the fill call
610c1bc742181ded4930842b46e9507372f0b1b963James Dong
620c1bc742181ded4930842b46e9507372f0b1b963James Dongmult_20_01  RN 11       ;// packed filter coefficients: top halfword 20, bottom halfword 1
630c1bc742181ded4930842b46e9507372f0b1b963James Dongmult_20_m5  RN 12       ;// packed filter coefficients: top halfword 20, bottom halfword -5
640c1bc742181ded4930842b46e9507372f0b1b963James Dong
650c1bc742181ded4930842b46e9507372f0b1b963James Dongplus16  RN 14           ;// rounding constant 16; r14 is lr, which is free because the
                        ;// return address is saved on the stack at function entry
660c1bc742181ded4930842b46e9507372f0b1b963James Dong
670c1bc742181ded4930842b46e9507372f0b1b963James Dong
680c1bc742181ded4930842b46e9507372f0b1b963James Dong;// function exports and imports
690c1bc742181ded4930842b46e9507372f0b1b963James Dong
700c1bc742181ded4930842b46e9507372f0b1b963James Dong    IMPORT  h264bsdFillBlock
710c1bc742181ded4930842b46e9507372f0b1b963James Dong
720c1bc742181ded4930842b46e9507372f0b1b963James Dong    EXPORT  h264bsdInterpolateHorHalf
730c1bc742181ded4930842b46e9507372f0b1b963James Dong
740c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Horizontal filter approach
750c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
760c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Basic idea in horizontal filtering is to adjust coefficients
770c1bc742181ded4930842b46e9507372f0b1b963James Dong;// like below. Calculation is done with 16-bit maths.
780c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
790c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Reg     x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
800c1bc742181ded4930842b46e9507372f0b1b963James Dong;//       [  2  0 ] [  3  1 ] [  6  4 ] [  7  5 ] [ 10  8 ] ...
810c1bc742181ded4930842b46e9507372f0b1b963James Dong;// y_0 =   20  1     20 -5        -5         1
820c1bc742181ded4930842b46e9507372f0b1b963James Dong;// y_1 =   -5        20  1      1 20        -5
830c1bc742181ded4930842b46e9507372f0b1b963James Dong;// y_2 =    1        -5        -5 20      1 20
840c1bc742181ded4930842b46e9507372f0b1b963James Dong;// y_3 =              1        20 -5     -5 20         1
850c1bc742181ded4930842b46e9507372f0b1b963James Dong
860c1bc742181ded4930842b46e9507372f0b1b963James Dong
;//-----------------------------------------------------------------------------
;// h264bsdInterpolateHorHalf
;//
;// Horizontal half-sample interpolation. Each output byte is the 6-tap filter
;// (1, -5, 20, 20, -5, 1) applied to six consecutive source bytes, plus 16,
;// arithmetically shifted right by 5 and saturated to the range 0..255.
;//
;// In (names follow the RN aliases and the stack-slot comments):
;//   r0 = ref, r1 = mb, r2 = x0, r3 = y0
;//   caller stack, in order: width, height, partWidth, partHeight
;// Out:
;//   partHeight rows are stored through mb, 4 bytes per store, with a
;//   16-byte step from one output row to the next.
;// Clobbers: r0-r3, r12, flags. r4-r11 are restored on return.
;//
;// If the source window (partWidth+5 columns, partHeight rows, starting at
;// x0,y0) does not lie inside width x height, h264bsdFillBlock is called first
;// and filtering then reads from the scratch block in this frame instead.
;//
;// Frame after the prologue (byte offsets from sp):
;//   0x000-0x013  outgoing stack arguments for h264bsdFillBlock
;//   0x028-0x1e3  scratch block passed to h264bsdFillBlock (0x1bc bytes)
;//   0x1e4-0x1f3  saved r0-r3  (ref, mb, x0, y0)
;//   0x1f4-0x217  saved r4-r11, lr
;//   0x218-0x227  caller's width, height, partWidth, partHeight
;//
;// NOTE(review): the 4-bit counter fields limit partWidth and partHeight to 16,
;// and the 4-pixels-per-step loop presumably expects partWidth to be a multiple
;// of 4 - confirm against the callers.
;//-----------------------------------------------------------------------------
870c1bc742181ded4930842b46e9507372f0b1b963James Dongh264bsdInterpolateHorHalf
880c1bc742181ded4930842b46e9507372f0b1b963James Dong    STMFD   sp!, {r0-r11, lr}       ;// 13 words; r0-r3 are saved so they can be reloaded / rewritten below
890c1bc742181ded4930842b46e9507372f0b1b963James Dong    SUB     sp, sp, #0x1e4          ;// locals; caller's stack arguments are now at sp+0x218
900c1bc742181ded4930842b46e9507372f0b1b963James Dong
    ;// Horizontal range check: the filter reads partWidth+5 source columns.
910c1bc742181ded4930842b46e9507372f0b1b963James Dong    CMP     x0, #0
920c1bc742181ded4930842b46e9507372f0b1b963James Dong    BLT     do_fill                 ;// (x0 < 0)
930c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     partW, [sp,#0x220]      ;// partWidth
940c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmp4, x0, partW         ;// (x0+partWidth)
950c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmp4, tmp4, #5          ;// (x0+partWidth+5)
960c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     width, [sp,#0x218]      ;// width
970c1bc742181ded4930842b46e9507372f0b1b963James Dong    CMP     tmp4, width
980c1bc742181ded4930842b46e9507372f0b1b963James Dong    BHI     do_fill                 ;// (x0+partWidth+5) > width (unsigned compare)
990c1bc742181ded4930842b46e9507372f0b1b963James Dong
    ;// Vertical range check: only partHeight rows are read.
1000c1bc742181ded4930842b46e9507372f0b1b963James Dong    CMP     y0, #0
1010c1bc742181ded4930842b46e9507372f0b1b963James Dong    BLT     do_fill                 ;// (y0 < 0)
1020c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     partH, [sp,#0x224]      ;// partHeight
1030c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmp2, y0, partH         ;// (y0+partHeight)
1040c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     height, [sp,#0x21c]     ;// height
1050c1bc742181ded4930842b46e9507372f0b1b963James Dong    CMP     tmp2, height
1060c1bc742181ded4930842b46e9507372f0b1b963James Dong    BLS     skip_fill               ;// (y0+partHeight) <= height: no overfill needed
1070c1bc742181ded4930842b46e9507372f0b1b963James Dong
1080c1bc742181ded4930842b46e9507372f0b1b963James Dong
    ;// Window is not fully inside the picture. Call
    ;//   h264bsdFillBlock(ref, buff, x0, y0, width, height,
    ;//                    partWidth+5, partHeight, partWidth+5)
    ;// r0, r2 and r3 still hold their entry values here; all stack-passed
    ;// values are reloaded because this label is reached from several branches.
1090c1bc742181ded4930842b46e9507372f0b1b963James Dongdo_fill
1100c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     partH, [sp,#0x224]      ;// partHeight
1110c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     height, [sp,#0x21c]     ;// height
1120c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     partW, [sp,#0x220]      ;// partWidth
1130c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmp4, partW, #5         ;// tmp4 = partW + 5;
1140c1bc742181ded4930842b46e9507372f0b1b963James Dong    STMIB   sp, {height, tmp4}      ;// sp+4 = height, sp+8 = partWidth+5
1150c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     partH, [sp,#0xc]        ;// sp+c = partHeight
1160c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     tmp4, [sp,#0x10]        ;// sp+10 = partWidth+5
1170c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     width, [sp,#0x218]      ;// width
1180c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     width, [sp,#0]          ;// sp+0 = width
1190c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     buff, sp, #0x28         ;// buff = p1[21*21/4+1]
1200c1bc742181ded4930842b46e9507372f0b1b963James Dong    BL      h264bsdFillBlock
1210c1bc742181ded4930842b46e9507372f0b1b963James Dong
    ;// From here on the scratch block is the source: origin (0,0), row
    ;// length partWidth+5. partW (r6), partH (r7) and tmp4 (r10) are used
    ;// below without being reloaded, so this relies on the callee preserving
    ;// them (r4-r11 are callee-saved under the AAPCS).
1220c1bc742181ded4930842b46e9507372f0b1b963James Dong    MOV     x0, #0
1230c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     x0,[sp,#0x1ec]          ;// x0 = 0
1240c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     x0,[sp,#0x1f0]          ;// y0 = 0
1250c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     ref,sp,#0x28            ;// ref = p1
1260c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     tmp4, [sp,#0x218]       ;// width = partWidth+5
1270c1bc742181ded4930842b46e9507372f0b1b963James Dong
1280c1bc742181ded4930842b46e9507372f0b1b963James Dong
    ;// Common path: x0, y0 and width come from their frame slots, which hold
    ;// either the caller's values or the ones rewritten after the fill.
1290c1bc742181ded4930842b46e9507372f0b1b963James Dongskip_fill
1300c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     x0 ,[sp,#0x1ec]         ;// x0
1310c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     y0 ,[sp,#0x1f0]         ;// y0
1320c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     width, [sp,#0x218]      ;// width
1330c1bc742181ded4930842b46e9507372f0b1b963James Dong    MLA     tmp2, width, y0, x0     ;// y0*width+x0
1340c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     ref, ref, tmp2          ;// ref += y0*width+x0
1350c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     ref, ref, #8            ;// ref = ref+8 (the first 8 bytes of a row are read at ref-8 / ref-4)
1360c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     mb, [sp, #0x1e8]        ;// mb
1370c1bc742181ded4930842b46e9507372f0b1b963James Dong
1380c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// pack values to count register
1390c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// [31:28] loop_x (partWidth-1)
1400c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// [27:24] loop_y (partHeight-1)
1410c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// [23:20] partWidth-1
1420c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// [19:16] partHeight-1
1430c1bc742181ded4930842b46e9507372f0b1b963James Dong    ;// [15:00] width
1440c1bc742181ded4930842b46e9507372f0b1b963James Dong    MOV     count, width
1450c1bc742181ded4930842b46e9507372f0b1b963James Dong    SUB     partW, partW, #1;// partWidth-1
1460c1bc742181ded4930842b46e9507372f0b1b963James Dong    SUB     partH, partH, #1;// partHeight-1
1470c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     tmp2, partH, partW, LSL #4      ;// ((partWidth-1) << 4) + (partHeight-1)
1480c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     count, count, tmp2, LSL #16     ;// fills bits [23:16] above width
1490c1bc742181ded4930842b46e9507372f0b1b963James Dong
1500c1bc742181ded4930842b46e9507372f0b1b963James Dong
1510c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     mult_20_01, = 0x00140001        ;// halfwords (20, 1)
1520c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     mult_20_m5, = 0x0014FFFB        ;// halfwords (20, -5)
1530c1bc742181ded4930842b46e9507372f0b1b963James Dong    MOV     plus16, #16                     ;// rounding term added before the >>5
1540c1bc742181ded4930842b46e9507372f0b1b963James Dong    AND     tmp1, count, #0x000F0000    ;// partHeight-1
1550c1bc742181ded4930842b46e9507372f0b1b963James Dong    AND     tmp3, count, #0x00F00000    ;// partWidth-1
1560c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     count, count, tmp1, LSL #8  ;// loop_y field [27:24] = partHeight-1
    ;// Row start: tmp3 holds (partWidth-1) << 20 on entry. Load the first
    ;// eight source bytes of the row and unpack all but x_7_5, which is
    ;// unpacked at the top of loop_x.
1570c1bc742181ded4930842b46e9507372f0b1b963James Dongloop_y
1580c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     x_3_1, [ref, #-8]           ;// raw source bytes 0..3
1590c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     count, count, tmp3, LSL #8  ;// loop_x field [31:28] = partWidth-1
1600c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     x_7_5, [ref, #-4]           ;// raw source bytes 4..7
1610c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  x_2_0, x_3_1                ;// bytes 2 and 0
1620c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  x_3_1, x_3_1, ROR #8        ;// bytes 3 and 1
1630c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  x_6_4, x_7_5                ;// bytes 6 and 4
1640c1bc742181ded4930842b46e9507372f0b1b963James Dong
    ;// Each pass through loop_x produces up to 8 output bytes in two halves
    ;// of 4. tmp1/tmp2/tmp3/tmp4 accumulate outputs y_0/y_1/y_2/y_3 of the
    ;// coefficient table in the "Horizontal filter approach" comment.
    ;// On entry x_7_5 still holds a raw, not yet unpacked word.
1650c1bc742181ded4930842b46e9507372f0b1b963James Dongloop_x
1660c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  x_7_5, x_7_5, ROR #8        ;// bytes 7 and 5
1670c1bc742181ded4930842b46e9507372f0b1b963James Dong
    ;// start each accumulator from the rounding constant
1680c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLAD   tmp1, x_2_0, mult_20_01, plus16
1690c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLATB  tmp3, x_2_0, mult_20_01, plus16
1700c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLATB  tmp2, x_2_0, mult_20_m5, plus16
1710c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLATB  tmp4, x_3_1, mult_20_01, plus16
1720c1bc742181ded4930842b46e9507372f0b1b963James Dong
1730c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLAD   tmp1, x_3_1, mult_20_m5, tmp1
1740c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLATB  tmp3, x_3_1, mult_20_m5, tmp3
1750c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLAD   tmp2, x_3_1, mult_20_01, tmp2
1760c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     x_3_1, [ref], #4            ;// last use of old x_3_1 was above: load the next 4 raw source bytes
1770c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLAD   tmp4, x_6_4, mult_20_m5, tmp4
1780c1bc742181ded4930842b46e9507372f0b1b963James Dong
1790c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLABB  tmp1, x_6_4, mult_20_m5, tmp1
1800c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLADX  tmp3, x_6_4, mult_20_m5, tmp3
1810c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLADX  tmp2, x_6_4, mult_20_01, tmp2
1820c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLADX  tmp4, x_7_5, mult_20_m5, tmp4
1830c1bc742181ded4930842b46e9507372f0b1b963James Dong
1840c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLABB  tmp1, x_7_5, mult_20_01, tmp1
1850c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  x_2_0, x_3_1                ;// even bytes of the newly loaded word
1860c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLABB  tmp2, x_7_5, mult_20_m5, tmp2
1870c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLADX  tmp3, x_7_5, mult_20_01, tmp3
1880c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLABB  tmp4, x_2_0, mult_20_01, tmp4
1890c1bc742181ded4930842b46e9507372f0b1b963James Dong
    ;// >>5, pair the results as halfwords (tmp1 = y_2:y_0, tmp2 = y_3:y_1),
    ;// saturate each halfword to 8 bits, then interleave into one word
1900c1bc742181ded4930842b46e9507372f0b1b963James Dong    MOV     tmp2, tmp2, ASR #5
1910c1bc742181ded4930842b46e9507372f0b1b963James Dong    MOV     tmp1, tmp1, ASR #5
1920c1bc742181ded4930842b46e9507372f0b1b963James Dong    PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
1930c1bc742181ded4930842b46e9507372f0b1b963James Dong    PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
1940c1bc742181ded4930842b46e9507372f0b1b963James Dong    USAT16  tmp2, #8, tmp2
1950c1bc742181ded4930842b46e9507372f0b1b963James Dong    USAT16  tmp1, #8, tmp1
1960c1bc742181ded4930842b46e9507372f0b1b963James Dong
1970c1bc742181ded4930842b46e9507372f0b1b963James Dong    SUBS    count, count, #4<<28        ;// loop_x -= 4; carry clear when the field borrows
1980c1bc742181ded4930842b46e9507372f0b1b963James Dong    ORR     tmp1, tmp1, tmp2, LSL #8    ;// byte order in the word: y_0, y_1, y_2, y_3
1990c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     tmp1, [mb], #4              ;// store 4 output bytes
2000c1bc742181ded4930842b46e9507372f0b1b963James Dong    BCC     next_y                      ;// row finished after the first half
2010c1bc742181ded4930842b46e9507372f0b1b963James Dong
    ;// Second half: same computation with the register pairs swapped
    ;// (x_6_4/x_7_5 are now the oldest bytes, x_2_0/x_3_1 the newer ones).
2020c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  x_3_1, x_3_1, ROR #8        ;// odd bytes of the word loaded in the first half
2030c1bc742181ded4930842b46e9507372f0b1b963James Dong
2040c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLAD   tmp1, x_6_4, mult_20_01, plus16
2050c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLATB  tmp3, x_6_4, mult_20_01, plus16
2060c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLATB  tmp2, x_6_4, mult_20_m5, plus16
2070c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLATB  tmp4, x_7_5, mult_20_01, plus16
2080c1bc742181ded4930842b46e9507372f0b1b963James Dong
2090c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLAD   tmp1, x_7_5, mult_20_m5, tmp1
2100c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLATB  tmp3, x_7_5, mult_20_m5, tmp3
2110c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLAD   tmp2, x_7_5, mult_20_01, tmp2
2120c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDR     x_7_5, [ref], #4            ;// next 4 raw source bytes; unpacked below and at loop_x
2130c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLAD   tmp4, x_2_0, mult_20_m5, tmp4
2140c1bc742181ded4930842b46e9507372f0b1b963James Dong
2150c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLABB  tmp1, x_2_0, mult_20_m5, tmp1
2160c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLADX  tmp3, x_2_0, mult_20_m5, tmp3
2170c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLADX  tmp2, x_2_0, mult_20_01, tmp2
2180c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLADX  tmp4, x_3_1, mult_20_m5, tmp4
2190c1bc742181ded4930842b46e9507372f0b1b963James Dong
2200c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLABB  tmp1, x_3_1, mult_20_01, tmp1
2210c1bc742181ded4930842b46e9507372f0b1b963James Dong    UXTB16  x_6_4, x_7_5                ;// even bytes of the newly loaded word
2220c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLABB  tmp2, x_3_1, mult_20_m5, tmp2
2230c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLADX  tmp3, x_3_1, mult_20_01, tmp3
2240c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLABB  tmp4, x_6_4, mult_20_01, tmp4
2250c1bc742181ded4930842b46e9507372f0b1b963James Dong
2260c1bc742181ded4930842b46e9507372f0b1b963James Dong    MOV     tmp2, tmp2, ASR #5
2270c1bc742181ded4930842b46e9507372f0b1b963James Dong    MOV     tmp1, tmp1, ASR #5
2280c1bc742181ded4930842b46e9507372f0b1b963James Dong    PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
2290c1bc742181ded4930842b46e9507372f0b1b963James Dong    PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
2300c1bc742181ded4930842b46e9507372f0b1b963James Dong    USAT16  tmp2, #8, tmp2
2310c1bc742181ded4930842b46e9507372f0b1b963James Dong    USAT16  tmp1, #8, tmp1
2320c1bc742181ded4930842b46e9507372f0b1b963James Dong
2330c1bc742181ded4930842b46e9507372f0b1b963James Dong    SUBS    count, count, #4<<28        ;// loop_x -= 4; carry set while the field has not borrowed
2340c1bc742181ded4930842b46e9507372f0b1b963James Dong    ORR     tmp1, tmp1, tmp2, LSL #8
2350c1bc742181ded4930842b46e9507372f0b1b963James Dong    STR     tmp1, [mb], #4              ;// store 4 output bytes
2360c1bc742181ded4930842b46e9507372f0b1b963James Dong    BCS     loop_x                      ;// more pixels left in this row
2370c1bc742181ded4930842b46e9507372f0b1b963James Dong
    ;// End of row. ref and mb have each advanced by partWidth; step ref to
    ;// the same column of the next source row (net +width) and mb to the
    ;// next output row (net +16). Both SBCs rely on the carry cleared by
    ;// the ADDS; SBC itself leaves the flags untouched.
2380c1bc742181ded4930842b46e9507372f0b1b963James Dongnext_y
2390c1bc742181ded4930842b46e9507372f0b1b963James Dong    AND     tmp3, count, #0x00F00000    ;// partWidth-1
2400c1bc742181ded4930842b46e9507372f0b1b963James Dong    SMLABB  ref, count, mult_20_01, ref ;// +width
2410c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADDS    mb, mb, #16                 ;// +16, Carry=0
2420c1bc742181ded4930842b46e9507372f0b1b963James Dong    SBC     mb, mb, tmp3, LSR #20       ;// -(partWidth-1)-1
2430c1bc742181ded4930842b46e9507372f0b1b963James Dong    SBC     ref, ref, tmp3, LSR #20     ;// -(partWidth-1)-1
2440c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADDS    count, count, #(1<<28)-(1<<24)  ;// loop_x field 0xF wraps to 0, loop_y -= 1
2450c1bc742181ded4930842b46e9507372f0b1b963James Dong    BGE     loop_y                      ;// result turns negative once loop_y drops below 0
2460c1bc742181ded4930842b46e9507372f0b1b963James Dong
2470c1bc742181ded4930842b46e9507372f0b1b963James Dong    ADD     sp,sp,#0x1f4                ;// drop locals (0x1e4) and the saved r0-r3 (0x10)
2480c1bc742181ded4930842b46e9507372f0b1b963James Dong    LDMFD   sp!, {r4-r11, pc}           ;// restore callee-saved registers and return via the saved lr
2490c1bc742181ded4930842b46e9507372f0b1b963James Dong
2500c1bc742181ded4930842b46e9507372f0b1b963James Dong    END
2510c1bc742181ded4930842b46e9507372f0b1b963James Dong
252