; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateVerQuarter function
;--
;-------------------------------------------------------------------------------

210c1bc742181ded4930842b46e9507372f0b1b963James Dong    IF :DEF: H264DEC_WINASM
220c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// We dont use REQUIRE8 and PRESERVE8 for winasm
230c1bc742181ded4930842b46e9507372f0b1b963James Dong    ELSE
240c1bc742181ded4930842b46e9507372f0b1b963James Dong        REQUIRE8
250c1bc742181ded4930842b46e9507372f0b1b963James Dong        PRESERVE8
260c1bc742181ded4930842b46e9507372f0b1b963James Dong    ENDIF
270c1bc742181ded4930842b46e9507372f0b1b963James Dong
280c1bc742181ded4930842b46e9507372f0b1b963James Dong    AREA    |.text|, CODE
290c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// h264bsdInterpolateVerQuarter register allocation
;//
;// Several aliases intentionally share one physical register. The argument
;// names (buff, x0, y0, height, partW, partH) are only live during the
;// bounds check / fill set-up at the top of the function; once the pixel
;// loops start the same registers are reused under their loop names.
;// In particular partW/partH (r6/r7) are overwritten by tmp3/tmp4 inside
;// loop_x, which is why both sizes are packed into "count" beforehand.

ref     RN 0            ;// source pointer (picture, or fill buffer after do_fill)

mb      RN 1            ;// destination pointer used by the stores in loop_x
buff    RN 1            ;// 2nd argument passed to h264bsdFillBlock

count   RN 2            ;// packed loop state, see skip_fill
x0      RN 2            ;// incoming x coordinate (3rd argument)

res     RN 3            ;// packed 4-pixel result word
y0      RN 3            ;// incoming y coordinate (4th argument)

tmp1    RN 4            ;// source row register (rotates through tmp1..tmp6)

tmp2    RN 5            ;// source row register
height  RN 5            ;// picture height during set-up

tmp3    RN 6            ;// source row register
partW   RN 6            ;// partWidth during set-up

tmp4    RN 7            ;// source row register
partH   RN 7            ;// partHeight during set-up

tmp5    RN 8            ;// source row register / set-up scratch
tmp6    RN 9            ;// source row register / set-up scratch

tmpa    RN 10           ;// filter accumulator and constant scratch
tmpb    RN 11           ;// filter accumulator scratch
width   RN 12           ;// source row stride in bytes

plus16  RN 14           ;// lr reused for the constant 0x00100010 (lr is saved on entry)


;// function exports and imports

    IMPORT  h264bsdFillBlock                ;// called from do_fill

    EXPORT  h264bsdInterpolateVerQuarter

;// Approach to vertical interpolation
;//
;// Interpolation is done using 32-bit loads and stores
;// and 16-bit arithmetic. A 4x4 block is processed
;// in each round.
;//
;// |a_11|a_11|a_11|a_11|...|a_1n|a_1n|a_1n|a_1n|
;// |b_11|b_11|b_11|b_11|...|b_1n|b_1n|b_1n|b_1n|
;// |c_11|c_11|c_11|c_11|...|c_1n|c_1n|c_1n|c_1n|
;// |d_11|d_11|d_11|d_11|...|d_1n|d_1n|d_1n|d_1n|
;//           ..
;//           ..
;// |a_m1|a_m1|a_m1|a_m1|...
;// |b_m1|b_m1|b_m1|b_m1|...
;// |c_m1|c_m1|c_m1|c_m1|...
;// |d_m1|d_m1|d_m1|d_m1|...

;//-----------------------------------------------------------------------------
;// h264bsdInterpolateVerQuarter
;//
;// For every output pixel the routine evaluates the vertical 6-tap filter
;//     v = (16 + 20*(G+M) + (A+T) - 5*(C+R)) >> 5, clamped to 0..255
;// over six vertically adjacent source pixels A,C,G,M,R,T and then stores
;// the rounded average of v with G (verOffset bit0 == 0) or M (bit0 == 1).
;// Output rows are written 16 bytes apart, four pixels (one word) per store.
;//
;// Arguments on entry:
;//     r0        ref         source pointer
;//     r1        mb          destination pointer
;//     r2        x0          block x position in the source
;//     r3        y0          block y position in the source
;//     [sp,#0x00] width      source row stride
;//     [sp,#0x04] height
;//     [sp,#0x08] partWidth
;//     [sp,#0x0c] partHeight
;//     [sp,#0x10] verOffset
;//
;// Stack frame after the prologue (13 pushed words + 0x1e4 bytes of locals):
;//     sp+0x000..0x010   outgoing stack arguments for h264bsdFillBlock
;//     sp+0x028..0x1e3   fill buffer "p1" (444 bytes)
;//     sp+0x1e4          saved r0 (ref)
;//     sp+0x1e8          saved r1 (mb)
;//     sp+0x1ec          saved r2 (x0)
;//     sp+0x1f0          saved r3 (y0)
;//     sp+0x1f4..0x210   saved r4-r11
;//     sp+0x214          saved lr
;//     sp+0x218..0x228   incoming stack arguments (width .. verOffset)
;//
;// Registers: r4-r11 are restored on exit; r0-r3, r12, lr and the flags are
;// not preserved. No return value is produced.
;//
;// NOTE(review): the loops step 4 pixels / 4 rows at a time and pack
;// partWidth-1 and partHeight-1 into single bytes, so partWidth and
;// partHeight are presumably non-zero multiples of 4 that fit the fill
;// buffer, and verOffset is presumably 0 or 1 - confirm against callers.
;// NOTE(review): source rows are fetched with word LDRs from ref + x0, which
;// need not be word aligned; this presumably relies on unaligned access
;// being enabled on the target - confirm.
;//-----------------------------------------------------------------------------
h264bsdInterpolateVerQuarter
    STMFD   sp!, {r0-r11, lr}       ;// r0-r3 are saved too so they can be reloaded later
    SUB     sp, sp, #0x1e4          ;// locals: fill-call arguments + fill buffer

    ;// Use the picture directly only if the block (plus the 5 extra rows the
    ;// 6-tap filter reads) lies completely inside it; otherwise build a
    ;// padded copy with h264bsdFillBlock.
    CMP     x0, #0
    BLT     do_fill                 ;// (x0 < 0)
    LDR     partW, [sp,#0x220]      ;// partWidth
    ADD     tmp5, x0, partW         ;// (x0+partWidth)
    LDR     width, [sp,#0x218]      ;// width
    CMP     tmp5, width
    BHI     do_fill                 ;// (x0+partW)>width

    CMP     y0, #0
    BLT     do_fill                 ;// (y0 < 0)
    LDR     partH, [sp,#0x224]      ;// partHeight
    ADD     tmp6, y0, partH         ;// (y0+partHeight)
    ADD     tmp6, tmp6, #5          ;// (y0+partH+5)
    LDR     height, [sp,#0x21c]     ;// height
    CMP     tmp6, height
    BLS     skip_fill               ;// no overfill needed


do_fill
    ;// Call h264bsdFillBlock with
    ;//   r0 = ref, r1 = buff, r2 = x0, r3 = y0 (r0, r2, r3 still hold the
    ;//   incoming values), and on the stack:
    ;//   width, height, partWidth, partHeight+5, partWidth
    LDR     partH, [sp,#0x224]      ;// partHeight
    ADD     tmp5, partH, #5         ;// tmp5 = partH + 5;
    LDR     height, [sp,#0x21c]     ;// height
    LDR     partW, [sp,#0x220]      ;// partWidth
    STMIB   sp, {height, partW}     ;// sp+4 = height, sp+8 = partWidth
    STR     tmp5, [sp,#0xc]         ;// sp+c partHeight+5
    STR     partW, [sp,#0x10]       ;// sp+10 = partWidth
    LDR     width, [sp,#0x218]      ;// width
    STR     width, [sp,#0]          ;// sp+0 = width
    ADD     buff, sp, #0x28         ;// buff = p1[21*21/4+1] (local fill buffer)
    BL      h264bsdFillBlock

    ;// Continue with the padded copy: origin (0,0), stride = partWidth.
    ;// partW/partH (r6/r7) are used after the call without being reloaded,
    ;// so this relies on the callee preserving r4-r11.
    MOV     x0, #0
    STR     x0,[sp,#0x1ec]          ;// x0 = 0
    STR     x0,[sp,#0x1f0]          ;// y0 = 0
    ADD     ref,sp,#0x28            ;// ref = p1
    STR     partW, [sp,#0x218]      ;// overwrite the incoming width slot: width = partWidth


skip_fill
    LDR     x0 ,[sp,#0x1ec]         ;// x0
    LDR     y0 ,[sp,#0x1f0]         ;// y0
    LDR     width, [sp,#0x218]      ;// width
    MLA     tmp6, width, y0, x0     ;// y0*width+x0
    ADD     ref, ref, tmp6          ;// ref += y0*width+x0
    LDR     mb, [sp, #0x1e8]        ;// mb

    ;// Pack all loop state into "count" (x0 is dead from here on):
    ;//   bits 31..24  column counter, reloaded from bits 15..8 for every row band
    ;//   bits 23..16  partHeight-1
    ;//   bits 15..8   partWidth-1
    ;//   bits  7..0   verOffset
    ADD     count, partW, partH, LSL #8    ;// |xx|xx|partH|partW|
    LDR     tmp5, = 0x00010100
    RSB     count, tmp5, count, LSL #8      ;// |xx|partH-1|partW-1|xx|
    LDR     tmp2, [sp, #0x228]      ;// verOffset
    ADD     count, count, tmp2      ;// |xx|partH-1|partW-1|verOffset|
    LDR     plus16, = 0x00100010    ;// rounding constant 16 in both halfword lanes

    AND     tmp1, count, #0x0000FF00 ;// (partWidth-1) << 8


loop_y
    ADD     count, count, tmp1, LSL #16  ;// partWidth-1 to top byte

loop_x
    ;// Load the six source rows needed for the first output row.
    LDR     tmp1, [ref], width     ;// |a4|a3|a2|a1|
    LDR     tmp2, [ref], width     ;// |c4|c3|c2|c1|
    LDR     tmp3, [ref], width     ;// |g4|g3|g2|g1|
    LDR     tmp4, [ref], width     ;// |m4|m3|m2|m1|
    LDR     tmp5, [ref], width     ;// |r4|r3|r2|r1|
    LDR     tmp6, [ref], width     ;// |t4|t3|t2|t1|

    ;// first four pixels
    ;// Pixels 1 and 3 are filtered in the two halfword lanes first, then
    ;// pixels 2 and 4 (ROR #8 selects the odd bytes).
    UXTB16  tmpa, tmp3                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp4            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp2                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)

    UXTAB16 tmpb, tmpb, tmp5            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp6            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate to 0..8191 (0..255 after >>5)
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp3, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp2, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp5, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    ;// Z is set here from verOffset bit 0 and must stay untouched until the
    ;// SUBS at the end of loop_x: the MVNEQ/MVNNE pairs of all four rows
    ;// depend on it.
    MOVS    tmp1, count, LSL #31        ;// update flags (verOffset)
    LDR     tmpa, = 0xFF00FF00
    MVNEQ   tmp1, tmp3                  ;// select verOffset=0
    MVNNE   tmp1, tmp4                  ;// select verOffset=1
    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32 (result in odd bytes)
    ORR     res, res, tmpa

    ;// Per byte: UHSUB8(v, ~p) = (v + p - 255) >> 1 = ((v + p + 1) >> 1) - 128;
    ;// toggling bit 7 adds the 128 back, giving the rounded average of v and p.
    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp1              ;// bilinear interpolation
    LDR     tmp1, [ref], width          ;// load next row
    EOR     res, res, tmpa              ;// correct sign

    STR     res, [mb], #16              ;// next row (mb)


    ;// tmp2 = |a4|a3|a2|a1|
    ;// tmp3 = |c4|c3|c2|c1|
    ;// tmp4 = |g4|g3|g2|g1|
    ;// tmp5 = |m4|m3|m2|m1|
    ;// tmp6 = |r4|r3|r2|r1|
    ;// tmp1 = |t4|t3|t2|t1|

    ;// second four pixels
    UXTB16  tmpa, tmp4                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp5            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp3                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp6            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp4, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp5, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp3, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp6, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmpa, = 0xFF00FF00
    MVNEQ   tmp2, tmp4                  ;// select verOffset=0
    MVNNE   tmp2, tmp5                  ;// select verOffset=1

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp2              ;// bilinear interpolation
    LDR     tmp2, [ref], width          ;// load next row
    EOR     res, res, tmpa              ;// correct sign
    STR     res, [mb], #16              ;// next row

    ;// tmp3 = |a4|a3|a2|a1|
    ;// tmp4 = |c4|c3|c2|c1|
    ;// tmp5 = |g4|g3|g2|g1|
    ;// tmp6 = |m4|m3|m2|m1|
    ;// tmp1 = |r4|r3|r2|r1|
    ;// tmp2 = |t4|t3|t2|t1|

    ;// third four pixels
    UXTB16  tmpa, tmp5                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp6            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp4                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp1            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp5, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp4, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp1, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A+T


    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmpa, = 0xFF00FF00
    MVNEQ   tmp3, tmp5                  ;// select verOffset=0
    MVNNE   tmp3, tmp6                  ;// select verOffset=1

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp3              ;// bilinear interpolation
    LDR     tmp3, [ref]                 ;// load next row (last one: no post-increment,
                                        ;// ref stays 8 rows below the block start)
    EOR     res, res, tmpa              ;// correct sign
    STR     res, [mb], #16              ;// next row

    ;// tmp4 = |a4|a3|a2|a1|
    ;// tmp5 = |c4|c3|c2|c1|
    ;// tmp6 = |g4|g3|g2|g1|
    ;// tmp1 = |m4|m3|m2|m1|
    ;// tmp2 = |r4|r3|r2|r1|
    ;// tmp3 = |t4|t3|t2|t1|

    ;// fourth four pixels
    UXTB16  tmpa, tmp6                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp1            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp5                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp2            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp4            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp6, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp5, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp2, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp4, = 0xFF00FF00          ;// tmp4 (row A) is no longer needed
    MVNEQ   tmp5, tmp6                  ;// select verOffset=0
    MVNNE   tmp5, tmp1                  ;// select verOffset=1

    AND     tmpa, tmp4, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp5              ;// bilinear interpolation

    ;// decrement loop_x counter
    ;// C stays set while the top byte does not borrow; it is consumed by
    ;// the BCS below (nothing in between changes the flags).
    SUBS    count, count, #4<<24        ;// (partWidth-1) -= 4;

    ;// calculate "ref" address for next round
    SUB     ref, ref, width, LSL #3     ;// ref -= 8*width;
    ADD     ref, ref, #4;               ;// next column (4 pixels)

    EOR     res, res, tmpa              ;// correct sign
    STR     res, [mb], #-44             ;// mb: 3*16 - 44 = +4, i.e. next column of the first row

    BCS     loop_x

    ;// The borrow above left 0xFF in the top byte. Adding 252<<16 subtracts 4
    ;// from the partHeight-1 byte; while that byte does not underflow, the
    ;// carry out of it clears the top byte and the result is non-negative (GE).
    ADDS    count, count, #252<<16      ;// (partHeight-1) -= 4;
    ADD     ref, ref, width, LSL #2     ;// ref += 4*width
    AND     tmp1, count, #0x0000FF00    ;// (partWidth-1) << 8, reused at loop_y
    MOV     tmp2, #1
    ADD     tmp2, tmp2, tmp1, LSR #8    ;// partWidth
    SUB     ref, ref, tmp2              ;// ref -= partWidth
    ADD     mb, mb, #64;                ;// four output rows of 16 bytes
    SUB     mb, mb, tmp2;               ;// mb -= partWidth
    BGE     loop_y                      ;// flags still from the ADDS above

    ADD     sp,sp,#0x1f4                ;// drop locals (0x1e4) and the saved r0-r3 (0x10)
    LDMFD   sp!, {r4-r11, pc}           ;// restore callee-saved registers and return

    END
375