; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateVerQuarter function
;--
;-------------------------------------------------------------------------------


    IF :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA    |.text|, CODE

;// h264bsdInterpolateVerQuarter register allocation
;//
;// Several names deliberately alias the same physical register: the
;// argument/setup name is used before the interpolation loops, the
;// scratch name inside them.  Never rely on both meanings at once.

ref     RN 0        ;// source pointer; advanced row by row inside the loops

mb      RN 1        ;// destination pointer (reloaded from the stack frame)
buff    RN 1        ;// same register: scratch-buffer argument for h264bsdFillBlock

count   RN 2        ;// packed loop counters / verOffset (see loop setup)
x0      RN 2        ;// same register: x coordinate during setup

res     RN 3        ;// four packed output pixels
y0      RN 3        ;// same register: y coordinate during setup

tmp1    RN 4        ;// source row word (rotates through tmp1..tmp6)

tmp2    RN 5        ;// source row word
height  RN 5        ;// same register: picture height during setup

tmp3    RN 6        ;// source row word
partW   RN 6        ;// same register: partition width during setup

tmp4    RN 7        ;// source row word
partH   RN 7        ;// same register: partition height during setup

tmp5    RN 8        ;// source row word
tmp6    RN 9        ;// source row word

tmpa    RN 10       ;// 2 x 16-bit filter accumulator / constant masks
tmpb    RN 11       ;// 2 x 16-bit filter accumulator
width   RN 12       ;// source row stride in bytes

plus16  RN 14       ;// rounding constant 0x00100010 (lr is saved on entry)


;// function exports and imports

    IMPORT  h264bsdFillBlock

    EXPORT  h264bsdInterpolateVerQuarter

;// Approach to vertical interpolation
;//
;// Interpolation is done by using 32-bit loads and stores
;// and by using 16 bit arithmetic. 4x4 block is processed
;// in each round.
;//
;// |a_11|a_11|a_11|a_11|...|a_1n|a_1n|a_1n|a_1n|
;// |b_11|b_11|b_11|b_11|...|b_1n|b_1n|b_1n|b_1n|
;// |c_11|c_11|c_11|c_11|...|c_1n|c_1n|c_1n|c_1n|
;// |d_11|d_11|d_11|d_11|...|d_1n|d_1n|d_1n|d_1n|
;//           ..
;//           ..
;// |a_m1|a_m1|a_m1|a_m1|...
;// |b_m1|b_m1|b_m1|b_m1|...
;// |c_m1|c_m1|c_m1|c_m1|...
;// |d_m1|d_m1|d_m1|d_m1|...

;//-----------------------------------------------------------------------------
;// h264bsdInterpolateVerQuarter
;//
;// Vertical quarter-sample interpolation of one partition.
;//
;// Arguments, in the order the code reads them:
;//   r0        ref         source picture pointer
;//   r1        mb          destination pointer (rows are 16 bytes apart)
;//   r2        x0          x coordinate of the block in the source
;//   r3        y0          y coordinate of the block in the source
;//   [stack]   width       source width (also used as row stride)
;//   [stack]   height      source height
;//   [stack]   partWidth   block width  (consumed 4 columns at a time)
;//   [stack]   partHeight  block height (consumed 4 rows at a time)
;//   [stack]   verOffset   bit 0 selects which full-sample row is averaged in
;//
;// For every output pixel, with A,C,G,M,R,T six vertically adjacent samples:
;//   h   = clip8((A - 5C + 20G + 20M - 5R + T + 16) >> 5)
;//   out = (h + (verOffset bit0 ? M : G) + 1) >> 1
;//
;// If the (partWidth) x (partHeight+5) source window is not entirely inside
;// the picture, h264bsdFillBlock is first called to build a padded copy in a
;// stack buffer and the interpolation then reads from that copy.
;//
;// Returns nothing.  r4-r11 are preserved; r0-r3, r12 and flags are clobbered.
;// The total stack adjustment (13 saved words + LOCALS_SIZE = 0x218 bytes) is
;// a multiple of 8, so sp stays 8-byte aligned at the call.
;//-----------------------------------------------------------------------------

;// Stack frame layout: byte offsets from sp after the prologue.
LOCALS_SIZE     EQU 0x1e4       ;// outgoing args + padding + fill buffer
FRAME_POP       EQU 0x1f4       ;// LOCALS_SIZE + 16: also drops saved r0-r3

FILL_WIDTH      EQU 0x00        ;// outgoing stack args for h264bsdFillBlock
FILL_BLOCK_H    EQU 0x0c        ;//   (sp+4 / sp+8 are written with STMIB)
FILL_SCAN_LEN   EQU 0x10
FILL_BUF        EQU 0x28        ;// p1[21*21/4+1] words, ends at LOCALS_SIZE

SAVED_MB        EQU 0x1e8       ;// r1 as pushed by the prologue
SAVED_X0        EQU 0x1ec       ;// r2 as pushed by the prologue
SAVED_Y0        EQU 0x1f0       ;// r3 as pushed by the prologue

ARG_WIDTH       EQU 0x218       ;// caller's stack arguments
ARG_HEIGHT      EQU 0x21c
ARG_PART_W      EQU 0x220
ARG_PART_H      EQU 0x224
ARG_VER_OFFSET  EQU 0x228

h264bsdInterpolateVerQuarter
    STMFD   sp!, {r0-r11, lr}
    SUB     sp, sp, #LOCALS_SIZE

    ;// Decide whether the source window lies completely inside the picture.
    CMP     x0, #0
    BLT     do_fill                         ;// (x0 < 0)
    LDR     partW, [sp, #ARG_PART_W]        ;// partWidth
    ADD     tmp5, x0, partW                 ;// (x0+partWidth)
    LDR     width, [sp, #ARG_WIDTH]         ;// width
    CMP     tmp5, width
    BHI     do_fill                         ;// (x0+partW)>width

    CMP     y0, #0
    BLT     do_fill                         ;// (y0 < 0)
    LDR     partH, [sp, #ARG_PART_H]        ;// partHeight
    ADD     tmp6, y0, partH                 ;// (y0+partHeight)
    ADD     tmp6, tmp6, #5                  ;// (y0+partH+5): 6-tap filter needs 5 extra rows
    LDR     height, [sp, #ARG_HEIGHT]       ;// height
    CMP     tmp6, height
    BLS     skip_fill                       ;// no overfill needed


do_fill
    ;// r0 = ref, r2 = x0, r3 = y0 are still the incoming arguments here.
    ;// Build the five stack arguments and point r1 at the scratch buffer.
    LDR     partH, [sp, #ARG_PART_H]        ;// partHeight
    ADD     tmp5, partH, #5                 ;// tmp5 = partH + 5
    LDR     height, [sp, #ARG_HEIGHT]       ;// height
    LDR     partW, [sp, #ARG_PART_W]        ;// partWidth
    STMIB   sp, {height, partW}             ;// sp+4 = height, sp+8 = partWidth
    STR     tmp5, [sp, #FILL_BLOCK_H]       ;// sp+c = partHeight+5
    STR     partW, [sp, #FILL_SCAN_LEN]     ;// sp+10 = partWidth
    LDR     width, [sp, #ARG_WIDTH]         ;// width
    STR     width, [sp, #FILL_WIDTH]        ;// sp+0 = width
    ADD     buff, sp, #FILL_BUF             ;// buff = p1[21*21/4+1]
    BL      h264bsdFillBlock

    ;// Continue from the padded copy: origin (0,0), stride = partWidth.
    ;// partW/partH live in r6/r7, which the callee must preserve.
    MOV     x0, #0
    STR     x0, [sp, #SAVED_X0]             ;// x0 = 0
    STR     x0, [sp, #SAVED_Y0]             ;// y0 = 0
    ADD     ref, sp, #FILL_BUF              ;// ref = p1
    STR     partW, [sp, #ARG_WIDTH]         ;// width = partWidth


skip_fill
    LDR     x0, [sp, #SAVED_X0]             ;// x0
    LDR     y0, [sp, #SAVED_Y0]             ;// y0
    LDR     width, [sp, #ARG_WIDTH]         ;// width
    MLA     tmp6, width, y0, x0             ;// y0*width+x0
    ADD     ref, ref, tmp6                  ;// ref += y0*width+x0
    LDR     mb, [sp, #SAVED_MB]             ;// mb

    ;// Pack the loop state into one register (x0/y0 are dead from here on).
    ADD     count, partW, partH, LSL #8     ;// |xx|xx|partH|partW|
    LDR     tmp5, =0x00010100
    RSB     count, tmp5, count, LSL #8      ;// |xx|partH-1|partW-1|xx|
    LDR     tmp2, [sp, #ARG_VER_OFFSET]     ;// verOffset
    ADD     count, count, tmp2              ;// |xx|partH-1|partW-1|verOffset|
    LDR     plus16, =0x00100010             ;// +16 rounding in both halfwords

    AND     tmp1, count, #0x0000FF00        ;// (partWidth-1) << 8


loop_y
    ADD     count, count, tmp1, LSL #16     ;// partWidth-1 to top byte

loop_x
    ;// One round = 4 columns x 4 rows.  Nine source rows are needed; the
    ;// six row registers are rotated so each row is loaded only once.
    LDR     tmp1, [ref], width              ;// |a4|a3|a2|a1|
    LDR     tmp2, [ref], width              ;// |c4|c3|c2|c1|
    LDR     tmp3, [ref], width              ;// |g4|g3|g2|g1|
    LDR     tmp4, [ref], width              ;// |m4|m3|m2|m1|
    LDR     tmp5, [ref], width              ;// |r4|r3|r2|r1|
    LDR     tmp6, [ref], width              ;// |t4|t3|t2|t1|

    ;// first four pixels
    UXTB16  tmpa, tmp3                      ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp4                ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp2                      ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2        ;// 5(G+M)

    UXTAB16 tmpb, tmpb, tmp5                ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2      ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp1                ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp6                ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2        ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb                ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa                 ;// saturate to 0..8191 (255 after >>5)
    LDR     res, =0x00FF00FF
    UXTB16  tmpa, tmp3, ROR #8              ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp4, ROR #8        ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5          ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2        ;// 5(G+M)
    UXTB16  tmpb, tmp2, ROR #8              ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2      ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp5, ROR #8        ;// |c4+r4|c2+r2|

    UXTAB16 tmpa, tmpa, tmp1, ROR #8        ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp6, ROR #8        ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2        ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb                ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa                 ;// saturate
    ;// Z is set from verOffset bit 0 here and must survive until the SUBS
    ;// at the end of the round: nothing in between may write N/Z/C/V.
    MOVS    tmp1, count, LSL #31            ;// update flags (verOffset)
    LDR     tmpa, =0xFF00FF00
    MVNEQ   tmp1, tmp3                      ;// select verOffset=0
    MVNNE   tmp1, tmp4                      ;// select verOffset=1
    AND     tmpa, tmpa, tmpb, LSL #3        ;// mask and divide by 32
    ORR     res, res, tmpa

    ;// (h + p + 1) >> 1 computed as UHSUB8(h, ~p) with the top bit flipped.
    LDR     tmpa, =0x80808080
    UHSUB8  res, res, tmp1                  ;// bilinear interpolation
    LDR     tmp1, [ref], width              ;// load next row
    EOR     res, res, tmpa                  ;// correct sign

    STR     res, [mb], #16                  ;// next row (mb)


    ;// tmp2 = |a4|a3|a2|a1|
    ;// tmp3 = |c4|c3|c2|c1|
    ;// tmp4 = |g4|g3|g2|g1|
    ;// tmp5 = |m4|m3|m2|m1|
    ;// tmp6 = |r4|r3|r2|r1|
    ;// tmp1 = |t4|t3|t2|t1|

    ;// second four pixels
    UXTB16  tmpa, tmp4                      ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp5                ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp3                      ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2        ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp6                ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2      ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp2                ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp1                ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2        ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb                ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa                 ;// saturate
    LDR     res, =0x00FF00FF
    UXTB16  tmpa, tmp4, ROR #8              ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp5, ROR #8        ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5          ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2        ;// 5(G+M)
    UXTB16  tmpb, tmp3, ROR #8              ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2      ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp6, ROR #8        ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp2, ROR #8        ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp1, ROR #8        ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2        ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb                ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa                 ;// saturate
    LDR     tmpa, =0xFF00FF00
    MVNEQ   tmp2, tmp4                      ;// select verOffset=0
    MVNNE   tmp2, tmp5                      ;// select verOffset=1

    AND     tmpa, tmpa, tmpb, LSL #3        ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, =0x80808080
    UHSUB8  res, res, tmp2                  ;// bilinear interpolation
    LDR     tmp2, [ref], width              ;// load next row
    EOR     res, res, tmpa                  ;// correct sign
    STR     res, [mb], #16                  ;// next row

    ;// tmp3 = |a4|a3|a2|a1|
    ;// tmp4 = |c4|c3|c2|c1|
    ;// tmp5 = |g4|g3|g2|g1|
    ;// tmp6 = |m4|m3|m2|m1|
    ;// tmp1 = |r4|r3|r2|r1|
    ;// tmp2 = |t4|t3|t2|t1|

    ;// third four pixels
    UXTB16  tmpa, tmp5                      ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp6                ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp4                      ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2        ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp1                ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2      ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp3                ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp2                ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2        ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb                ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa                 ;// saturate
    LDR     res, =0x00FF00FF
    UXTB16  tmpa, tmp5, ROR #8              ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp6, ROR #8        ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5          ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2        ;// 5(G+M)
    UXTB16  tmpb, tmp4, ROR #8              ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2      ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp1, ROR #8        ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp3, ROR #8        ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp2, ROR #8        ;// 16+20(G+M)+A+T


    ADD     tmpb, tmpb, tmpb, LSL #2        ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb                ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa                 ;// saturate
    LDR     tmpa, =0xFF00FF00
    MVNEQ   tmp3, tmp5                      ;// select verOffset=0
    MVNNE   tmp3, tmp6                      ;// select verOffset=1

    AND     tmpa, tmpa, tmpb, LSL #3        ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, =0x80808080
    UHSUB8  res, res, tmp3                  ;// bilinear interpolation
    LDR     tmp3, [ref]                     ;// load last (9th) row, no post-increment
    EOR     res, res, tmpa                  ;// correct sign
    STR     res, [mb], #16                  ;// next row

    ;// tmp4 = |a4|a3|a2|a1|
    ;// tmp5 = |c4|c3|c2|c1|
    ;// tmp6 = |g4|g3|g2|g1|
    ;// tmp1 = |m4|m3|m2|m1|
    ;// tmp2 = |r4|r3|r2|r1|
    ;// tmp3 = |t4|t3|t2|t1|

    ;// fourth four pixels
    UXTB16  tmpa, tmp6                      ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp1                ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp5                      ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2        ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp2                ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2      ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp4                ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp3                ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2        ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb                ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa                 ;// saturate
    LDR     res, =0x00FF00FF
    UXTB16  tmpa, tmp6, ROR #8              ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp1, ROR #8        ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5          ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2        ;// 5(G+M)
    UXTB16  tmpb, tmp5, ROR #8              ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2      ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp2, ROR #8        ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp4, ROR #8        ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp3, ROR #8        ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2        ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb                ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa                 ;// saturate
    LDR     tmp4, =0xFF00FF00               ;// tmp4 (row A) is dead, reuse for the mask
    MVNEQ   tmp5, tmp6                      ;// select verOffset=0
    MVNNE   tmp5, tmp1                      ;// select verOffset=1

    AND     tmpa, tmp4, tmpb, LSL #3        ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, =0x80808080
    UHSUB8  res, res, tmp5                  ;// bilinear interpolation

    ;// decrement loop_x counter; C stays set while the top byte did not borrow
    SUBS    count, count, #4<<24            ;// (partWidth-1) -= 4;

    ;// calculate "ref" address for next round
    SUB     ref, ref, width, LSL #3         ;// ref -= 8*width;
    ADD     ref, ref, #4                    ;// next column (4 pixels)

    EOR     res, res, tmpa                  ;// correct sign
    STR     res, [mb], #-44                 ;// store row 4; mb = round start + 4

    BCS     loop_x

    ;// The top byte is 0xFF after the borrow above.  Adding 252 to the
    ;// (partHeight-1) byte subtracts 4 modulo 256; while rows remain, its
    ;// carry wraps the top byte to 0 (result >= 0), otherwise it stays negative.
    ADDS    count, count, #252<<16          ;// (partHeight-1) -= 4;
    ADD     ref, ref, width, LSL #2         ;// ref += 4*width
    AND     tmp1, count, #0x0000FF00        ;// (partWidth-1) << 8
    MOV     tmp2, #1
    ADD     tmp2, tmp2, tmp1, LSR #8        ;// partWidth
    SUB     ref, ref, tmp2                  ;// ref -= partWidth
    ADD     mb, mb, #64                     ;// four destination rows down
    SUB     mb, mb, tmp2                    ;// mb -= partWidth
    BGE     loop_y

    ADD     sp, sp, #FRAME_POP              ;// drop locals and the saved r0-r3
    LDMFD   sp!, {r4-r11, pc}

    END