;// armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s revision 0c1bc742181ded4930842b46e9507372f0b1b963
;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//

;// Shared OpenMAX DL assembler support. The M_* macros used below (M_VARIANTS,
;// M_START, M_LDR, M_STR, M_END) are not defined in this file; presumably they
;// come from armCOMM_s.h -- confirm against that header.
        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS

        EXPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe



    IF ARM1136JS

        ;// Function header

        ;// Function:
        ;//     armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        ;//
        ;// Implements vertical interpolation for a block of size 4x4. Input and output should
        ;// be aligned.
        ;//
        ;// Every row is handled as one 32-bit word holding four 8-bit pixels.
        ;// For six vertically consecutive input rows A..F each output pixel is
        ;//
        ;//     out = clamp_0_255( ((A+F) - 5*(B+E) + 20*(C+D) + 16) >> 5 )
        ;//
        ;// Output row 0 uses C = [pSrc] (so A = [pSrc - 2*srcStep]); each following
        ;// output row moves the six-row window down by one srcStep. In total the
        ;// rows pSrc - 2*srcStep .. pSrc + 6*srcStep are read and four words are
        ;// written at pDst, pDst + dstStep, pDst + 2*dstStep and pDst + 3*dstStep.
        ;//
        ;// The arithmetic is done on two 16-bit lanes per register (even pixels in
        ;// the "...0" registers, odd pixels in the "...1" registers). An offset of
        ;// 255 is added to every row so that no partial sum goes negative, which
        ;// would otherwise borrow across the two lanes of a register.
        ;//
        ;// Registers used as input for this function
        ;// r0 = pSrc    (source pointer)       r1 = srcStep (source row step)
        ;// r2 = pDst    (destination pointer)  r3 = dstStep (destination row step)
        ;//
        ;// Registers preserved for top level function
        ;// r0,r1,r2,r3,r4,r5,r6,r14
        ;// (r0 and r2 are advanced inside the loop and rewound before M_END;
        ;//  r4-r6 and r14 are written by this code, so their preservation relies on
        ;//  the M_START/M_END pair -- NOTE(review): confirm in armCOMM_s.h)
        ;//
        ;// Registers modified by the function
        ;// r7,r8,r9,r10,r11,r12 and the condition flags (SUBS on the loop counter)
        ;//
        ;// Output registers
        ;// None. Function will preserve r0-r3
        M_START armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe, r6

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

;// Declare inner loop registers
;// NOTE: many of the names below alias the same physical register. The
;// instruction order in the loop is what keeps their live ranges from
;// colliding, so instructions must not be reordered casually.
ValA            RN 5
ValA0           RN 4
ValA1           RN 5
ValAF0          RN 4
ValAF1          RN 5

ValB            RN 11

ValC            RN 5
ValC0           RN 4
ValC1           RN 5
ValCD0          RN 12
ValCD1          RN 14
ValCF0          RN 4
ValCF1          RN 5

ValD            RN 10

ValE            RN 7
ValE0           RN 6
ValE1           RN 7
ValEB0          RN 10
ValEB1          RN 11
ValED0          RN 6
ValED1          RN 7

ValF            RN 10

ValG            RN 14
ValG0           RN 12
ValG1           RN 14
ValGB0          RN 12
ValGB1          RN 14

Acc0            RN 4
Acc1            RN 5
Acc2            RN 6
Acc3            RN 7

Counter         RN 8
r0x00ff00ff     RN 9                                        ;// [0 255 0 255] where 255 is offset
r0x0fe00fe0     RN 10                                       ;// [0 (16*255 - 16) 0 (16*255 - 16)]


        LDR         r0x00ff00ff, =0x00ff00ff                ;// [0 255 0 255] 255 is offset to avoid negative results
        MOV         Counter, #2                             ;// Two passes, two output rows per pass

TwoRowsLoop
        ;// On entry pSrc addresses row C of the first output row of this pass.
        M_LDR       ValC, [pSrc], srcStep                   ;// Load  [c3 c2 c1 c0]
        M_LDR       ValD, [pSrc], srcStep                   ;// Load  [d3 d2 d1 d0]
        M_LDR       ValE, [pSrc], srcStep                   ;// Load  [e3 e2 e1 e0]
        SUB         pSrc, pSrc, srcStep, LSL #2             ;// pSrc: row C + 3 steps -> row B
        LDR         ValB, [pSrc]                            ;// Load  [b3 b2 b1 b0]
        UXTAB16     ValC0, r0x00ff00ff, ValC                ;// [0 c2 0 c0] + [0 255 0 255]
        UXTAB16     ValC1, r0x00ff00ff, ValC, ROR #8        ;// [0 c3 0 c1] + [0 255 0 255]

        UXTAB16     ValE0, r0x00ff00ff, ValE                ;// [0 e2 0 e0] + [0 255 0 255]
        UXTAB16     ValE1, r0x00ff00ff, ValE, ROR #8        ;// [0 e3 0 e1] + [0 255 0 255]
        UXTAB16     ValCD0, ValC0, ValD                     ;// [0 c2 0 c0] + [0 255 0 255] + [0 d2 0 d0]
        UXTAB16     ValCD1, ValC1, ValD, ROR #8             ;// [0 c3 0 c1] + [0 255 0 255] + [0 d3 0 d1]
        UXTAB16     ValEB0, ValE0, ValB                     ;// [0 e2 0 e0] + [0 255 0 255] + [0 b2 0 b0]  (overwrites ValD, same register)
        RSB         ValCD0, ValEB0, ValCD0, LSL #2          ;// 4*(Off+C+D) - (Off+B+E)

        LDR         ValD, [pSrc, srcStep, LSL #1]           ;// Reload [d3 d2 d1 d0] (its register was reused for ValEB0)
        UXTAB16     ValEB1, ValE1, ValB, ROR #8             ;// [0 e3 0 e1] + [0 255 0 255] + [0 b3 0 b1]
        RSB         ValCD1, ValEB1, ValCD1, LSL #2          ;// 4*(Off+C+D) - (Off+B+E), odd pixels
        ;// One cycle stall
        UXTAB16     ValED0, ValE0, ValD                     ;// [0 e2 0 e0] + [0 255 0 255] + [0 d2 0 d0]
        UXTAB16     ValED1, ValE1, ValD, ROR #8             ;// [0 e3 0 e1] + [0 255 0 255] + [0 d3 0 d1]

        LDR         ValF, [pSrc, srcStep, LSL #2]           ;// Load  [f3 f2 f1 f0]
        M_LDR       ValB, [pSrc], srcStep                   ;// Load  [b3 b2 b1 b0]; pSrc back on row C
        ADD         ValCD0, ValCD0, ValCD0, LSL #2          ;// 5 * [4*(Off+C+D) - (Off+B+E)]
        ADD         ValCD1, ValCD1, ValCD1, LSL #2
        UXTAB16     ValCF1, ValC1, ValF, ROR #8             ;// [0 c3 0 c1] + [0 255 0 255] + [0 f3 0 f1]
        UXTAB16     ValCF0, ValC0, ValF                     ;// [0 c2 0 c0] + [0 255 0 255] + [0 f2 0 f0]
        RSB         ValED1, ValCF1, ValED1, LSL #2          ;// 4*(Off+E+D) - (Off+C+F), odd pixels

        SUB         ValA, pSrc, srcStep, LSL #1             ;// Address of row A (two rows above C)
        LDR         ValA, [ValA]                            ;// Load  [a3 a2 a1 a0]
        RSB         ValED0, ValCF0, ValED0, LSL #2          ;// 4*(Off+E+D) - (Off+C+F)
        ADD         ValED1, ValED1, ValED1, LSL #2
        ADD         ValED0, ValED0, ValED0, LSL #2          ;// 5 * [4*(Off+E+D) - (Off+C+F)]
        UXTAB16     ValA0, r0x00ff00ff, ValA                ;// [0 a2 0 a0] + [0 255 0 255]
        UXTAB16     ValA1, r0x00ff00ff, ValA, ROR #8        ;// [0 a3 0 a1] + [0 255 0 255]
        UXTAB16     ValAF0, ValA0, ValF                     ;// [0 a2 0 a0] + [0 255 0 255] + [0 f2 0 f0]
        UXTAB16     ValAF1, ValA1, ValF, ROR #8             ;// [0 a3 0 a1] + [0 255 0 255] + [0 f3 0 f1]

        ;// Loaded on every pass because this register also serves as ValD/ValF/ValEB0.
        LDR         r0x0fe00fe0, =0x0fe00fe0                ;// [0 (16*255 - 16) 0 (16*255 - 16)]: removes 16*Off, leaves +16 rounding
        ADD         Acc1, ValCD1, ValAF1

        LDR         ValG, [pSrc, srcStep, LSL #2]           ;// Load  [g3 g2 g1 g0]
        ADD         Acc0, ValCD0, ValAF0                    ;// Acc0 = 16*Off + (A+F) + 20*(C+D) - 5*(B+E)
        UQSUB16     Acc1, Acc1, r0x0fe00fe0                 ;// Acc1 -= (16*Off - 16)
        UQSUB16     Acc0, Acc0, r0x0fe00fe0
        UXTAB16     ValG0, r0x00ff00ff, ValG                ;// [0 g2 0 g0] + [0 255 0 255]
        UXTAB16     ValG1, r0x00ff00ff, ValG, ROR #8        ;// [0 g3 0 g1] + [0 255 0 255]
        UXTAB16     ValGB0, ValG0, ValB                     ;// [0 g2 0 g0] + [0 255 0 255] + [0 b2 0 b0]
        UXTAB16     ValGB1, ValG1, ValB, ROR #8             ;// [0 g3 0 g1] + [0 255 0 255] + [0 b3 0 b1]
        ADD         Acc2, ValED0, ValGB0                    ;// Acc2 = 16*Off + (B+G) + 20*(D+E) - 5*(C+F)
        ADD         Acc3, ValED1, ValGB1
        UQSUB16     Acc3, Acc3, r0x0fe00fe0                 ;// Acc3 -= (16*Off - 16)
        UQSUB16     Acc2, Acc2, r0x0fe00fe0
        USAT16      Acc1, #13, Acc1                         ;// Saturate to 8+5 = 13 bits
        USAT16      Acc0, #13, Acc0
        USAT16      Acc3, #13, Acc3
        USAT16      Acc2, #13, Acc2
        AND         Acc1, r0x00ff00ff, Acc1, LSR #5         ;// First output row, odd pixels  [0 p3 0 p1]
        AND         Acc0, r0x00ff00ff, Acc0, LSR #5         ;// First output row, even pixels [0 p2 0 p0]
        ORR         Acc0, Acc0, Acc1, LSL #8                ;// First output row packed       [p3 p2 p1 p0]
        AND         Acc3, r0x00ff00ff, Acc3, LSR #5         ;// Second output row, odd pixels  [0 q3 0 q1]
        AND         Acc2, r0x00ff00ff, Acc2, LSR #5         ;// Second output row, even pixels [0 q2 0 q0]

        M_STR       Acc0, [pDst], dstStep                   ;// Store result & adjust pointer
        ORR         Acc2, Acc2, Acc3, LSL #8                ;// Second output row packed       [q3 q2 q1 q0]
        M_STR       Acc2, [pDst], dstStep                   ;// Store result & adjust pointer
        ADD         pSrc, pSrc, srcStep, LSL #1             ;// Row C of the next pass is two rows further down

        SUBS        Counter, Counter, #1
        BGT         TwoRowsLoop
End
        ;// Both pointers advanced by four rows in total; rewind them so that
        ;// r0 and r2 leave the function with their entry values.
        SUB     pDst, pDst, dstStep, LSL #2
        SUB     pSrc, pSrc, srcStep, LSL #2

        M_END

    ENDIF

    END
