;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//

130c1bc742181ded4930842b46e9507372f0b1b963James Dong        INCLUDE omxtypes_s.h
140c1bc742181ded4930842b46e9507372f0b1b963James Dong        INCLUDE armCOMM_s.h
150c1bc742181ded4930842b46e9507372f0b1b963James Dong
160c1bc742181ded4930842b46e9507372f0b1b963James Dong        EXPORT armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
170c1bc742181ded4930842b46e9507372f0b1b963James Dong
180c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_VARIANTS ARM1136JS
190c1bc742181ded4930842b46e9507372f0b1b963James Dong
200c1bc742181ded4930842b46e9507372f0b1b963James Dong
210c1bc742181ded4930842b46e9507372f0b1b963James Dong
220c1bc742181ded4930842b46e9507372f0b1b963James Dong
230c1bc742181ded4930842b46e9507372f0b1b963James Dong
240c1bc742181ded4930842b46e9507372f0b1b963James Dong    IF ARM1136JS
250c1bc742181ded4930842b46e9507372f0b1b963James Dong
260c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_ALLOC8 ppDstArgs, 8
270c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_ALLOC4 ppSrc, 4
280c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_ALLOC4 ppDst, 4
290c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_ALLOC4 pCounter, 4
300c1bc742181ded4930842b46e9507372f0b1b963James Dong
310c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Function header
320c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Function:
330c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//     armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
340c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
350c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Implements diagonal interpolation for a block of size 4x4. Input and output should
360c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// be aligned.
370c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
380c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Registers used as input for this function
390c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// r0,r1,r2,r3, r8 where r0,r2  input pointer and r1,r3 step size, r8 intermediate-buf pointer
400c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
410c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Registers preserved for top level function
420c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// r0,r1,r2,r3,r4,r5,r6,r14
430c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
440c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Registers modified by the function
450c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// r7,r8,r9,r10,r11,r12
460c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
470c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Output registers
480c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// None. Function will preserve r0-r3
490c1bc742181ded4930842b46e9507372f0b1b963James Dong
500c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_START armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe, r6
510c1bc742181ded4930842b46e9507372f0b1b963James Dong
520c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Declare input registers
530c1bc742181ded4930842b46e9507372f0b1b963James DongpSrc            RN 0
540c1bc742181ded4930842b46e9507372f0b1b963James DongsrcStep         RN 1
550c1bc742181ded4930842b46e9507372f0b1b963James DongpDst            RN 2
560c1bc742181ded4930842b46e9507372f0b1b963James DongdstStep         RN 3
570c1bc742181ded4930842b46e9507372f0b1b963James Dong
580c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Declare inner loop registers
590c1bc742181ded4930842b46e9507372f0b1b963James DongValA            RN 5
600c1bc742181ded4930842b46e9507372f0b1b963James DongValA0           RN 4
610c1bc742181ded4930842b46e9507372f0b1b963James DongValA1           RN 5
620c1bc742181ded4930842b46e9507372f0b1b963James DongValAF0          RN 4
630c1bc742181ded4930842b46e9507372f0b1b963James DongValAF1          RN 5
640c1bc742181ded4930842b46e9507372f0b1b963James Dong
650c1bc742181ded4930842b46e9507372f0b1b963James DongValB            RN 11
660c1bc742181ded4930842b46e9507372f0b1b963James Dong
670c1bc742181ded4930842b46e9507372f0b1b963James DongValC            RN 5
680c1bc742181ded4930842b46e9507372f0b1b963James DongValC0           RN 4
690c1bc742181ded4930842b46e9507372f0b1b963James DongValC1           RN 5
700c1bc742181ded4930842b46e9507372f0b1b963James DongValCD0          RN 12
710c1bc742181ded4930842b46e9507372f0b1b963James DongValCD1          RN 14
720c1bc742181ded4930842b46e9507372f0b1b963James DongValCF0          RN 4
730c1bc742181ded4930842b46e9507372f0b1b963James DongValCF1          RN 5
740c1bc742181ded4930842b46e9507372f0b1b963James Dong
750c1bc742181ded4930842b46e9507372f0b1b963James DongValD            RN 10
760c1bc742181ded4930842b46e9507372f0b1b963James Dong
770c1bc742181ded4930842b46e9507372f0b1b963James DongValE            RN 7
780c1bc742181ded4930842b46e9507372f0b1b963James DongValE0           RN 6
790c1bc742181ded4930842b46e9507372f0b1b963James DongValE1           RN 7
800c1bc742181ded4930842b46e9507372f0b1b963James DongValEB0          RN 10
810c1bc742181ded4930842b46e9507372f0b1b963James DongValEB1          RN 11
820c1bc742181ded4930842b46e9507372f0b1b963James DongValED0          RN 6
830c1bc742181ded4930842b46e9507372f0b1b963James DongValED1          RN 7
840c1bc742181ded4930842b46e9507372f0b1b963James Dong
850c1bc742181ded4930842b46e9507372f0b1b963James DongValF            RN 10
860c1bc742181ded4930842b46e9507372f0b1b963James Dong
870c1bc742181ded4930842b46e9507372f0b1b963James DongValG            RN 14
880c1bc742181ded4930842b46e9507372f0b1b963James DongValG0           RN 12
890c1bc742181ded4930842b46e9507372f0b1b963James DongValG1           RN 14
900c1bc742181ded4930842b46e9507372f0b1b963James DongValGB0          RN 12
910c1bc742181ded4930842b46e9507372f0b1b963James DongValGB1          RN 14
920c1bc742181ded4930842b46e9507372f0b1b963James Dong
930c1bc742181ded4930842b46e9507372f0b1b963James DongAcc0            RN 4
940c1bc742181ded4930842b46e9507372f0b1b963James DongAcc1            RN 5
950c1bc742181ded4930842b46e9507372f0b1b963James DongAcc2            RN 6
960c1bc742181ded4930842b46e9507372f0b1b963James DongAcc3            RN 7
970c1bc742181ded4930842b46e9507372f0b1b963James Dong
980c1bc742181ded4930842b46e9507372f0b1b963James DongTemp            RN 7
990c1bc742181ded4930842b46e9507372f0b1b963James DongStep            RN 6
1000c1bc742181ded4930842b46e9507372f0b1b963James Dong
1010c1bc742181ded4930842b46e9507372f0b1b963James DongpInterBuf       RN 8
1020c1bc742181ded4930842b46e9507372f0b1b963James DongCounter         RN 8
1030c1bc742181ded4930842b46e9507372f0b1b963James Dongr0x00ff00ff     RN 9                                        ;// [0 255 0 255] where 255 is offset
1040c1bc742181ded4930842b46e9507372f0b1b963James Dongr0x0001fc00     RN 10                                       ;// [0 (16*255 - 16) 0 (16*255 - 16)]
1050c1bc742181ded4930842b46e9507372f0b1b963James Dong
1060c1bc742181ded4930842b46e9507372f0b1b963James Dong
1070c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Declare inner loop registers
1080c1bc742181ded4930842b46e9507372f0b1b963James DongValCA           RN 8
1090c1bc742181ded4930842b46e9507372f0b1b963James DongValDB           RN 9
1100c1bc742181ded4930842b46e9507372f0b1b963James DongValGE           RN 10
1110c1bc742181ded4930842b46e9507372f0b1b963James DongValHF           RN 11
1120c1bc742181ded4930842b46e9507372f0b1b963James Dongr0x00140001     RN 12
1130c1bc742181ded4930842b46e9507372f0b1b963James Dongr0x0014fffb     RN 14
1140c1bc742181ded4930842b46e9507372f0b1b963James Dong
1150c1bc742181ded4930842b46e9507372f0b1b963James Dongr0x00000200     RN 12
1160c1bc742181ded4930842b46e9507372f0b1b963James Dongr0x000000ff     RN 12
1170c1bc742181ded4930842b46e9507372f0b1b963James Dong
1180c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_STRD      pDst, dstStep, ppDstArgs
1190c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV         pDst, pInterBuf
1200c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV         dstStep, #24
1210c1bc742181ded4930842b46e9507372f0b1b963James Dong
1220c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Set up counter of format, [0]  [0]  [1 (height)]  [8 (width)]
1230c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV         Counter, #1
1240c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV         Temp, #8
1250c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD         Counter, Temp, Counter, LSL #8        ;// [0 0 H W]
1260c1bc742181ded4930842b46e9507372f0b1b963James Dong
1270c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR         r0x00ff00ff, =0x00ff00ff                ;// [0 255 0 255] 255 is offset to avoid negative results
1280c1bc742181ded4930842b46e9507372f0b1b963James DongWidthLoop
1290c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_STR       pSrc, ppSrc
1300c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_STR       pDst, ppDst
1310c1bc742181ded4930842b46e9507372f0b1b963James DongHeightLoop
1320c1bc742181ded4930842b46e9507372f0b1b963James DongTwoRowsLoop
1330c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR       ValC, [pSrc], srcStep                   ;// Load  [c3 c2 c1 c0]
1340c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR       ValD, [pSrc], srcStep                   ;// Load  [d3 d2 d1 d0]
1350c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR       ValE, [pSrc], srcStep                   ;// Load  [e3 e2 e1 e0]
1360c1bc742181ded4930842b46e9507372f0b1b963James Dong        SUB         pSrc, pSrc, srcStep, LSL #2
1370c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTAB16     ValC0, r0x00ff00ff, ValC                ;// [0 c2 0 c0] + [0 255 0 255]
1380c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTAB16     ValC1, r0x00ff00ff, ValC, ROR #8        ;// [0 c3 0 c1] + [0 255 0 255]
1390c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR         ValB, [pSrc]                            ;// Load  [b3 b2 b1 b0]
1400c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTAB16     ValE0, r0x00ff00ff, ValE                ;// [0 e2 0 e0] + [0 255 0 255]
1410c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTAB16     ValE1, r0x00ff00ff, ValE, ROR #8        ;// [0 e3 0 e1] + [0 255 0 255]
1420c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTAB16     ValCD0, ValC0, ValD                     ;// [0 c2 0 c0] + [0 255 0 255] + [0 d2 0 d0]
1430c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTAB16     ValCD1, ValC1, ValD, ROR #8             ;// [0 c3 0 c1] + [0 255 0 255] + [0 d3 0 d1]
1440c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTAB16     ValEB0, ValE0, ValB                     ;// [0 e2 0 e0] + [0 255 0 255] + [0 b2 0 b0]
1450c1bc742181ded4930842b46e9507372f0b1b963James Dong        RSB         ValCD0, ValEB0, ValCD0, LSL #2          ;// 4*(Off+C+D) - (Off+B+E)
1460c1bc742181ded4930842b46e9507372f0b1b963James Dong
1470c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR         ValD, [pSrc, srcStep, LSL #1]                       ;// Load  [d3 d2 d1 d0]
1480c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTAB16     ValEB1, ValE1, ValB, ROR #8             ;// [0 e3 0 e1] + [0 255 0 255] + [0 b3 0 b1]
1490c1bc742181ded4930842b46e9507372f0b1b963James Dong        RSB         ValCD1, ValEB1, ValCD1, LSL #2
1500c1bc742181ded4930842b46e9507372f0b1b963James Dong
1510c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTAB16     ValED0, ValE0, ValD                     ;// [0 e2 0 e0] + [0 255 0 255] + [0 d2 0 d0]
1520c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTAB16     ValED1, ValE1, ValD, ROR #8             ;// [0 e3 0 e1] + [0 255 0 255] + [0 d3 0 d1]
1530c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR         ValF, [pSrc, srcStep, LSL #2]           ;// Load  [f3 f2 f1 f0]
1540c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR       ValB, [pSrc], srcStep                   ;// Load  [b3 b2 b1 b0]
1550c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD         ValCD0, ValCD0, ValCD0, LSL #2          ;// 5 * [4*(Off+C+D) - (Off+B+E)]
1560c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD         ValCD1, ValCD1, ValCD1, LSL #2
1570c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTAB16     ValCF1, ValC1, ValF, ROR #8             ;// [0 c3 0 c1] + [0 255 0 255] + [0 f3 0 f1]
1580c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTAB16     ValCF0, ValC0, ValF                     ;// [0 c2 0 c0] + [0 255 0 255] + [0 f2 0 f0]
1590c1bc742181ded4930842b46e9507372f0b1b963James Dong        RSB         ValED1, ValCF1, ValED1, LSL #2
1600c1bc742181ded4930842b46e9507372f0b1b963James Dong
1610c1bc742181ded4930842b46e9507372f0b1b963James Dong        SUB         ValA, pSrc, srcStep, LSL #1
1620c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR         ValA, [ValA]                            ;// Load  [a3 a2 a1 a0]
1630c1bc742181ded4930842b46e9507372f0b1b963James Dong        RSB         ValED0, ValCF0, ValED0, LSL #2          ;// 4*(Off+E+D) - (Off+C+F)
1640c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD         ValED1, ValED1, ValED1, LSL #2
1650c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD         ValED0, ValED0, ValED0, LSL #2          ;// 5 * [4*(Off+E+D) - (Off+C+F)]
1660c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTAB16     ValA0, r0x00ff00ff, ValA                ;// [0 a2 0 a0] + [0 255 0 255]
1670c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTAB16     ValA1, r0x00ff00ff, ValA, ROR #8        ;// [0 a3 0 a1] + [0 255 0 255]
1680c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTAB16     ValAF0, ValA0, ValF                     ;// [0 a2 0 a0] + [0 255 0 255] + [0 f2 0 f0]
1690c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTAB16     ValAF1, ValA1, ValF, ROR #8             ;// [0 a3 0 a1] + [0 255 0 255] + [0 f3 0 f1]
1700c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD         Acc1, ValCD1, ValAF1
1710c1bc742181ded4930842b46e9507372f0b1b963James Dong
1720c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR         ValG, [pSrc, srcStep, LSL #2]           ;// Load  [g3 g2 g1 g0]
1730c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD         Acc0, ValCD0, ValAF0                    ;// Acc0 = 16*Off + (A+F) + 20*(C+D) - 5*(B+E)
1740c1bc742181ded4930842b46e9507372f0b1b963James Dong        STR         Acc1, [pDst, #4]                        ;// Store result & adjust pointer
1750c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_STR       Acc0, [pDst], dstStep                   ;// Store result & adjust pointer
1760c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTAB16     ValG0, r0x00ff00ff, ValG                ;// [0 g2 0 g0] + [0 255 0 255]
1770c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTAB16     ValG1, r0x00ff00ff, ValG, ROR #8        ;// [0 g3 0 g1] + [0 255 0 255]
1780c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTAB16     ValGB0, ValG0, ValB                     ;// [0 g2 0 g0] + [0 255 0 255] + [0 b2 0 b0]
1790c1bc742181ded4930842b46e9507372f0b1b963James Dong        UXTAB16     ValGB1, ValG1, ValB, ROR #8             ;// [0 g3 0 g1] + [0 255 0 255] + [0 b3 0 b1]
1800c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD         Acc2, ValED0, ValGB0                    ;// Acc2 = 16*Off + (B+G) + 20*(D+E) - 5*(C+F)
1810c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD         Acc3, ValED1, ValGB1
1820c1bc742181ded4930842b46e9507372f0b1b963James Dong
1830c1bc742181ded4930842b46e9507372f0b1b963James Dong        STR         Acc3, [pDst, #4]                        ;// Store result & adjust pointer
1840c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_STR       Acc2, [pDst], dstStep                   ;// Store result & adjust pointer
1850c1bc742181ded4930842b46e9507372f0b1b963James Dong
1860c1bc742181ded4930842b46e9507372f0b1b963James Dong        SUBS        Counter, Counter, #1 << 8               ;// Loop till height is 10
1870c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD         pSrc, pSrc, srcStep, LSL #1
1880c1bc742181ded4930842b46e9507372f0b1b963James Dong        BPL         HeightLoop
1890c1bc742181ded4930842b46e9507372f0b1b963James Dong
1900c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR       pSrc, ppSrc
1910c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR       pDst, ppDst
1920c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADDS        Counter, Counter, #(1 << 8)-4           ;// Loop till width is 12
1930c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD         pSrc, pSrc, #4
1940c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD         pDst, pDst, #8
1950c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD         Counter, Counter, #1<<8
1960c1bc742181ded4930842b46e9507372f0b1b963James Dong        BPL         WidthLoop
1970c1bc742181ded4930842b46e9507372f0b1b963James Dong
1980c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
1990c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Horizontal interpolation using multiplication
2000c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
2010c1bc742181ded4930842b46e9507372f0b1b963James Dong
2020c1bc742181ded4930842b46e9507372f0b1b963James Dong        SUB         pSrc, pDst, #24
2030c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV         srcStep, #24
2040c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDRD      pDst, dstStep, ppDstArgs
2050c1bc742181ded4930842b46e9507372f0b1b963James Dong
2060c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV         Counter, #4
2070c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR         r0x0014fffb, =0x0014fffb
2080c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR         r0x00140001, =0x00140001
2090c1bc742181ded4930842b46e9507372f0b1b963James Dong
2100c1bc742181ded4930842b46e9507372f0b1b963James DongHeightLoop1
2110c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_STR       Counter, pCounter
2120c1bc742181ded4930842b46e9507372f0b1b963James Dong
2130c1bc742181ded4930842b46e9507372f0b1b963James Dong
2140c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR         ValCA, [pSrc], #4                   ;// Load  [0 c 0 a]
2150c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR         ValDB, [pSrc], #4                   ;// Load  [0 d 0 b]
2160c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR         ValGE, [pSrc], #4                   ;// Load  [0 g 0 e]
2170c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR         ValHF, [pSrc], #4                   ;// Load  [0 h 0 f]
2180c1bc742181ded4930842b46e9507372f0b1b963James Dong
2190c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Acc0 = smuad ([0 20 0 1], add([0 c 0 a] + [0 d 0 f])) - (5 * (b + e))
2200c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Acc1 = smuad ([0 20 0 1], add([0 e 0 g] + [0 d 0 b])) - (5 * (c + f))
2210c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Acc2 = smuad ([0 1 0 20], add([0 c 0 e] + [0 h 0 f])) - (5 * (d + g))
2220c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Acc3 = smuad ([0 20 0 1], add([0 d 0 f] + [0 i 0 g])) - (5 * (e + h))
2230c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUAD       Acc0, ValCA, r0x00140001            ;// Acc0  = [0 c 0 a] * [0 20 0 1]
2240c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUAD       Acc1, ValDB, r0x00140001            ;// Acc1  = [0 c 0 a] * [0 20 0 1]
2250c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUADX      Acc2, ValGE, r0x0014fffb            ;// Acc2  = [0 g 0 e] * [0 20 0 -5]
2260c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUAD       Acc3, ValGE, r0x0014fffb            ;// Acc3  = [0 g 0 e] * [0 20 0 -5]
2270c1bc742181ded4930842b46e9507372f0b1b963James Dong
2280c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMLAD       Acc0, ValDB, r0x0014fffb, Acc0      ;// Acc0 += [0 d 0 b] * [0 20 0 -5]
2290c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMLADX      Acc1, ValGE, r0x00140001, Acc1      ;// Acc1 += [0 g 0 e] * [0 20 0 1]
2300c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMLADX      Acc2, ValHF, r0x00140001, Acc2      ;// Acc2 += [0 h 0 f] * [0 20 0 1]
2310c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMLADX      Acc3, ValHF, r0x0014fffb, Acc3      ;// Acc3 += [0 h 0 f] * [0 20 0 -5]
2320c1bc742181ded4930842b46e9507372f0b1b963James Dong
2330c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMLABB      Acc0, ValGE, r0x0014fffb, Acc0      ;// Acc0 += [0 g 0 e] * [0 0 0 -5]
2340c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMLATB      Acc1, ValCA, r0x0014fffb, Acc1      ;// Acc1 += [0 d 0 b] * [0 0 0 -5]
2350c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMLATB      Acc2, ValCA, r0x00140001, Acc2      ;// Acc2 += [0 c 0 a] * [0 0 0 1]
2360c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMLATB      Acc3, ValDB, r0x00140001, Acc3      ;// Acc3 += [0 c 0 a] * [0 0 0 1]
2370c1bc742181ded4930842b46e9507372f0b1b963James Dong
2380c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDRH        ValCA, [pSrc], #8                   ;// 8 = srcStep - 16
2390c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMLABB      Acc0, ValHF, r0x00140001, Acc0      ;// Acc0 += [0 h 0 f] * [0 0 0 1]
2400c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMLABB      Acc1, ValHF, r0x0014fffb, Acc1      ;// Acc1 += [0 h 0 f] * [0 0 0 -5]
2410c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMLATB      Acc2, ValDB, r0x0014fffb, Acc2      ;// Acc2 += [0 d 0 b] * [0 0 0 -5]
2420c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMLABB      Acc3, ValCA, r0x00140001, Acc3      ;// Acc3 += [0 d 0 b] * [0 0 0 1]
2430c1bc742181ded4930842b46e9507372f0b1b963James Dong
2440c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR         r0x0001fc00, =0x0001fc00            ;// (0xff * 16 * 32) - 512
2450c1bc742181ded4930842b46e9507372f0b1b963James Dong        SUB         Acc0, Acc0, r0x0001fc00
2460c1bc742181ded4930842b46e9507372f0b1b963James Dong        SUB         Acc1, Acc1, r0x0001fc00
2470c1bc742181ded4930842b46e9507372f0b1b963James Dong        SUB         Acc2, Acc2, r0x0001fc00
2480c1bc742181ded4930842b46e9507372f0b1b963James Dong        SUB         Acc3, Acc3, r0x0001fc00
2490c1bc742181ded4930842b46e9507372f0b1b963James Dong
2500c1bc742181ded4930842b46e9507372f0b1b963James Dong        USAT        Acc0, #18, Acc0
2510c1bc742181ded4930842b46e9507372f0b1b963James Dong        USAT        Acc1, #18, Acc1
2520c1bc742181ded4930842b46e9507372f0b1b963James Dong        USAT        Acc2, #18, Acc2
2530c1bc742181ded4930842b46e9507372f0b1b963James Dong        USAT        Acc3, #18, Acc3
2540c1bc742181ded4930842b46e9507372f0b1b963James Dong
2550c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV         Acc0, Acc0, LSR #10
2560c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV         Acc1, Acc1, LSR #10
2570c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV         Acc2, Acc2, LSR #10
2580c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV         Acc3, Acc3, LSR #10
2590c1bc742181ded4930842b46e9507372f0b1b963James Dong
2600c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR       Counter, pCounter
2610c1bc742181ded4930842b46e9507372f0b1b963James Dong        ORR         Acc0, Acc0, Acc1, LSL #8
2620c1bc742181ded4930842b46e9507372f0b1b963James Dong        ORR         Acc2, Acc2, Acc3, LSL #8
2630c1bc742181ded4930842b46e9507372f0b1b963James Dong        SUBS        Counter, Counter, #1
2640c1bc742181ded4930842b46e9507372f0b1b963James Dong        ORR         Acc0, Acc0, Acc2, LSL #16
2650c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_STR       Acc0, [pDst], dstStep
2660c1bc742181ded4930842b46e9507372f0b1b963James Dong        BGT         HeightLoop1
2670c1bc742181ded4930842b46e9507372f0b1b963James DongEnd
2680c1bc742181ded4930842b46e9507372f0b1b963James Dong        SUB         pDst, pDst, dstStep, LSL #2
2690c1bc742181ded4930842b46e9507372f0b1b963James Dong        SUB         pSrc, pSrc, srcStep, LSL #2
2700c1bc742181ded4930842b46e9507372f0b1b963James Dong
2710c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_END
2720c1bc742181ded4930842b46e9507372f0b1b963James Dong
2730c1bc742181ded4930842b46e9507372f0b1b963James Dong    ENDIF
2740c1bc742181ded4930842b46e9507372f0b1b963James Dong
2750c1bc742181ded4930842b46e9507372f0b1b963James Dong    END
2760c1bc742181ded4930842b46e9507372f0b1b963James Dong
277