;//
;// 
;// File Name:  armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;// 
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;// 
;// 
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS

        EXPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe


        IF ARM1136JS

    ;// Function header

    ;// Function: 
    ;//     armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
    ;//
    ;// Implements H.264 half-pel vertical interpolation for a block of size
    ;// 4x4 using the 6-tap filter (1,-5,20,20,-5,1), i.e. for each output
    ;// pixel:  out = Clip255((A - 5*B + 20*C + 20*D - 5*E + F + 16) >> 5)
    ;// where A..F are the six vertically neighbouring source pixels.
    ;// Input and output should be aligned (word loads/stores are used).
    ;//
    ;// "_unsafe": reads outside the 4x4 block. For output rows this routine
    ;// reads source rows from [pSrc - 2*srcStep] up to [pSrc + 4*srcStep];
    ;// the caller must guarantee those rows are accessible.
    ;//
    ;// Registers used as input for this function
    ;// r0,r1,r2,r3 where r0,r2 input pointer and r1,r3 corresponding step size
    ;//
    ;// Registers preserved for top level function
    ;// r0,r1,r2,r3,r4,r5,r6,r14
    ;//
    ;// Registers modified by the function
    ;// r7,r8,r9,r10,r11,r12
    ;//
    ;// Output registers
    ;// None. Function will preserve r0-r3
    ;// (pSrc and pDst are restored to their entry values before returning.)

        M_START armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe, r6

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

;// Declare inner loop registers.
;// NOTE(review): many of these aliases share the same physical register
;// (e.g. r4, r5, r10, r14 each carry several names). The instruction order
;// below is scheduled around the exact liveness of each alias and around
;// load-use latency on ARM1136 -- do not reorder instructions.
ValA            RN 5
ValA0           RN 4
ValA1           RN 5
ValAF0          RN 4
ValAF1          RN 5

ValB            RN 11

ValC            RN 5
ValC0           RN 4
ValC1           RN 5
ValCD0          RN 12
ValCD1          RN 14
ValCF0          RN 4
ValCF1          RN 5

ValD            RN 10

ValE            RN 7
ValE0           RN 6
ValE1           RN 7
ValEB0          RN 10
ValEB1          RN 11
ValED0          RN 6
ValED1          RN 7

ValF            RN 10

ValG            RN 14
ValG0           RN 12
ValG1           RN 14
ValGB0          RN 12
ValGB1          RN 14

Acc0            RN 4
Acc1            RN 5
Acc2            RN 6
Acc3            RN 7

Temp            RN 7
Height          RN 3
Step            RN 6

Counter         RN 8
r0x00ff00ff     RN 9                        ;// [0 255 0 255] where 255 is offset
r0x0fe00fe0     RN 10                       ;// [0 (16*255 - 16) 0 (16*255 - 16)]


        LDR     r0x00ff00ff, =0x00ff00ff    ;// [0 255 0 255] 255 is offset to avoid negative results
        MOV     Counter, #2                 ;// 2 iterations x 2 output rows = 4 rows

;// Each iteration produces two output rows. Every UXTAB16 against
;// r0x00ff00ff adds a +255 bias per halfword so all intermediate sums stay
;// non-negative; the accumulated bias (16*255) is removed again by the
;// UQSUB16 of (16*255 - 16), which simultaneously adds the +16 rounding term.
TwoRowsLoop
        M_LDR   ValC, [pSrc], srcStep       ;// Load [c3 c2 c1 c0]
        M_LDR   ValD, [pSrc], srcStep       ;// Load [d3 d2 d1 d0]
        M_LDR   ValE, [pSrc], srcStep       ;// Load [e3 e2 e1 e0]
        SUB     pSrc, pSrc, srcStep, LSL #2 ;// Rewind: pSrc now points at row B
        LDR     ValB, [pSrc]                ;// Load [b3 b2 b1 b0]
        UXTAB16 ValC0, r0x00ff00ff, ValC    ;// [0 c2 0 c0] + [0 255 0 255]
        UXTAB16 ValC1, r0x00ff00ff, ValC, ROR #8 ;// [0 c3 0 c1] + [0 255 0 255]

        UXTAB16 ValE0, r0x00ff00ff, ValE    ;// [0 e2 0 e0] + [0 255 0 255]
        UXTAB16 ValE1, r0x00ff00ff, ValE, ROR #8 ;// [0 e3 0 e1] + [0 255 0 255]
        UXTAB16 ValCD0, ValC0, ValD         ;// [0 c2 0 c0] + [0 255 0 255] + [0 d2 0 d0]
        UXTAB16 ValCD1, ValC1, ValD, ROR #8 ;// [0 c3 0 c1] + [0 255 0 255] + [0 d3 0 d1]
        UXTAB16 ValEB0, ValE0, ValB         ;// [0 e2 0 e0] + [0 255 0 255] + [0 b2 0 b0]
        RSB     ValCD0, ValEB0, ValCD0, LSL #2 ;// 4*(Off+C+D) - (Off+B+E)

        LDR     ValD, [pSrc, srcStep, LSL #1] ;// Load [d3 d2 d1 d0]
        UXTAB16 ValEB1, ValE1, ValB, ROR #8 ;// [0 e3 0 e1] + [0 255 0 255] + [0 b3 0 b1]
        RSB     ValCD1, ValEB1, ValCD1, LSL #2
        ;// One cycle stall
        UXTAB16 ValED0, ValE0, ValD         ;// [0 e2 0 e0] + [0 255 0 255] + [0 d2 0 d0]
        UXTAB16 ValED1, ValE1, ValD, ROR #8 ;// [0 e3 0 e1] + [0 255 0 255] + [0 d3 0 d1]

        LDR     ValF, [pSrc, srcStep, LSL #2] ;// Load [f3 f2 f1 f0]
        M_LDR   ValB, [pSrc], srcStep       ;// Load [b3 b2 b1 b0]; pSrc now at row C
        ADD     ValCD0, ValCD0, ValCD0, LSL #2 ;// 5 * [4*(Off+C+D) - (Off+B+E)]
        ADD     ValCD1, ValCD1, ValCD1, LSL #2
        UXTAB16 ValCF1, ValC1, ValF, ROR #8 ;// [0 c3 0 c1] + [0 255 0 255] + [0 f3 0 f1]
        UXTAB16 ValCF0, ValC0, ValF         ;// [0 c2 0 c0] + [0 255 0 255] + [0 f2 0 f0]
        RSB     ValED1, ValCF1, ValED1, LSL #2

        SUB     ValA, pSrc, srcStep, LSL #1 ;// Address of row A (two rows above C)
        LDR     ValA, [ValA]                ;// Load [a3 a2 a1 a0]
        RSB     ValED0, ValCF0, ValED0, LSL #2 ;// 4*(Off+E+D) - (Off+C+F)
        ADD     ValED1, ValED1, ValED1, LSL #2
        ADD     ValED0, ValED0, ValED0, LSL #2 ;// 5 * [4*(Off+E+D) - (Off+C+F)]
        UXTAB16 ValA0, r0x00ff00ff, ValA    ;// [0 a2 0 a0] + [0 255 0 255]
        UXTAB16 ValA1, r0x00ff00ff, ValA, ROR #8 ;// [0 a3 0 a1] + [0 255 0 255]
        UXTAB16 ValAF0, ValA0, ValF         ;// [0 a2 0 a0] + [0 255 0 255] + [0 f2 0 f0]
        UXTAB16 ValAF1, ValA1, ValF, ROR #8 ;// [0 a3 0 a1] + [0 255 0 255] + [0 f3 0 f1]

        LDR     r0x0fe00fe0, =0x0fe00fe0    ;// [0 (16*255-16) 0 (16*255-16)]: removes the 16*Off bias and adds +16 rounding
        ADD     Acc1, ValCD1, ValAF1

        LDR     ValG, [pSrc, srcStep, LSL #2] ;// Load [g3 g2 g1 g0]
        ADD     Acc0, ValCD0, ValAF0        ;// Acc0 = 16*Off + (A+F) + 20*(C+D) - 5*(B+E)
        UQSUB16 Acc1, Acc1, r0x0fe00fe0     ;// Acc1 -= (16*Off - 16); saturating, so clamps at 0
        UQSUB16 Acc0, Acc0, r0x0fe00fe0
        UXTAB16 ValG0, r0x00ff00ff, ValG    ;// [0 g2 0 g0] + [0 255 0 255]
        UXTAB16 ValG1, r0x00ff00ff, ValG, ROR #8 ;// [0 g3 0 g1] + [0 255 0 255]
        UXTAB16 ValGB0, ValG0, ValB         ;// [0 g2 0 g0] + [0 255 0 255] + [0 b2 0 b0]
        UXTAB16 ValGB1, ValG1, ValB, ROR #8 ;// [0 g3 0 g1] + [0 255 0 255] + [0 b3 0 b1]
        ADD     Acc2, ValED0, ValGB0        ;// Acc2 = 16*Off + (B+G) + 20*(D+E) - 5*(C+F)
        ADD     Acc3, ValED1, ValGB1
        UQSUB16 Acc3, Acc3, r0x0fe00fe0     ;// Acc3 -= (16*Off - 16)
        UQSUB16 Acc2, Acc2, r0x0fe00fe0
        USAT16  Acc1, #13, Acc1             ;// Saturate to 8+5 = 13 bits (clips high side before >>5)
        USAT16  Acc0, #13, Acc0
        USAT16  Acc3, #13, Acc3
        USAT16  Acc2, #13, Acc2
        AND     Acc1, r0x00ff00ff, Acc1, LSR #5 ;// >>5 then mask: [0 a3 0 a1]
        AND     Acc0, r0x00ff00ff, Acc0, LSR #5 ;// [0 a2 0 a0]
        ORR     Acc0, Acc0, Acc1, LSL #8    ;// Repack first output row [a3 a2 a1 a0]
        AND     Acc3, r0x00ff00ff, Acc3, LSR #5 ;// [0 b3 0 b1]
        AND     Acc2, r0x00ff00ff, Acc2, LSR #5 ;// [0 b2 0 b0]

        M_STR   Acc0, [pDst], dstStep       ;// Store result & adjust pointer
        ORR     Acc2, Acc2, Acc3, LSL #8    ;// Repack second output row [b3 b2 b1 b0]
        M_STR   Acc2, [pDst], dstStep       ;// Store result & adjust pointer
        ADD     pSrc, pSrc, srcStep, LSL #1 ;// Advance source two rows for next iteration

        SUBS    Counter, Counter, #1
        BGT     TwoRowsLoop
End
        ;// Restore pSrc/pDst to their entry values (4 rows were consumed/written)
        SUB     pDst, pDst, dstStep, LSL #2
        SUB     pSrc, pSrc, srcStep, LSL #2

        M_END

        ENDIF

        END