;// armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s revision 0c1bc742181ded4930842b46e9507372f0b1b963
;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        EXPORT armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe

        M_VARIANTS ARM1136JS

    IF ARM1136JS

        ;// Scratch slots declared with the armCOMM_s.h allocation macros
        ;// (presumably stack locals -- confirm against armCOMM_s.h):
        ;//   ppDstArgs  8 bytes  caller's pDst / dstStep pair
        ;//   ppSrc      4 bytes  pSrc at the top of the current column group
        ;//   ppDst      4 bytes  pDst at the top of the current column group
        ;//   pCounter   4 bytes  row counter of the horizontal pass
        M_ALLOC8 ppDstArgs, 8
        M_ALLOC4 ppSrc, 4
        M_ALLOC4 ppDst, 4
        M_ALLOC4 pCounter, 4

        ;//--------------------------------------------------------------------
        ;// Function:
        ;//     armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
        ;//
        ;// Diagonal interpolation of one 4x4 luma block with the 6-tap filter
        ;// (1, -5, 20, 20, -5, 1), applied vertically first and horizontally
        ;// second.  Input and output should be aligned (all row accesses to
        ;// the source, the intermediate buffer and the destination are word
        ;// loads/stores, plus one halfword load per intermediate row).
        ;//
        ;// Pass 1 (vertical) : filters 12 source columns into 4 rows of biased
        ;//                     16-bit sums in the intermediate buffer.
        ;// Pass 2 (horizontal): filters those sums, removes the bias, rounds,
        ;//                     divides by 1024, clamps to 0..255 and stores one
        ;//                     packed word (4 pixels) per destination row.
        ;//
        ;// Input registers
        ;//     r0  pSrc       source pointer; pass 1 reads rows
        ;//                    pSrc - 2*srcStep .. pSrc + 6*srcStep,
        ;//                    12 bytes from each row
        ;//     r1  srcStep    source row stride in bytes
        ;//     r2  pDst       destination pointer
        ;//     r3  dstStep    destination row stride in bytes
        ;//     r8  pInterBuf  intermediate buffer; 4 rows of 24 bytes are written
        ;//
        ;// Registers preserved for top level function (as stated by the
        ;// original header)
        ;//     r0,r1,r2,r3,r4,r5,r6,r14
        ;//
        ;// Registers modified by the function (as stated by the original header)
        ;//     r7,r8,r9,r10,r11,r12
        ;//
        ;// Output registers
        ;//     None. The original header states "Function will preserve r0-r3".
        ;//
        ;// NOTE(review): in the code below r2/r3 are reloaded from ppDstArgs and
        ;// pDst is stepped back, but r0 is left pointing at the first
        ;// intermediate row and r1 is left holding 24.  r4-r6 and r14 are also
        ;// written in the body, so their preservation depends on what
        ;// M_START/M_END save and restore -- confirm against armCOMM_s.h and
        ;// the callers.
        ;//--------------------------------------------------------------------

        M_START armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe, r6

;// Input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

;// Vertical-pass registers.  Rows are named a..g from top to bottom; many
;// aliases share one physical register, so the instruction order in the
;// vertical loop must not be changed without re-checking every live range.
ValA            RN 5
ValA0           RN 4
ValA1           RN 5
ValAF0          RN 4
ValAF1          RN 5

ValB            RN 11

ValC            RN 5
ValC0           RN 4
ValC1           RN 5
ValCD0          RN 12
ValCD1          RN 14
ValCF0          RN 4
ValCF1          RN 5

ValD            RN 10

ValE            RN 7
ValE0           RN 6
ValE1           RN 7
ValEB0          RN 10
ValEB1          RN 11
ValED0          RN 6
ValED1          RN 7

ValF            RN 10

ValG            RN 14
ValG0           RN 12
ValG1           RN 14
ValGB0          RN 12
ValGB1          RN 14

;// Accumulators, used by both passes
Acc0            RN 4
Acc1            RN 5
Acc2            RN 6
Acc3            RN 7

Temp            RN 7
Step            RN 6                    ;// not referenced below

pInterBuf       RN 8                    ;// consumed before Counter takes over r8
Counter         RN 8
r0x00ff00ff     RN 9                    ;// per-halfword offset of 255
r0x0001fc00     RN 10                   ;// 32*16*255 - 512: bias minus rounding term

;// Horizontal-pass registers
ValCA           RN 8
ValDB           RN 9
ValGE           RN 10
ValHF           RN 11
r0x00140001     RN 12                   ;// halfwords [20, 1]
r0x0014fffb     RN 14                   ;// halfwords [20, -5]

r0x00000200     RN 12                   ;// not referenced below
r0x000000ff     RN 12                   ;// not referenced below

        ;//
        ;// Pass 1: vertical interpolation into the intermediate buffer
        ;//

        ;// Save the caller's destination arguments, then aim pDst/dstStep at
        ;// the intermediate buffer, whose rows are 24 bytes apart.
        M_STRD      pDst, dstStep, ppDstArgs
        MOV         pDst, pInterBuf
        MOV         dstStep, #24

        ;// Packed loop counter [0] [0] [H] [W]:
        ;//   H = 1 -> HeightLoop runs twice, two output rows per run (4 rows)
        ;//   W = 8 -> WidthLoop runs three times, 4 columns per run (12 columns)
        MOV         Counter, #1
        MOV         Temp, #8
        ADD         Counter, Temp, Counter, LSL #8          ;// Counter = (H << 8) | W

        ;// Adding 255 to every halfword keeps each 16-bit lane non-negative
        ;// through the packed subtractions below.
        LDR         r0x00ff00ff, =0x00ff00ff
WidthLoop
        ;// Remember where this column group starts so that the pointers can be
        ;// rewound once its four rows are finished.
        M_STR       pSrc, ppSrc
        M_STR       pDst, ppDst
HeightLoop
TwoRowsLoop
        ;// On entry pSrc addresses row c.  Even columns are kept in the *0
        ;// registers and odd columns (source rotated by 8) in the *1 registers.
        M_LDR       ValC, [pSrc], srcStep                   ;// row c: [c3 c2 c1 c0]
        M_LDR       ValD, [pSrc], srcStep                   ;// row d: [d3 d2 d1 d0]
        M_LDR       ValE, [pSrc], srcStep                   ;// row e: [e3 e2 e1 e0]
        SUB         pSrc, pSrc, srcStep, LSL #2             ;// pSrc -> row b
        UXTAB16     ValC0, r0x00ff00ff, ValC                ;// Off + [c2 c0]
        UXTAB16     ValC1, r0x00ff00ff, ValC, ROR #8        ;// Off + [c3 c1]
        LDR         ValB, [pSrc]                            ;// row b: [b3 b2 b1 b0]
        UXTAB16     ValE0, r0x00ff00ff, ValE                ;// Off + [e2 e0]
        UXTAB16     ValE1, r0x00ff00ff, ValE, ROR #8        ;// Off + [e3 e1]
        UXTAB16     ValCD0, ValC0, ValD                     ;// Off + C + D (even)
        UXTAB16     ValCD1, ValC1, ValD, ROR #8             ;// Off + C + D (odd)
        UXTAB16     ValEB0, ValE0, ValB                     ;// Off + E + B (even); reuses ValD's register
        RSB         ValCD0, ValEB0, ValCD0, LSL #2          ;// 4*(Off+C+D) - (Off+B+E)

        LDR         ValD, [pSrc, srcStep, LSL #1]           ;// reload row d (register was reused)
        UXTAB16     ValEB1, ValE1, ValB, ROR #8             ;// Off + E + B (odd); reuses ValB's register
        RSB         ValCD1, ValEB1, ValCD1, LSL #2          ;// 4*(Off+C+D) - (Off+B+E)

        UXTAB16     ValED0, ValE0, ValD                     ;// Off + E + D (even)
        UXTAB16     ValED1, ValE1, ValD, ROR #8             ;// Off + E + D (odd)
        LDR         ValF, [pSrc, srcStep, LSL #2]           ;// row f: [f3 f2 f1 f0]
        M_LDR       ValB, [pSrc], srcStep                   ;// reload row b; pSrc -> row c
        ADD         ValCD0, ValCD0, ValCD0, LSL #2          ;// 5 * [4*(Off+C+D) - (Off+B+E)]
        ADD         ValCD1, ValCD1, ValCD1, LSL #2
        UXTAB16     ValCF1, ValC1, ValF, ROR #8             ;// Off + C + F (odd)
        UXTAB16     ValCF0, ValC0, ValF                     ;// Off + C + F (even)
        RSB         ValED1, ValCF1, ValED1, LSL #2          ;// 4*(Off+E+D) - (Off+C+F)

        SUB         ValA, pSrc, srcStep, LSL #1             ;// address of row a (two rows above c)
        LDR         ValA, [ValA]                            ;// row a: [a3 a2 a1 a0]
        RSB         ValED0, ValCF0, ValED0, LSL #2          ;// 4*(Off+E+D) - (Off+C+F)
        ADD         ValED1, ValED1, ValED1, LSL #2          ;// 5 * [4*(Off+E+D) - (Off+C+F)]
        ADD         ValED0, ValED0, ValED0, LSL #2
        UXTAB16     ValA0, r0x00ff00ff, ValA                ;// Off + [a2 a0]
        UXTAB16     ValA1, r0x00ff00ff, ValA, ROR #8        ;// Off + [a3 a1]
        UXTAB16     ValAF0, ValA0, ValF                     ;// Off + A + F (even)
        UXTAB16     ValAF1, ValA1, ValF, ROR #8             ;// Off + A + F (odd)
        ADD         Acc1, ValCD1, ValAF1                    ;// first output row, odd columns

        LDR         ValG, [pSrc, srcStep, LSL #2]           ;// row g: [g3 g2 g1 g0]
        ADD         Acc0, ValCD0, ValAF0                    ;// 16*Off + (A+F) + 20*(C+D) - 5*(B+E)
        STR         Acc1, [pDst, #4]                        ;// odd columns go to the second word
        M_STR       Acc0, [pDst], dstStep                   ;// even columns; step to next intermediate row
        UXTAB16     ValG0, r0x00ff00ff, ValG                ;// Off + [g2 g0]
        UXTAB16     ValG1, r0x00ff00ff, ValG, ROR #8        ;// Off + [g3 g1]
        UXTAB16     ValGB0, ValG0, ValB                     ;// Off + G + B (even)
        UXTAB16     ValGB1, ValG1, ValB, ROR #8             ;// Off + G + B (odd)
        ADD         Acc2, ValED0, ValGB0                    ;// 16*Off + (B+G) + 20*(D+E) - 5*(C+F)
        ADD         Acc3, ValED1, ValGB1                    ;// second output row, odd columns

        STR         Acc3, [pDst, #4]                        ;// odd columns go to the second word
        M_STR       Acc2, [pDst], dstStep                   ;// even columns; step to next intermediate row

        SUBS        Counter, Counter, #1 << 8               ;// one pair of rows done; negative when H is used up
        ADD         pSrc, pSrc, srcStep, LSL #1             ;// old row e becomes the next row c (flags untouched)
        BPL         HeightLoop

        ;// Column group finished: rewind to its first row and move 4 source
        ;// bytes / 8 intermediate bytes to the right.
        M_LDR       pSrc, ppSrc
        M_LDR       pDst, ppDst
        ADDS        Counter, Counter, #(1 << 8)-4           ;// undo the extra H borrow, W -= 4; negative when all 12 columns are done
        ADD         pSrc, pSrc, #4
        ADD         pDst, pDst, #8
        ADD         Counter, Counter, #1<<8                 ;// re-arm H for the next group (flags untouched)
        BPL         WidthLoop

        ;//
        ;// Pass 2: horizontal interpolation using multiplication
        ;//
        ;// The intermediate values of a row are named a..i from left to right.
        ;// Pass 1 stored them as words [c a] [d b] [g e] [h f], followed by i
        ;// in the low halfword of the next word.  Each row yields
        ;//     Acc0 = a - 5*b + 20*c + 20*d - 5*e + f
        ;//     Acc1 = b - 5*c + 20*d + 20*e - 5*f + g
        ;//     Acc2 = c - 5*d + 20*e + 20*f - 5*g + h
        ;//     Acc3 = d - 5*e + 20*f + 20*g - 5*h + i
        ;//

        SUB         pSrc, pDst, #24                         ;// pDst is 24 bytes past the buffer start here
        MOV         srcStep, #24                            ;// intermediate row stride
        M_LDRD      pDst, dstStep, ppDstArgs

        MOV         Counter, #4                             ;// four output rows
        LDR         r0x0014fffb, =0x0014fffb                ;// halfwords [20, -5]
        LDR         r0x00140001, =0x00140001                ;// halfwords [20, 1]

HeightLoop1
        ;// Counter shares its register with ValCA, so it is parked in memory
        ;// while the row is being filtered.
        M_STR       Counter, pCounter


        LDR         ValCA, [pSrc], #4                       ;// [c a]
        LDR         ValDB, [pSrc], #4                       ;// [d b]
        LDR         ValGE, [pSrc], #4                       ;// [g e]
        LDR         ValHF, [pSrc], #4                       ;// [h f]

        SMUAD       Acc0, ValCA, r0x00140001                ;// Acc0  = a*1  + c*20
        SMUAD       Acc1, ValDB, r0x00140001                ;// Acc1  = b*1  + d*20
        SMUADX      Acc2, ValGE, r0x0014fffb                ;// Acc2  = e*20 + g*(-5)
        SMUAD       Acc3, ValGE, r0x0014fffb                ;// Acc3  = e*(-5) + g*20

        SMLAD       Acc0, ValDB, r0x0014fffb, Acc0          ;// Acc0 += b*(-5) + d*20
        SMLADX      Acc1, ValGE, r0x00140001, Acc1          ;// Acc1 += e*20 + g*1
        SMLADX      Acc2, ValHF, r0x00140001, Acc2          ;// Acc2 += f*20 + h*1
        SMLADX      Acc3, ValHF, r0x0014fffb, Acc3          ;// Acc3 += f*20 + h*(-5)

        SMLABB      Acc0, ValGE, r0x0014fffb, Acc0          ;// Acc0 += e*(-5)
        SMLATB      Acc1, ValCA, r0x0014fffb, Acc1          ;// Acc1 += c*(-5)
        SMLATB      Acc2, ValCA, r0x00140001, Acc2          ;// Acc2 += c*1
        SMLATB      Acc3, ValDB, r0x00140001, Acc3          ;// Acc3 += d*1

        LDRH        ValCA, [pSrc], #8                       ;// i; 8 = srcStep - 16, so pSrc lands on the next row
        SMLABB      Acc0, ValHF, r0x00140001, Acc0          ;// Acc0 += f*1
        SMLABB      Acc1, ValHF, r0x0014fffb, Acc1          ;// Acc1 += f*(-5)
        SMLATB      Acc2, ValDB, r0x0014fffb, Acc2          ;// Acc2 += d*(-5)
        SMLABB      Acc3, ValCA, r0x00140001, Acc3          ;// Acc3 += i*1

        ;// The six taps sum to 32 and every intermediate value carries a bias
        ;// of 16*255, so each accumulator carries 32*16*255.  Subtracting 512
        ;// less than that leaves the rounding term for the shift by 10.
        ;// The constant shares its register with ValGE, which is dead by now,
        ;// and therefore has to be reloaded on every row.
        LDR         r0x0001fc00, =0x0001fc00                ;// (0xff * 16 * 32) - 512
        SUB         Acc0, Acc0, r0x0001fc00
        SUB         Acc1, Acc1, r0x0001fc00
        SUB         Acc2, Acc2, r0x0001fc00
        SUB         Acc3, Acc3, r0x0001fc00

        ;// Clamp to 18 unsigned bits so that the shift below yields 0..255.
        USAT        Acc0, #18, Acc0
        USAT        Acc1, #18, Acc1
        USAT        Acc2, #18, Acc2
        USAT        Acc3, #18, Acc3

        MOV         Acc0, Acc0, LSR #10                     ;// divide by 1024
        MOV         Acc1, Acc1, LSR #10
        MOV         Acc2, Acc2, LSR #10
        MOV         Acc3, Acc3, LSR #10

        ;// Pack the four pixels into one word (Acc0 in the lowest byte) and
        ;// store them as a destination row.
        M_LDR       Counter, pCounter
        ORR         Acc0, Acc0, Acc1, LSL #8
        ORR         Acc2, Acc2, Acc3, LSL #8
        SUBS        Counter, Counter, #1                    ;// flags consumed by the BGT below
        ORR         Acc0, Acc0, Acc2, LSL #16
        M_STR       Acc0, [pDst], dstStep
        BGT         HeightLoop1
End
        ;// Both pointers advanced by four rows in HeightLoop1; step them back.
        SUB         pDst, pDst, dstStep, LSL #2             ;// back to the first destination row
        SUB         pSrc, pSrc, srcStep, LSL #2             ;// back to the first intermediate row (srcStep is 24 here)

        M_END

    ENDIF

        END