armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s revision 0c1bc742181ded4930842b46e9507372f0b1b963
;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        EXPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe

        M_VARIANTS ARM1136JS



    IF ARM1136JS

        ;// Scratch slots declared through the M_ALLOC* macros of armCOMM_s.h
        ;// (presumably carved out of the frame that M_START sets up - confirm
        ;// against armCOMM_s.h).
        ;//   ppDstArgs     : caller's pDst/dstStep pair, parked during pass 1
        ;//   pTempResult1  : Acc0/Acc1 of the previous row pair
        ;//   pTempResult2  : Acc2/Acc3 of the previous row pair
        ;//   pDstStep      : stride of the intermediate buffer (16)
        ;//   pSrcStep      : caller's srcStep (r1 is reused as a temporary)
        ;//   pCounter      : pass-2 loop counter (r11 is reused inside the loop)
        ;//   ppSrc, ppDst  : allocated but not referenced in this routine
        M_ALLOC8 ppDstArgs, 8
        M_ALLOC8 pTempResult1, 8
        M_ALLOC8 pTempResult2, 8
        M_ALLOC4 ppSrc, 4
        M_ALLOC4 ppDst, 4
        M_ALLOC4 pDstStep, 4
        M_ALLOC4 pSrcStep, 4
        M_ALLOC4 pCounter, 4

        ;// Function header
        ;// Function:
        ;//     armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
        ;//
        ;// Implements diagonal interpolation for a block of size 4x4. Input and output should
        ;// be aligned.
        ;//
        ;// The work is done in two passes:
        ;//   Pass 1 (HeightLoop)  : applies the 6-tap filter (1,-5,20,20,-5,1)
        ;//       horizontally, two source rows per iteration, and writes the
        ;//       biased 16-bit results to the intermediate buffer passed in r8.
        ;//       Five iterations are run, so ten source rows of nine bytes each
        ;//       are read starting at pSrc.
        ;//   Pass 2 (HeightLoop1) : applies the same 6-tap filter vertically to
        ;//       the intermediate values, one output column per iteration,
        ;//       removes the bias, rounds, saturates and stores 4x4 bytes to pDst.
        ;//
        ;// Registers used as input for this function
        ;// r0,r1,r2,r3, r8 where r0,r2  input pointer and r1,r3 step size, r8 intermediate-buf pointer
        ;//
        ;// Registers preserved for top level function
        ;// r0,r1,r2,r3,r4,r5,r6,r14
        ;//
        ;// Registers modified by the function
        ;// r7,r8,r9,r10,r11,r12
        ;//
        ;// Output registers
        ;// None. Function will preserve r0-r3
        ;//
        ;// NOTE(review): in the code below r2/r3 are reloaded from ppDstArgs and
        ;// end up holding their entry values, but at M_END r0 points at the start
        ;// of the intermediate buffer and r1 holds 16. Unless M_END restores them,
        ;// "preserve r0-r3" only holds for r2/r3 - confirm what callers rely on.

        M_START armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe, r6

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

;// Declare inner loop registers
;// Many aliases below share a physical register; the instruction order in
;// both passes relies on each value being dead before its register is reused.
Acc0            RN 4
Acc1            RN 5
Acc2            RN 6
Acc3            RN 7

ValA            RN 4
ValB            RN 5
ValC            RN 6
ValD            RN 7
ValE            RN 8
ValF            RN 9
ValG            RN 12
ValH            RN 14
ValI            RN 1

Temp1           RN 3                    ;// shares r3 with dstStep
Temp2           RN 1                    ;// shares r1 with srcStep and ValI
Temp3           RN 12
Temp4           RN 7                    ;// not referenced in this routine
Temp5           RN 5                    ;// not referenced in this routine
r0x0fe00fe0     RN 3                    ;// [0 (16*255 - 16) 0 (16*255 - 16)]; not referenced in this routine
r0x00ff00ff     RN 10                   ;// [0 255 0 255] where 255 is offset
Counter         RN 11
pInterBuf       RN 8

;// Pass-2 aliases: each holds two 16-bit intermediate samples of one column
ValCA           RN 8
ValDB           RN 9
ValGE           RN 10
ValHF           RN 11
r0x00140001     RN 12                   ;// packed coefficients [20 | 1]
r0x0014fffb     RN 14                   ;// packed coefficients [20 | -5]

r0x0001fc00     RN 11                   ;// shares r11 with Counter and ValHF

Accx            RN 8
Accy            RN 9
Temp6           RN 14

        ;// Park the caller's destination; pass 1 writes to the intermediate
        ;// buffer instead, using a fixed stride of 16 bytes.
        M_STRD      pDst, dstStep, ppDstArgs

        MOV         pDst, pInterBuf
        MOV         dstStep, #16

        ;// Counter starts at 4 and the loop below runs while the decremented
        ;// value is non-negative (BPL): five iterations of two source rows each.
        MOV         Counter, #4
        M_STR       dstStep, pDstStep
        M_STR       srcStep, pSrcStep
        LDR         r0x00ff00ff, =0x00ff00ff               ;// [0 255 0 255] 255 is offset to avoid negative results

;// ---------------------------------------------------------------------
;// Pass 1: horizontal 6-tap filter on two rows at a time.
;// Each Acc register holds two 16-bit lanes: row 0 in the bottom halfword,
;// row 1 in the top halfword. a..i are the nine consecutive source bytes
;// of a row.
;// ---------------------------------------------------------------------
HeightLoop
NextTwoRowsLoop
        LDR     ValD, [pSrc, srcStep]                   ;// Load row 1 [d1 c1 b1 a1]
        LDR     ValA, [pSrc], #4                        ;// Load row 0 [d0 c0 b0 a0]
        LDR     ValH, [pSrc, srcStep]                   ;// Load [h1 g1 f1 e1]
        LDR     ValE, [pSrc], #4                        ;// Load [h0 g0 f0 e0]
        LDRB    Temp2, [pSrc, srcStep]                  ;// Load row 1 byte i1 only; overwrites srcStep (r1), reloaded below
        LDRB    Temp1, [pSrc], #-8                      ;// Load row 0 byte i0 only; rewinds pSrc to the row start; overwrites dstStep (r3)

        ;// Interleave the two rows so every register carries the same
        ;// column pair for row 0 (bottom halfword) and row 1 (top halfword).
        PKHBT   ValB, ValA, ValD, LSL #16               ;// [b1 a1 b0 a0]
        PKHTB   ValD, ValD, ValA, ASR #16               ;// [d1 c1 d0 c0]
        UXTAB16 ValA, r0x00ff00ff, ValB                 ;// [00 a1 00 a0] + [0 255 0 255]
        UXTAB16 ValC, r0x00ff00ff, ValD                 ;// [00 c1 00 c0] + [0 255 0 255]
        PKHBT   ValI, Temp1, Temp2, LSL #16             ;// [00 i1 00 i0]
        PKHBT   ValF, ValE, ValH, LSL #16               ;// [f1 e1 f0 e0]
        PKHTB   ValH, ValH, ValE, ASR #16               ;// [h1 g1 h0 g0]
        UXTAB16 ValE, r0x00ff00ff, ValF                 ;// [00 e1 00 e0] + [0 255 0 255]

        ;// In the four blocks below the 20/-5 taps are formed as
        ;// 5 * (4*(inner pair) - (outer pair)). Because a, c, e, g and i each
        ;// carry +255, every finished lane equals the filter sum + 16*255.

        ;// Calculate Acc0
        ;// Acc0 = a - 5*b + 20*c + 20*d - 5*e + f
        UXTAB16 Temp1, ValC, ValD, ROR #8               ;// c + d
        UXTAB16 Temp3, ValE, ValB, ROR #8               ;// e + b
        RSB     Temp1, Temp3, Temp1, LSL #2             ;// 4*(c + d) - (b + e)
        UXTAB16 Acc0, ValA, ValF, ROR #8                ;// a + f (ValA is dead from here: Acc0 shares r4)
        ADD     Temp1, Temp1, Temp1, LSL #2             ;// times 5
        ADD     Acc0, Acc0, Temp1

        ;// Calculate Acc1
        ;// Acc1 = b - 5*c + 20*d + 20*e - 5*f + g
        UXTAB16 Temp1, ValE, ValD, ROR #8               ;// e + d
        UXTAB16 Temp3, ValC, ValF, ROR #8               ;// c + f
        RSB     Temp1, Temp3, Temp1, LSL #2             ;// 4*(d + e) - (c + f)
        UXTAB16 ValG, r0x00ff00ff, ValH                 ;// [00 g1 00 g0] + [0 255 0 255] (reuses r12 after Temp3 is consumed)
        ADD     Temp1, Temp1, Temp1, LSL #2             ;// times 5
        UXTAB16 Acc1, ValG, ValB, ROR #8                ;// g + b (ValB is dead from here: Acc1 shares r5)
        ADD     Acc1, Acc1, Temp1

        UXTAB16 Acc2, ValC, ValH, ROR #8                ;// c + h (ValC is dead from here: Acc2 shares r6)
        ADD     ValI, r0x00ff00ff, ValI                 ;// [00 i1 00 i0] + [0 255 0 255]

        ;// Calculate Acc2
        ;// Acc2 = c - 5*d + 20*e + 20*f - 5*g + h
        UXTAB16 Temp1, ValG, ValD, ROR #8               ;// g + d
        UXTAB16 Acc3, ValI, ValD, ROR #8                ;// i + d, first term of Acc3 (ValD is dead from here: Acc3 shares r7)
        UXTAB16 Temp2, ValE, ValF, ROR #8               ;// e + f (overwrites ValI, already consumed)

        RSB     Temp1, Temp1, Temp2, LSL #2             ;// 4*(e + f) - (d + g)
        UXTAB16 Temp2, ValG, ValF, ROR #8               ;// g + f, prepared for Acc3
        ADD     Temp1, Temp1, Temp1, LSL #2             ;// times 5
        ADD     Acc2, Acc2, Temp1

        ;// Calculate Acc3
        ;// Acc3 = d - 5*e + 20*f + 20*g - 5*h + i
        UXTAB16 Temp1, ValE, ValH, ROR #8               ;// e + h
        RSB     Temp1, Temp1, Temp2, LSL #2             ;// 4*(f + g) - (e + h)
        ADD     Temp1, Temp1, Temp1, LSL #2             ;// times 5
        ADD     Acc3, Acc3, Temp1

        ;// r3 and r1 were used as Temp1/Temp2 above; restore the strides.
        M_LDR   dstStep, pDstStep
        M_LDR   srcStep, pSrcStep

        ;// If Counter is even store Acc0-Acc3 in a temporary buffer
        ;// If Counter is odd store Acc0-Acc3 and previous Acc0-Acc3 in a intermediate buf
        ANDS    Temp3, Counter, #1
        BEQ     NoProcessing

        ;// Packing previous and current Acc0-Acc3 values
        ;// In the comments below the letter is the output column and the digit
        ;// the row within the current group of four rows. Even rows go to the
        ;// buffer row at pDst, odd rows to the buffer row at pDst + dstStep, so
        ;// that pass 2 finds the row pairs it needs in a single word.
        M_LDRD  Accx, Accy, pTempResult1                ;// previous Acc0, Acc1
        PKHBT   Temp6, Accx, Acc0, LSL #16              ;//[0 a2 0 a0] = [0 a3 0 a2] [0 a1 0 a0]
        PKHTB   Acc0, Acc0, Accx, ASR #16               ;//[0 a3 0 a1] = [0 a1 0 a0] [0 a3 0 a2]
        STR     Acc0, [pDst, dstStep]
        STR     Temp6, [pDst], #4
        PKHBT   Temp6, Accy, Acc1, LSL #16              ;//[0 b2 0 b0] = [0 b3 0 b2] [0 b1 0 b0]
        PKHTB   Acc1, Acc1, Accy, ASR #16               ;//[0 b3 0 b1] = [0 b1 0 b0] [0 b3 0 b2]
        M_LDRD  Accx, Accy, pTempResult2                ;// previous Acc2, Acc3
        STR     Acc1, [pDst, dstStep]
        STR     Temp6, [pDst], #4

        PKHBT   Temp6, Accx, Acc2, LSL #16              ;//[0 c2 0 c0] = [0 c3 0 c2] [0 c1 0 c0]
        PKHTB   Acc2, Acc2, Accx, ASR #16               ;//[0 c3 0 c1] = [0 c1 0 c0] [0 c3 0 c2]
        STR     Acc2, [pDst, dstStep]
        STR     Temp6, [pDst], #4
        PKHBT   Temp6, Accy, Acc3, LSL #16              ;//[0 d2 0 d0] = [0 d3 0 d2] [0 d1 0 d0]
        PKHTB   Acc3, Acc3, Accy, ASR #16               ;//[0 d3 0 d1] = [0 d1 0 d0] [0 d3 0 d2]
        STR     Acc3, [pDst, dstStep]
        STR     Temp6, [pDst], #-12                     ;// back to the first word of this buffer row
        ADD     pDst, pDst, dstStep, LSL #1             ;// advance two buffer rows
        B       AfterStore

NoProcessing
        ;// Even Counter: keep this row pair until the next one arrives.
        M_STRD  Acc0, Acc1, pTempResult1
        M_STRD  Acc2, Acc3, pTempResult2
AfterStore
        SUBS    Counter, Counter, #1                    ;// Loop till height is 10 (5 iterations x 2 rows)
        ADD     pSrc, pSrc, srcStep, LSL #1             ;// next row pair; ADD leaves the SUBS flags intact
        BPL     HeightLoop

        ;// The last iteration (Counter = 0) took the NoProcessing path, so its
        ;// row pair is written here unpacked as the fifth buffer row. Pass 2
        ;// only reads the bottom halfword of each of these words.
        STR     Acc0, [pDst], #4                        ;//[0 a1 0 a0]
        STR     Acc1, [pDst], #4
        STR     Acc2, [pDst], #4
        STR     Acc3, [pDst], #-12

        ;//
        ;// Horizontal interpolation using multiplication
        ;//
        ;// Pass 2: 6-tap filter down each column of the intermediate buffer.
        ;// From here on a..i name nine consecutive intermediate rows of one
        ;// column. The buffer rows hold them as [c a], [d b], [g e], [h f] and
        ;// i (bottom halfword of the fifth row).

        SUB     pSrc, pDst, dstStep, LSL #2             ;// rewind four buffer rows to the buffer start
        MOV     srcStep, #16                            ;// buffer stride
        M_LDRD  pDst, dstStep, ppDstArgs                ;// caller's destination pointer and stride

        MOV     Counter, #4                             ;// four output columns
        LDR     r0x0014fffb, =0x0014fffb
        LDR     r0x00140001, =0x00140001

HeightLoop1
        M_STR   Counter, pCounter                       ;// r11 is reused as ValHF and r0x0001fc00 below

        M_LDR   ValCA, [pSrc], srcStep                   ;// Load  [0 c 0 a]
        M_LDR   ValDB, [pSrc], srcStep                   ;// Load  [0 d 0 b]
        M_LDR   ValGE, [pSrc], srcStep                   ;// Load  [0 g 0 e]
        M_LDR   ValHF, [pSrc], srcStep                   ;// Load  [0 h 0 f]


        ;// Each accumulator is built from dual 16x16 multiplies against the
        ;// packed coefficient words [20 | 1] and [20 | -5] (the X forms swap
        ;// the coefficient halves), plus single halfword multiplies:
        ;// Acc0 = a - 5*b + 20*c + 20*d - 5*e + f
        ;// Acc1 = b - 5*c + 20*d + 20*e - 5*f + g
        ;// Acc2 = c - 5*d + 20*e + 20*f - 5*g + h
        ;// Acc3 = d - 5*e + 20*f + 20*g - 5*h + i

        SMUAD   Acc0, ValCA, r0x00140001            ;// Acc0  = [0 c 0 a] * [0 20 0 1]   = 20*c + a
        SMUAD   Acc1, ValDB, r0x00140001            ;// Acc1  = [0 d 0 b] * [0 20 0 1]   = 20*d + b
        SMUADX  Acc2, ValGE, r0x0014fffb            ;// Acc2  = [0 g 0 e] * [0 -5 0 20]  = -5*g + 20*e
        SMUAD   Acc3, ValGE, r0x0014fffb            ;// Acc3  = [0 g 0 e] * [0 20 0 -5]  = 20*g - 5*e

        SMLAD   Acc0, ValDB, r0x0014fffb, Acc0      ;// Acc0 += [0 d 0 b] * [0 20 0 -5]  = 20*d - 5*b
        SMLADX  Acc1, ValGE, r0x00140001, Acc1      ;// Acc1 += [0 g 0 e] * [0 1 0 20]   = g + 20*e
        SMLADX  Acc2, ValHF, r0x00140001, Acc2      ;// Acc2 += [0 h 0 f] * [0 1 0 20]   = h + 20*f
        SMLADX  Acc3, ValHF, r0x0014fffb, Acc3      ;// Acc3 += [0 h 0 f] * [0 -5 0 20]  = -5*h + 20*f

        SMLABB  Acc0, ValGE, r0x0014fffb, Acc0      ;// Acc0 += e * -5
        SMLATB  Acc1, ValCA, r0x0014fffb, Acc1      ;// Acc1 += c * -5
        SMLATB  Acc2, ValCA, r0x00140001, Acc2      ;// Acc2 += c * 1
        SMLATB  Acc3, ValDB, r0x00140001, Acc3      ;// Acc3 += d * 1

        LDRH    ValCA, [pSrc], #4                   ;// Load i from the fifth buffer row; step pSrc to the next column word
        SMLABB  Acc0, ValHF, r0x00140001, Acc0      ;// Acc0 += f * 1
        SMLABB  Acc1, ValHF, r0x0014fffb, Acc1      ;// Acc1 += f * -5
        SMLATB  Acc2, ValDB, r0x0014fffb, Acc2      ;// Acc2 += d * -5
        SMLABB  Acc3, ValCA, r0x00140001, Acc3      ;// Acc3 += i * 1

        ;// The coefficients sum to 32 and every input carries 16*255, so each
        ;// accumulator is biased by 0xff * 16 * 32. Subtracting 512 less than
        ;// that leaves +512, i.e. half of 1024, as rounding for the LSR #10.
        ;// ValHF (r11) is dead by now, so r11 can take the constant.
        LDR     r0x0001fc00, =0x0001fc00            ;// (0xff * 16 * 32) - 512
        SUB     Acc0, Acc0, r0x0001fc00
        SUB     Acc1, Acc1, r0x0001fc00
        SUB     Acc2, Acc2, r0x0001fc00
        SUB     Acc3, Acc3, r0x0001fc00

        ;// Clamp to 18 unsigned bits so that the shift by 10 yields 0..255.
        USAT    Acc0, #18, Acc0
        USAT    Acc1, #18, Acc1
        USAT    Acc2, #18, Acc2
        USAT    Acc3, #18, Acc3

        ;// Store the four results down one destination column.
        MOV     Acc0, Acc0, LSR #10
        M_STRB  Acc0, [pDst], dstStep
        MOV     Acc1, Acc1, LSR #10
        M_STRB  Acc1, [pDst], dstStep
        MOV     Acc2, Acc2, LSR #10
        M_STRB  Acc2, [pDst], dstStep
        MOV     Acc3, Acc3, LSR #10
        M_STRB  Acc3, [pDst], dstStep


        M_LDR   Counter, pCounter
        SUB     pDst, pDst, dstStep, LSL #2         ;// back to the top destination row
        SUB     pSrc, pSrc, srcStep, LSL #2         ;// back to the first buffer row (next column, see LDRH above)
        ADD     pDst, pDst, #1                      ;// next destination column
        SUBS    Counter, Counter, #1
        BGT     HeightLoop1
End
        ;// Undo the four column steps taken by the loop above.
        SUB     pDst, pDst, #4                      ;// pDst back to its entry value
        SUB     pSrc, pSrc, #16                     ;// pSrc back to the start of the intermediate buffer

        M_END

    ENDIF

        END