;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        EXPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe

        M_VARIANTS ARM1136JS



    IF ARM1136JS


        ;// Stack locals (allocated through the armCOMM_s.h macros).
        ;// ppSrc and ppDst are allocated but never referenced below; they are
        ;// kept so that the stack frame layout is unchanged.
        M_ALLOC8 ppDstArgs, 8           ;// caller's pDst/dstStep pair, saved across stage 1
        M_ALLOC8 pTempResult1, 8        ;// stashed Acc0/Acc1 of an even-Counter pass
        M_ALLOC8 pTempResult2, 8        ;// stashed Acc2/Acc3 of an even-Counter pass
        M_ALLOC4 ppSrc, 4
        M_ALLOC4 ppDst, 4
        M_ALLOC4 pDstStep, 4            ;// stage 1 dstStep (16); r3 is reused as Temp1
        M_ALLOC4 pSrcStep, 4            ;// caller's srcStep; r1 is reused as Temp2/ValI
        M_ALLOC4 pCounter, 4            ;// stage 2 loop counter; r11 is reused as ValHF

        ;// Function header
        ;// Function:
        ;//     armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
        ;//
        ;// Implements diagonal interpolation for a block of size 4x4. Input and output should
        ;// be aligned.
        ;//
        ;// The work is done in two stages:
        ;//   Stage 1 (HeightLoop):  6-tap filter (1,-5,20,20,-5,1) along each source row.
        ;//       Two rows are filtered per pass and five passes are made, so 9 bytes are
        ;//       read from each of 10 consecutive rows starting at pSrc. The 16-bit
        ;//       results, each biased by 16*255, are written to the intermediate buffer
        ;//       as five 16-byte rows (80 bytes in total).
        ;//   Stage 2 (HeightLoop1): the same 6-tap filter applied down each column of the
        ;//       intermediate results, followed by bias removal, rounding, saturation to
        ;//       0..255 and a byte store. One output column (4 bytes) is written per pass.
        ;//
        ;// Registers used as input for this function
        ;// r0,r1,r2,r3, r8 where r0,r2 input pointer and r1,r3 step size, r8 intermediate-buf pointer
        ;//
        ;// Registers preserved for top level function
        ;// r0,r1,r2,r3,r4,r5,r6,r14
        ;//
        ;// Registers modified by the function
        ;// r7,r8,r9,r10,r11,r12
        ;//
        ;// Output registers
        ;// None. Function will preserve r0-r3
        ;//
        ;// NOTE(review): in the code below only r2 (pDst) and r3 (dstStep) are brought
        ;// back to their entry values. At M_END r0 (pSrc) points at the start of the
        ;// intermediate buffer and r1 (srcStep) holds 16. Confirm that callers do not
        ;// rely on r0/r1 keeping their entry values.
        ;// NOTE(review): which registers M_START/M_END save and restore for the "r6"
        ;// argument is defined in armCOMM_s.h, not here - presumably r4-r6 and lr; confirm.

        M_START armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe, r6

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

;// Declare inner loop registers
;// Many of the names below alias the same physical register. The instruction
;// order in both loops depends on this aliasing (e.g. Acc2 overwrites ValC and
;// Acc3 overwrites ValD), so instructions must not be reordered casually.
Acc0            RN 4
Acc1            RN 5
Acc2            RN 6
Acc3            RN 7

ValA            RN 4
ValB            RN 5
ValC            RN 6
ValD            RN 7
ValE            RN 8
ValF            RN 9
ValG            RN 12
ValH            RN 14
ValI            RN 1

Temp1           RN 3
Temp2           RN 1
Temp3           RN 12
Temp4           RN 7
Temp5           RN 5
r0x0fe00fe0     RN 3                                    ;// [0 (16*255 - 16) 0 (16*255 - 16)]
r0x00ff00ff     RN 10                                   ;// [0 255 0 255] where 255 is offset
Counter         RN 11
pInterBuf       RN 8

ValCA           RN 8
ValDB           RN 9
ValGE           RN 10
ValHF           RN 11
r0x00140001     RN 12
r0x0014fffb     RN 14

r0x0001fc00     RN 11

Accx            RN 8
Accy            RN 9
Temp6           RN 14

        ;// Save the caller's destination; stage 1 writes to the intermediate buffer instead
        M_STRD      pDst, dstStep, ppDstArgs

        MOV         pDst, pInterBuf
        MOV         dstStep, #16                        ;// intermediate buffer row pitch in bytes

        ;// Counter runs 4,3,2,1,0 (loop ends on BPL failing): five passes of two rows each
        MOV         Counter, #4
        M_STR       dstStep, pDstStep
        M_STR       srcStep, pSrcStep
        LDR         r0x00ff00ff, =0x00ff00ff            ;// [0 255 0 255] 255 is offset to avoid negative results

;//
;// Stage 1: filter along the rows, two source rows per pass
;//
HeightLoop
NextTwoRowsLoop
        LDR         ValD, [pSrc, srcStep]               ;// Load row 1 [d1 c1 b1 a1]
        LDR         ValA, [pSrc], #4                    ;// Load row 0 [d0 c0 b0 a0]
        LDR         ValH, [pSrc, srcStep]               ;// Load row 1 [h1 g1 f1 e1]
        LDR         ValE, [pSrc], #4                    ;// Load row 0 [h0 g0 f0 e0]
        LDRB        Temp2, [pSrc, srcStep]              ;// Load single byte i1 of row 1
        LDRB        Temp1, [pSrc], #-8                  ;// Load single byte i0 of row 0; pSrc back to row start

        ;// Regroup so that each register holds the same columns of both rows.
        ;// In every packed value the low halfword belongs to row 0, the high one to row 1.
        PKHBT       ValB, ValA, ValD, LSL #16           ;// [b1 a1 b0 a0]
        PKHTB       ValD, ValD, ValA, ASR #16           ;// [d1 c1 d0 c0]
        UXTAB16     ValA, r0x00ff00ff, ValB             ;// [00 a1 00 a0] + [0 255 0 255]
        UXTAB16     ValC, r0x00ff00ff, ValD             ;// [00 c1 00 c0] + [0 255 0 255]
        PKHBT       ValI, Temp1, Temp2, LSL #16         ;// [00 i1 00 i0]
        PKHBT       ValF, ValE, ValH, LSL #16           ;// [f1 e1 f0 e0]
        PKHTB       ValH, ValH, ValE, ASR #16           ;// [h1 g1 h0 g0]
        UXTAB16     ValE, r0x00ff00ff, ValF             ;// [00 e1 00 e0] + [0 255 0 255]

        ;// Each accumulator is built as (outer pair) + 5 * (4 * (centre pair) - (inner pair)).
        ;// Every pair sum carries one 255 offset, so each result ends up biased by 16*255.

        ;// Calculate Acc0
        ;// Acc0 = a - 5*b + 20*c + 20*d - 5*e + f
        UXTAB16     Temp1, ValC, ValD, ROR #8           ;// c + d + 255
        UXTAB16     Temp3, ValE, ValB, ROR #8           ;// e + b + 255
        RSB         Temp1, Temp3, Temp1, LSL #2         ;// 4*(c + d) - (b + e) + 3*255
        UXTAB16     Acc0, ValA, ValF, ROR #8            ;// a + f + 255 (overwrites ValA)
        ADD         Temp1, Temp1, Temp1, LSL #2         ;// times 5
        ADD         Acc0, Acc0, Temp1

        ;// Calculate Acc1
        ;// Acc1 = b - 5*c + 20*d + 20*e - 5*f + g
        UXTAB16     Temp1, ValE, ValD, ROR #8           ;// e + d + 255
        UXTAB16     Temp3, ValC, ValF, ROR #8           ;// c + f + 255
        RSB         Temp1, Temp3, Temp1, LSL #2         ;// 4*(d + e) - (c + f) + 3*255
        UXTAB16     ValG, r0x00ff00ff, ValH             ;// [00 g1 00 g0] + [0 255 0 255] (reuses Temp3's register)
        ADD         Temp1, Temp1, Temp1, LSL #2         ;// times 5
        UXTAB16     Acc1, ValG, ValB, ROR #8            ;// g + b + 255 (overwrites ValB)
        ADD         Acc1, Acc1, Temp1

        UXTAB16     Acc2, ValC, ValH, ROR #8            ;// c + h + 255 (overwrites ValC)
        ADD         ValI, r0x00ff00ff, ValI             ;// [00 i1 00 i0] + [0 255 0 255]

        ;// Calculate Acc2
        ;// Acc2 = c - 5*d + 20*e + 20*f - 5*g + h
        UXTAB16     Temp1, ValG, ValD, ROR #8           ;// g + d + 255
        UXTAB16     Acc3, ValI, ValD, ROR #8            ;// i + d + 255 (overwrites ValD; last use of ValD)
        UXTAB16     Temp2, ValE, ValF, ROR #8           ;// e + f + 255 (overwrites ValI)

        RSB         Temp1, Temp1, Temp2, LSL #2         ;// 4*(e + f) - (d + g) + 3*255
        UXTAB16     Temp2, ValG, ValF, ROR #8           ;// g + f + 255, prepared for Acc3
        ADD         Temp1, Temp1, Temp1, LSL #2         ;// times 5
        ADD         Acc2, Acc2, Temp1

        ;// Calculate Acc3
        ;// Acc3 = d - 5*e + 20*f + 20*g - 5*h + i
        UXTAB16     Temp1, ValE, ValH, ROR #8           ;// e + h + 255
        RSB         Temp1, Temp1, Temp2, LSL #2         ;// 4*(f + g) - (e + h) + 3*255
        ADD         Temp1, Temp1, Temp1, LSL #2         ;// times 5
        ADD         Acc3, Acc3, Temp1

        ;// r3 and r1 were used as Temp1/Temp2 above; reload the real step values
        M_LDR       dstStep, pDstStep
        M_LDR       srcStep, pSrcStep

        ;// If Counter is even stash Acc0-Acc3 in the temporary stack slots.
        ;// If Counter is odd combine Acc0-Acc3 with the stashed Acc0-Acc3 of the previous
        ;// pass and store both to the intermediate buffer.
        ANDS        Temp3, Counter, #1
        BEQ         NoProcessing

        ;// Packing previous and current Acc0-Acc3 values.
        ;// Suffix 0,1 = rows of the previous pass, 2,3 = rows of the current pass.
        ;// Word at [pDst]           = [row2 row0]
        ;// Word at [pDst + dstStep] = [row3 row1]
        M_LDRD      Accx, Accy, pTempResult1
        PKHBT       Temp6, Accx, Acc0, LSL #16          ;//[0 a2 0 a0] = [0 a3 0 a2] [0 a1 0 a0]
        PKHTB       Acc0, Acc0, Accx, ASR #16           ;//[0 a3 0 a1] = [0 a1 0 a0] [0 a3 0 a2]
        STR         Acc0, [pDst, dstStep]
        STR         Temp6, [pDst], #4
        PKHBT       Temp6, Accy, Acc1, LSL #16          ;//[0 b2 0 b0] = [0 b3 0 b2] [0 b1 0 b0]
        PKHTB       Acc1, Acc1, Accy, ASR #16           ;//[0 b3 0 b1] = [0 b1 0 b0] [0 b3 0 b2]
        M_LDRD      Accx, Accy, pTempResult2
        STR         Acc1, [pDst, dstStep]
        STR         Temp6, [pDst], #4

        PKHBT       Temp6, Accx, Acc2, LSL #16          ;//[0 c2 0 c0] = [0 c3 0 c2] [0 c1 0 c0]
        PKHTB       Acc2, Acc2, Accx, ASR #16           ;//[0 c3 0 c1] = [0 c1 0 c0] [0 c3 0 c2]
        STR         Acc2, [pDst, dstStep]
        STR         Temp6, [pDst], #4
        PKHBT       Temp6, Accy, Acc3, LSL #16          ;//[0 d2 0 d0] = [0 d3 0 d2] [0 d1 0 d0]
        PKHTB       Acc3, Acc3, Accy, ASR #16           ;//[0 d3 0 d1] = [0 d1 0 d0] [0 d3 0 d2]
        STR         Acc3, [pDst, dstStep]
        STR         Temp6, [pDst], #-12                 ;// back to the start of this buffer row
        ADD         pDst, pDst, dstStep, LSL #1         ;// skip the two buffer rows just written
        B           AfterStore

NoProcessing
        M_STRD      Acc0, Acc1, pTempResult1
        M_STRD      Acc2, Acc3, pTempResult2
AfterStore
        SUBS        Counter, Counter, #1                ;// Five passes in total, i.e. 10 source rows
        ADD         pSrc, pSrc, srcStep, LSL #1         ;// advance two source rows (flags untouched)
        BPL         HeightLoop

        ;// The last pass (Counter == 0) took the NoProcessing path, so Acc0-Acc3 still
        ;// hold rows 8 and 9. Store them as the fifth buffer row; stage 2 only reads
        ;// the low halfword (row 8) of each word.
        STR         Acc0, [pDst], #4                    ;//[0 a1 0 a0]
        STR         Acc1, [pDst], #4
        STR         Acc2, [pDst], #4
        STR         Acc3, [pDst], #-12

        ;//
        ;// Stage 2: filter down the columns of the intermediate results using multiplication
        ;//

        SUB         pSrc, pDst, dstStep, LSL #2         ;// pSrc = start of the intermediate buffer
        MOV         srcStep, #16
        M_LDRD      pDst, dstStep, ppDstArgs            ;// restore the caller's pDst/dstStep

        MOV         Counter, #4                         ;// four output columns
        LDR         r0x0014fffb, =0x0014fffb            ;// [20 | -5]
        LDR         r0x00140001, =0x00140001            ;// [20 |  1]

HeightLoop1
        M_STR       Counter, pCounter                   ;// r11 is needed for ValHF below

        ;// a..i denote the nine stage 1 results of one column, top to bottom
        M_LDR       ValCA, [pSrc], srcStep              ;// Load [0 c 0 a]
        M_LDR       ValDB, [pSrc], srcStep              ;// Load [0 d 0 b]
        M_LDR       ValGE, [pSrc], srcStep              ;// Load [0 g 0 e]
        M_LDR       ValHF, [pSrc], srcStep              ;// Load [0 h 0 f]


        ;// Acc0 = a - 5*b + 20*c + 20*d - 5*e + f
        ;// Acc1 = b - 5*c + 20*d + 20*e - 5*f + g
        ;// Acc2 = c - 5*d + 20*e + 20*f - 5*g + h
        ;// Acc3 = d - 5*e + 20*f + 20*g - 5*h + i
        ;// The X forms pair the top halfword of one operand with the bottom of the other.

        SMUAD       Acc0, ValCA, r0x00140001            ;// Acc0  = 20*c + a
        SMUAD       Acc1, ValDB, r0x00140001            ;// Acc1  = 20*d + b
        SMUADX      Acc2, ValGE, r0x0014fffb            ;// Acc2  = 20*e - 5*g
        SMUAD       Acc3, ValGE, r0x0014fffb            ;// Acc3  = 20*g - 5*e

        SMLAD       Acc0, ValDB, r0x0014fffb, Acc0      ;// Acc0 += 20*d - 5*b
        SMLADX      Acc1, ValGE, r0x00140001, Acc1      ;// Acc1 += 20*e + g
        SMLADX      Acc2, ValHF, r0x00140001, Acc2      ;// Acc2 += 20*f + h
        SMLADX      Acc3, ValHF, r0x0014fffb, Acc3      ;// Acc3 += 20*f - 5*h

        SMLABB      Acc0, ValGE, r0x0014fffb, Acc0      ;// Acc0 += -5*e
        SMLATB      Acc1, ValCA, r0x0014fffb, Acc1      ;// Acc1 += -5*c
        SMLATB      Acc2, ValCA, r0x00140001, Acc2      ;// Acc2 += c
        SMLATB      Acc3, ValDB, r0x00140001, Acc3      ;// Acc3 += d

        LDRH        ValCA, [pSrc], #4                   ;// Load i (low halfword of the fifth buffer row); step to the next column word
        SMLABB      Acc0, ValHF, r0x00140001, Acc0      ;// Acc0 += f
        SMLABB      Acc1, ValHF, r0x0014fffb, Acc1      ;// Acc1 += -5*f
        SMLATB      Acc2, ValDB, r0x0014fffb, Acc2      ;// Acc2 += -5*d
        SMLABB      Acc3, ValCA, r0x00140001, Acc3      ;// Acc3 += i

        ;// Remove the accumulated bias (16*255 per input, filter taps sum to 32) but
        ;// leave +512 so that the shift by 10 below rounds to nearest.
        ;// The constant is reloaded every pass because r11 is shared with ValHF and Counter.
        LDR         r0x0001fc00, =0x0001fc00            ;// (0xff * 16 * 32) - 512
        SUB         Acc0, Acc0, r0x0001fc00
        SUB         Acc1, Acc1, r0x0001fc00
        SUB         Acc2, Acc2, r0x0001fc00
        SUB         Acc3, Acc3, r0x0001fc00

        ;// Clamp to 18 unsigned bits so that the shift by 10 yields 0..255
        USAT        Acc0, #18, Acc0
        USAT        Acc1, #18, Acc1
        USAT        Acc2, #18, Acc2
        USAT        Acc3, #18, Acc3

        ;// Write the four results down one destination column
        MOV         Acc0, Acc0, LSR #10
        M_STRB      Acc0, [pDst], dstStep
        MOV         Acc1, Acc1, LSR #10
        M_STRB      Acc1, [pDst], dstStep
        MOV         Acc2, Acc2, LSR #10
        M_STRB      Acc2, [pDst], dstStep
        MOV         Acc3, Acc3, LSR #10
        M_STRB      Acc3, [pDst], dstStep


        M_LDR       Counter, pCounter
        SUB         pDst, pDst, dstStep, LSL #2         ;// back to the top destination row
        SUB         pSrc, pSrc, srcStep, LSL #2         ;// back to the top buffer row (next column)
        ADD         pDst, pDst, #1                      ;// next destination column
        SUBS        Counter, Counter, #1
        BGT         HeightLoop1
End
        SUB         pDst, pDst, #4                      ;// pDst back to its entry value
        SUB         pSrc, pSrc, #16                     ;// pSrc = start of the intermediate buffer

        M_END

    ENDIF

        END
