// rsCpuIntrinsicConvolve3x3.cpp revision 3b35d775a777c36a178ce3fc97ff1e169aab3f1e
1/* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 18#include "rsCpuIntrinsic.h" 19#include "rsCpuIntrinsicInlines.h" 20 21using namespace android; 22using namespace android::renderscript; 23 24namespace android { 25namespace renderscript { 26 27 28class RsdCpuScriptIntrinsicConvolve3x3 : public RsdCpuScriptIntrinsic { 29public: 30 virtual void populateScript(Script *); 31 virtual void invokeFreeChildren(); 32 33 virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength); 34 virtual void setGlobalObj(uint32_t slot, ObjectBase *data); 35 36 virtual ~RsdCpuScriptIntrinsicConvolve3x3(); 37 RsdCpuScriptIntrinsicConvolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *); 38 39protected: 40 float mFp[16]; 41 short mIp[16]; 42 ObjectBaseRef<const Allocation> mAlloc; 43 ObjectBaseRef<const Element> mElement; 44 45 static void kernelU1(const RsForEachStubParamStruct *p, 46 uint32_t xstart, uint32_t xend, 47 uint32_t instep, uint32_t outstep); 48 static void kernelU2(const RsForEachStubParamStruct *p, 49 uint32_t xstart, uint32_t xend, 50 uint32_t instep, uint32_t outstep); 51 static void kernelU4(const RsForEachStubParamStruct *p, 52 uint32_t xstart, uint32_t xend, 53 uint32_t instep, uint32_t outstep); 54 static void kernelF1(const RsForEachStubParamStruct *p, 55 uint32_t xstart, uint32_t xend, 56 uint32_t instep, uint32_t outstep); 57 static void kernelF2(const 
RsForEachStubParamStruct *p, 58 uint32_t xstart, uint32_t xend, 59 uint32_t instep, uint32_t outstep); 60 static void kernelF4(const RsForEachStubParamStruct *p, 61 uint32_t xstart, uint32_t xend, 62 uint32_t instep, uint32_t outstep); 63}; 64 65} 66} 67 68 69void RsdCpuScriptIntrinsicConvolve3x3::setGlobalObj(uint32_t slot, ObjectBase *data) { 70 rsAssert(slot == 1); 71 mAlloc.set(static_cast<Allocation *>(data)); 72} 73 74void RsdCpuScriptIntrinsicConvolve3x3::setGlobalVar(uint32_t slot, const void *data, 75 size_t dataLength) { 76 rsAssert(slot == 0); 77 memcpy (&mFp, data, dataLength); 78 for(int ct=0; ct < 9; ct++) { 79 if (mFp[ct] >= 0) { 80 mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f); 81 } else { 82 mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f); 83 } 84 } 85} 86 87extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0, const void *y1, 88 const void *y2, const short *coef, uint32_t count); 89 90 91static void ConvolveOneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out, 92 const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, 93 const float* coeff) { 94 95 uint32_t x1 = rsMax((int32_t)x-1, 0); 96 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1); 97 98 float4 px = convert_float4(py0[x1]) * coeff[0] + 99 convert_float4(py0[x]) * coeff[1] + 100 convert_float4(py0[x2]) * coeff[2] + 101 convert_float4(py1[x1]) * coeff[3] + 102 convert_float4(py1[x]) * coeff[4] + 103 convert_float4(py1[x2]) * coeff[5] + 104 convert_float4(py2[x1]) * coeff[6] + 105 convert_float4(py2[x]) * coeff[7] + 106 convert_float4(py2[x2]) * coeff[8]; 107 108 px = clamp(px, 0.f, 255.f); 109 uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w}; 110 *out = o; 111} 112 113static void ConvolveOneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out, 114 const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, 115 const float* coeff) { 116 117 uint32_t x1 = rsMax((int32_t)x-1, 0); 118 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1); 119 120 float2 
px = convert_float2(py0[x1]) * coeff[0] + 121 convert_float2(py0[x]) * coeff[1] + 122 convert_float2(py0[x2]) * coeff[2] + 123 convert_float2(py1[x1]) * coeff[3] + 124 convert_float2(py1[x]) * coeff[4] + 125 convert_float2(py1[x2]) * coeff[5] + 126 convert_float2(py2[x1]) * coeff[6] + 127 convert_float2(py2[x]) * coeff[7] + 128 convert_float2(py2[x2]) * coeff[8]; 129 130 px = clamp(px, 0.f, 255.f); 131 *out = convert_uchar2(px); 132} 133 134static void ConvolveOneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out, 135 const uchar *py0, const uchar *py1, const uchar *py2, 136 const float* coeff) { 137 138 uint32_t x1 = rsMax((int32_t)x-1, 0); 139 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1); 140 141 float px = ((float)py0[x1]) * coeff[0] + 142 ((float)py0[x]) * coeff[1] + 143 ((float)py0[x2]) * coeff[2] + 144 ((float)py1[x1]) * coeff[3] + 145 ((float)py1[x]) * coeff[4] + 146 ((float)py1[x2]) * coeff[5] + 147 ((float)py2[x1]) * coeff[6] + 148 ((float)py2[x]) * coeff[7] + 149 ((float)py2[x2]) * coeff[8]; 150 *out = clamp(px, 0.f, 255.f); 151} 152 153static void ConvolveOneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out, 154 const float4 *py0, const float4 *py1, const float4 *py2, 155 const float* coeff) { 156 157 uint32_t x1 = rsMax((int32_t)x-1, 0); 158 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1); 159 *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) + 160 (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) + 161 (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]); 162} 163 164static void ConvolveOneF2(const RsForEachStubParamStruct *p, uint32_t x, float2 *out, 165 const float2 *py0, const float2 *py1, const float2 *py2, 166 const float* coeff) { 167 168 uint32_t x1 = rsMax((int32_t)x-1, 0); 169 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1); 170 *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) + 171 (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + 
(py1[x2] * coeff[5]) + 172 (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]); 173} 174 175static void ConvolveOneF1(const RsForEachStubParamStruct *p, uint32_t x, float *out, 176 const float *py0, const float *py1, const float *py2, 177 const float* coeff) { 178 179 uint32_t x1 = rsMax((int32_t)x-1, 0); 180 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1); 181 *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) + 182 (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) + 183 (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]); 184} 185 186void RsdCpuScriptIntrinsicConvolve3x3::kernelU4(const RsForEachStubParamStruct *p, 187 uint32_t xstart, uint32_t xend, 188 uint32_t instep, uint32_t outstep) { 189 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr; 190 191 if (!cp->mAlloc.get()) { 192 ALOGE("Convolve3x3 executed without input, skipping"); 193 return; 194 } 195 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 196 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 197 198 uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1)); 199 uint32_t y2 = rsMax((int32_t)p->y - 1, 0); 200 const uchar4 *py0 = (const uchar4 *)(pin + stride * y2); 201 const uchar4 *py1 = (const uchar4 *)(pin + stride * p->y); 202 const uchar4 *py2 = (const uchar4 *)(pin + stride * y1); 203 204 uchar4 *out = (uchar4 *)p->out; 205 uint32_t x1 = xstart; 206 uint32_t x2 = xend; 207 if(x1 == 0) { 208 ConvolveOneU4(p, 0, out, py0, py1, py2, cp->mFp); 209 x1 ++; 210 out++; 211 } 212 213 if(x2 > x1) { 214#if defined(ARCH_ARM_HAVE_NEON) 215 int32_t len = (x2 - x1 - 1) >> 1; 216 if(len > 0) { 217 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 218 x1 += len << 1; 219 out += len << 1; 220 } 221#endif 222 223 while(x1 != x2) { 224 ConvolveOneU4(p, x1, out, py0, py1, py2, cp->mFp); 225 out++; 226 x1++; 227 } 228 } 229} 230 231void 
RsdCpuScriptIntrinsicConvolve3x3::kernelU2(const RsForEachStubParamStruct *p, 232 uint32_t xstart, uint32_t xend, 233 uint32_t instep, uint32_t outstep) { 234 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr; 235 236 if (!cp->mAlloc.get()) { 237 ALOGE("Convolve3x3 executed without input, skipping"); 238 return; 239 } 240 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 241 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 242 243 uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1)); 244 uint32_t y2 = rsMax((int32_t)p->y - 1, 0); 245 const uchar2 *py0 = (const uchar2 *)(pin + stride * y2); 246 const uchar2 *py1 = (const uchar2 *)(pin + stride * p->y); 247 const uchar2 *py2 = (const uchar2 *)(pin + stride * y1); 248 249 uchar2 *out = (uchar2 *)p->out; 250 uint32_t x1 = xstart; 251 uint32_t x2 = xend; 252 if(x1 == 0) { 253 ConvolveOneU2(p, 0, out, py0, py1, py2, cp->mFp); 254 x1 ++; 255 out++; 256 } 257 258 if(x2 > x1) { 259#if 0//defined(ARCH_ARM_HAVE_NEON) 260 int32_t len = (x2 - x1 - 1) >> 1; 261 if(len > 0) { 262 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 263 x1 += len << 1; 264 out += len << 1; 265 } 266#endif 267 268 while(x1 != x2) { 269 ConvolveOneU2(p, x1, out, py0, py1, py2, cp->mFp); 270 out++; 271 x1++; 272 } 273 } 274} 275 276void RsdCpuScriptIntrinsicConvolve3x3::kernelU1(const RsForEachStubParamStruct *p, 277 uint32_t xstart, uint32_t xend, 278 uint32_t instep, uint32_t outstep) { 279 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr; 280 281 if (!cp->mAlloc.get()) { 282 ALOGE("Convolve3x3 executed without input, skipping"); 283 return; 284 } 285 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 286 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 287 288 uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1)); 289 uint32_t y2 = rsMax((int32_t)p->y - 1, 
0); 290 const uchar *py0 = (const uchar *)(pin + stride * y2); 291 const uchar *py1 = (const uchar *)(pin + stride * p->y); 292 const uchar *py2 = (const uchar *)(pin + stride * y1); 293 294 uchar *out = (uchar *)p->out; 295 uint32_t x1 = xstart; 296 uint32_t x2 = xend; 297 if(x1 == 0) { 298 ConvolveOneU1(p, 0, out, py0, py1, py2, cp->mFp); 299 x1 ++; 300 out++; 301 } 302 303 if(x2 > x1) { 304#if 0//defined(ARCH_ARM_HAVE_NEON) 305 int32_t len = (x2 - x1 - 1) >> 1; 306 if(len > 0) { 307 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 308 x1 += len << 1; 309 out += len << 1; 310 } 311#endif 312 313 while(x1 != x2) { 314 ConvolveOneU1(p, x1, out, py0, py1, py2, cp->mFp); 315 out++; 316 x1++; 317 } 318 } 319} 320 321void RsdCpuScriptIntrinsicConvolve3x3::kernelF4(const RsForEachStubParamStruct *p, 322 uint32_t xstart, uint32_t xend, 323 uint32_t instep, uint32_t outstep) { 324 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr; 325 326 if (!cp->mAlloc.get()) { 327 ALOGE("Convolve3x3 executed without input, skipping"); 328 return; 329 } 330 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 331 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 332 333 uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1)); 334 uint32_t y2 = rsMax((int32_t)p->y - 1, 0); 335 const float4 *py0 = (const float4 *)(pin + stride * y2); 336 const float4 *py1 = (const float4 *)(pin + stride * p->y); 337 const float4 *py2 = (const float4 *)(pin + stride * y1); 338 339 float4 *out = (float4 *)p->out; 340 uint32_t x1 = xstart; 341 uint32_t x2 = xend; 342 if(x1 == 0) { 343 ConvolveOneF4(p, 0, out, py0, py1, py2, cp->mFp); 344 x1 ++; 345 out++; 346 } 347 348 if(x2 > x1) { 349#if 0//defined(ARCH_ARM_HAVE_NEON) 350 int32_t len = (x2 - x1 - 1) >> 1; 351 if(len > 0) { 352 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 353 x1 += len << 1; 354 out += len << 1; 
355 } 356#endif 357 358 while(x1 != x2) { 359 ConvolveOneF4(p, x1, out, py0, py1, py2, cp->mFp); 360 out++; 361 x1++; 362 } 363 } 364} 365 366void RsdCpuScriptIntrinsicConvolve3x3::kernelF2(const RsForEachStubParamStruct *p, 367 uint32_t xstart, uint32_t xend, 368 uint32_t instep, uint32_t outstep) { 369 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr; 370 371 if (!cp->mAlloc.get()) { 372 ALOGE("Convolve3x3 executed without input, skipping"); 373 return; 374 } 375 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 376 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 377 378 uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1)); 379 uint32_t y2 = rsMax((int32_t)p->y - 1, 0); 380 const float2 *py0 = (const float2 *)(pin + stride * y2); 381 const float2 *py1 = (const float2 *)(pin + stride * p->y); 382 const float2 *py2 = (const float2 *)(pin + stride * y1); 383 384 float2 *out = (float2 *)p->out; 385 uint32_t x1 = xstart; 386 uint32_t x2 = xend; 387 if(x1 == 0) { 388 ConvolveOneF2(p, 0, out, py0, py1, py2, cp->mFp); 389 x1 ++; 390 out++; 391 } 392 393 if(x2 > x1) { 394#if 0//defined(ARCH_ARM_HAVE_NEON) 395 int32_t len = (x2 - x1 - 1) >> 1; 396 if(len > 0) { 397 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 398 x1 += len << 1; 399 out += len << 1; 400 } 401#endif 402 403 while(x1 != x2) { 404 ConvolveOneF2(p, x1, out, py0, py1, py2, cp->mFp); 405 out++; 406 x1++; 407 } 408 } 409} 410void RsdCpuScriptIntrinsicConvolve3x3::kernelF1(const RsForEachStubParamStruct *p, 411 uint32_t xstart, uint32_t xend, 412 uint32_t instep, uint32_t outstep) { 413 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr; 414 415 if (!cp->mAlloc.get()) { 416 ALOGE("Convolve3x3 executed without input, skipping"); 417 return; 418 } 419 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 420 const size_t stride = 
cp->mAlloc->mHal.drvState.lod[0].stride; 421 422 uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1)); 423 uint32_t y2 = rsMax((int32_t)p->y - 1, 0); 424 const float *py0 = (const float *)(pin + stride * y2); 425 const float *py1 = (const float *)(pin + stride * p->y); 426 const float *py2 = (const float *)(pin + stride * y1); 427 428 float *out = (float *)p->out; 429 uint32_t x1 = xstart; 430 uint32_t x2 = xend; 431 if(x1 == 0) { 432 ConvolveOneF1(p, 0, out, py0, py1, py2, cp->mFp); 433 x1 ++; 434 out++; 435 } 436 437 if(x2 > x1) { 438#if 0//defined(ARCH_ARM_HAVE_NEON) 439 int32_t len = (x2 - x1 - 1) >> 1; 440 if(len > 0) { 441 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 442 x1 += len << 1; 443 out += len << 1; 444 } 445#endif 446 447 while(x1 != x2) { 448 ConvolveOneF1(p, x1, out, py0, py1, py2, cp->mFp); 449 out++; 450 x1++; 451 } 452 } 453} 454 455RsdCpuScriptIntrinsicConvolve3x3::RsdCpuScriptIntrinsicConvolve3x3( 456 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) 457 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3) { 458 459 if (e->getType() == RS_TYPE_FLOAT_32) { 460 switch(e->getVectorSize()) { 461 case 1: 462 mRootPtr = &kernelF1; 463 break; 464 case 2: 465 mRootPtr = &kernelF2; 466 break; 467 case 3: 468 case 4: 469 mRootPtr = &kernelF4; 470 break; 471 } 472 } else { 473 switch(e->getVectorSize()) { 474 case 1: 475 mRootPtr = &kernelU1; 476 break; 477 case 2: 478 mRootPtr = &kernelU2; 479 break; 480 case 3: 481 case 4: 482 mRootPtr = &kernelU4; 483 break; 484 } 485 } 486 for(int ct=0; ct < 9; ct++) { 487 mFp[ct] = 1.f / 9.f; 488 mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f); 489 } 490} 491 492RsdCpuScriptIntrinsicConvolve3x3::~RsdCpuScriptIntrinsicConvolve3x3() { 493} 494 495void RsdCpuScriptIntrinsicConvolve3x3::populateScript(Script *s) { 496 s->mHal.info.exportedVariableCount = 2; 497} 498 499void RsdCpuScriptIntrinsicConvolve3x3::invokeFreeChildren() { 500 mAlloc.clear(); 
501} 502 503 504RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) { 505 506 return new RsdCpuScriptIntrinsicConvolve3x3(ctx, s, e); 507} 508 509 510