1/* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 18#include "rsCpuIntrinsic.h" 19#include "rsCpuIntrinsicInlines.h" 20 21namespace android { 22namespace renderscript { 23 24 25class RsdCpuScriptIntrinsicConvolve3x3 : public RsdCpuScriptIntrinsic { 26public: 27 void populateScript(Script *) override; 28 void invokeFreeChildren() override; 29 30 void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override; 31 void setGlobalObj(uint32_t slot, ObjectBase *data) override; 32 33 ~RsdCpuScriptIntrinsicConvolve3x3() override; 34 RsdCpuScriptIntrinsicConvolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *); 35 36protected: 37 float mFp[16]; 38 short mIp[16]; 39 ObjectBaseRef<const Allocation> mAlloc; 40 ObjectBaseRef<const Element> mElement; 41 42 static void kernelU1(const RsExpandKernelDriverInfo *info, 43 uint32_t xstart, uint32_t xend, 44 uint32_t outstep); 45 static void kernelU2(const RsExpandKernelDriverInfo *info, 46 uint32_t xstart, uint32_t xend, 47 uint32_t outstep); 48 static void kernelU4(const RsExpandKernelDriverInfo *info, 49 uint32_t xstart, uint32_t xend, 50 uint32_t outstep); 51 static void kernelF1(const RsExpandKernelDriverInfo *info, 52 uint32_t xstart, uint32_t xend, 53 uint32_t outstep); 54 static void kernelF2(const RsExpandKernelDriverInfo *info, 55 uint32_t xstart, uint32_t xend, 56 uint32_t outstep); 57 static void kernelF4(const RsExpandKernelDriverInfo *info, 58 uint32_t xstart, uint32_t xend, 59 uint32_t outstep); 60}; 61 62void RsdCpuScriptIntrinsicConvolve3x3::setGlobalObj(uint32_t slot, ObjectBase *data) { 63 rsAssert(slot == 1); 64 mAlloc.set(static_cast<Allocation *>(data)); 65} 66 67void RsdCpuScriptIntrinsicConvolve3x3::setGlobalVar(uint32_t slot, const void *data, 68 size_t dataLength) { 69 rsAssert(slot == 0); 70 memcpy (&mFp, data, dataLength); 71 for(int ct=0; ct < 9; ct++) { 72 if (mFp[ct] >= 0) { 73 mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f); 74 } else { 75 mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f); 76 } 77 } 78} 79 80extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0, const void *y1, 81 const void *y2, const short *coef, uint32_t count); 82 83 84static void ConvolveOneU4(const RsExpandKernelDriverInfo *info, uint32_t x, uchar4 *out, 85 const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, 86 const float* coeff) { 87 88 uint32_t x1 = rsMax((int32_t)x-1, 0); 89 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1); 90 91 float4 px = convert_float4(py0[x1]) * coeff[0] + 92 convert_float4(py0[x]) * coeff[1] + 93 convert_float4(py0[x2]) * coeff[2] + 94 convert_float4(py1[x1]) * coeff[3] + 95 convert_float4(py1[x]) * coeff[4] + 96 convert_float4(py1[x2]) * coeff[5] + 97 convert_float4(py2[x1]) * coeff[6] + 98 convert_float4(py2[x]) * coeff[7] + 99 convert_float4(py2[x2]) * coeff[8]; 100 101 px = clamp(px + 0.5f, 0.f, 255.f); 102 uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w}; 103 *out = o; 104} 105 106static void ConvolveOneU2(const RsExpandKernelDriverInfo *info, uint32_t x, uchar2 *out, 107 const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, 108 const float* coeff) { 109 110 uint32_t x1 = rsMax((int32_t)x-1, 0); 111 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1); 112 113 float2 px = convert_float2(py0[x1]) * coeff[0] + 114 convert_float2(py0[x]) * coeff[1] + 115 convert_float2(py0[x2]) * coeff[2] + 116 convert_float2(py1[x1]) * coeff[3] + 117 convert_float2(py1[x]) * coeff[4] + 118 convert_float2(py1[x2]) * coeff[5] + 119 convert_float2(py2[x1]) * coeff[6] + 120 convert_float2(py2[x]) * coeff[7] + 121 convert_float2(py2[x2]) * coeff[8]; 122 123 px = clamp(px + 0.5f, 0.f, 255.f); 124 *out = convert_uchar2(px); 125} 126 127static void ConvolveOneU1(const RsExpandKernelDriverInfo *info, uint32_t x, uchar *out, 128 const uchar *py0, const uchar *py1, const uchar *py2, 129 const float* coeff) { 130 131 uint32_t x1 = rsMax((int32_t)x-1, 0); 132 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1); 133 134 float px = ((float)py0[x1]) * coeff[0] + 135 ((float)py0[x]) * coeff[1] + 136 ((float)py0[x2]) * coeff[2] + 137 ((float)py1[x1]) * coeff[3] + 138 ((float)py1[x]) * coeff[4] + 139 ((float)py1[x2]) * coeff[5] + 140 ((float)py2[x1]) * coeff[6] + 141 ((float)py2[x]) * coeff[7] + 142 ((float)py2[x2]) * coeff[8]; 143 *out = clamp(px + 0.5f, 0.f, 255.f); 144} 145 146static void ConvolveOneF4(const RsExpandKernelDriverInfo *info, uint32_t x, float4 *out, 147 const float4 *py0, const float4 *py1, const float4 *py2, 148 const float* coeff) { 149 150 uint32_t x1 = rsMax((int32_t)x-1, 0); 151 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1); 152 *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) + 153 (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) + 154 (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]); 155} 156 157static void ConvolveOneF2(const RsExpandKernelDriverInfo *info, uint32_t x, float2 *out, 158 const float2 *py0, const float2 *py1, const float2 *py2, 159 const float* coeff) { 160 161 uint32_t x1 = rsMax((int32_t)x-1, 0); 162 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1); 163 *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) + 164 (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) + 165 (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]); 166} 167 168static void ConvolveOneF1(const RsExpandKernelDriverInfo *info, uint32_t x, float *out, 169 const float *py0, const float *py1, const float *py2, 170 const float* coeff) { 171 172 uint32_t x1 = rsMax((int32_t)x-1, 0); 173 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1); 174 *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) + 175 (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) + 176 (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]); 177} 178 179void RsdCpuScriptIntrinsicConvolve3x3::kernelU4(const RsExpandKernelDriverInfo *info, 180 uint32_t xstart, uint32_t xend, 181 uint32_t outstep) { 182 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr; 183 184 if (!cp->mAlloc.get()) { 185 ALOGE("Convolve3x3 executed without input, skipping"); 186 return; 187 } 188 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 189 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 190 191 uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1)); 192 uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0); 193 const uchar4 *py0 = (const uchar4 *)(pin + stride * y2); 194 const uchar4 *py1 = (const uchar4 *)(pin + stride * info->current.y); 195 const uchar4 *py2 = (const uchar4 *)(pin + stride * y1); 196 197 uchar4 *out = (uchar4 *)info->outPtr[0]; 198 uint32_t x1 = xstart; 199 uint32_t x2 = xend; 200 if(x1 == 0) { 201 ConvolveOneU4(info, 0, out, py0, py1, py2, cp->mFp); 202 x1 ++; 203 out++; 204 } 205 206 if(x2 > x1) { 207#if defined(ARCH_ARM_USE_INTRINSICS) || defined(ARCH_X86_HAVE_SSSE3) 208 if (gArchUseSIMD) { 209 int32_t len = (x2 - x1 - 1) >> 1; 210 if(len > 0) { 211 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 212 x1 += len << 1; 213 out += len << 1; 214 } 215 } 216#endif 217 218 while(x1 != x2) { 219 ConvolveOneU4(info, x1, out, py0, py1, py2, cp->mFp); 220 out++; 221 x1++; 222 } 223 } 224} 225 226void RsdCpuScriptIntrinsicConvolve3x3::kernelU2(const RsExpandKernelDriverInfo *info, 227 uint32_t xstart, uint32_t xend, 228 uint32_t outstep) { 229 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr; 230 231 if (!cp->mAlloc.get()) { 232 ALOGE("Convolve3x3 executed without input, skipping"); 233 return; 234 } 235 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 236 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 237 238 uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1)); 239 uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0); 240 const uchar2 *py0 = (const uchar2 *)(pin + stride * y2); 241 const uchar2 *py1 = (const uchar2 *)(pin + stride * info->current.y); 242 const uchar2 *py2 = (const uchar2 *)(pin + stride * y1); 243 244 uchar2 *out = (uchar2 *)info->outPtr[0]; 245 uint32_t x1 = xstart; 246 uint32_t x2 = xend; 247 if(x1 == 0) { 248 ConvolveOneU2(info, 0, out, py0, py1, py2, cp->mFp); 249 x1 ++; 250 out++; 251 } 252 253 if(x2 > x1) { 254#if 0//defined(ARCH_ARM_HAVE_NEON) 255 int32_t len = (x2 - x1 - 1) >> 1; 256 if(len > 0) { 257 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 258 x1 += len << 1; 259 out += len << 1; 260 } 261#endif 262 263 while(x1 != x2) { 264 ConvolveOneU2(info, x1, out, py0, py1, py2, cp->mFp); 265 out++; 266 x1++; 267 } 268 } 269} 270 271void RsdCpuScriptIntrinsicConvolve3x3::kernelU1(const RsExpandKernelDriverInfo *info, 272 uint32_t xstart, uint32_t xend, 273 uint32_t outstep) { 274 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr; 275 276 if (!cp->mAlloc.get()) { 277 ALOGE("Convolve3x3 executed without input, skipping"); 278 return; 279 } 280 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 281 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 282 283 uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1)); 284 uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0); 285 const uchar *py0 = (const uchar *)(pin + stride * y2); 286 const uchar *py1 = (const uchar *)(pin + stride * info->current.y); 287 const uchar *py2 = (const uchar *)(pin + stride * y1); 288 289 uchar *out = (uchar *)info->outPtr[0]; 290 uint32_t x1 = xstart; 291 uint32_t x2 = xend; 292 if(x1 == 0) { 293 ConvolveOneU1(info, 0, out, py0, py1, py2, cp->mFp); 294 x1 ++; 295 out++; 296 } 297 298 if(x2 > x1) { 299#if 0//defined(ARCH_ARM_HAVE_NEON) 300 int32_t len = (x2 - x1 - 1) >> 1; 301 if(len > 0) { 302 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 303 x1 += len << 1; 304 out += len << 1; 305 } 306#endif 307 308 while(x1 != x2) { 309 ConvolveOneU1(info, x1, out, py0, py1, py2, cp->mFp); 310 out++; 311 x1++; 312 } 313 } 314} 315 316void RsdCpuScriptIntrinsicConvolve3x3::kernelF4(const RsExpandKernelDriverInfo *info, 317 uint32_t xstart, uint32_t xend, 318 uint32_t outstep) { 319 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr; 320 321 if (!cp->mAlloc.get()) { 322 ALOGE("Convolve3x3 executed without input, skipping"); 323 return; 324 } 325 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 326 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 327 328 uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1)); 329 uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0); 330 const float4 *py0 = (const float4 *)(pin + stride * y2); 331 const float4 *py1 = (const float4 *)(pin + stride * info->current.y); 332 const float4 *py2 = (const float4 *)(pin + stride * y1); 333 334 float4 *out = (float4 *)info->outPtr[0]; 335 uint32_t x1 = xstart; 336 uint32_t x2 = xend; 337 if(x1 == 0) { 338 ConvolveOneF4(info, 0, out, py0, py1, py2, cp->mFp); 339 x1 ++; 340 out++; 341 } 342 343 if(x2 > x1) { 344#if 0//defined(ARCH_ARM_HAVE_NEON) 345 int32_t len = (x2 - x1 - 1) >> 1; 346 if(len > 0) { 347 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 348 x1 += len << 1; 349 out += len << 1; 350 } 351#endif 352 353 while(x1 != x2) { 354 ConvolveOneF4(info, x1, out, py0, py1, py2, cp->mFp); 355 out++; 356 x1++; 357 } 358 } 359} 360 361void RsdCpuScriptIntrinsicConvolve3x3::kernelF2(const RsExpandKernelDriverInfo *info, 362 uint32_t xstart, uint32_t xend, 363 uint32_t outstep) { 364 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr; 365 366 if (!cp->mAlloc.get()) { 367 ALOGE("Convolve3x3 executed without input, skipping"); 368 return; 369 } 370 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 371 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 372 373 uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1)); 374 uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0); 375 const float2 *py0 = (const float2 *)(pin + stride * y2); 376 const float2 *py1 = (const float2 *)(pin + stride * info->current.y); 377 const float2 *py2 = (const float2 *)(pin + stride * y1); 378 379 float2 *out = (float2 *)info->outPtr[0]; 380 uint32_t x1 = xstart; 381 uint32_t x2 = xend; 382 if(x1 == 0) { 383 ConvolveOneF2(info, 0, out, py0, py1, py2, cp->mFp); 384 x1 ++; 385 out++; 386 } 387 388 if(x2 > x1) { 389#if 0//defined(ARCH_ARM_HAVE_NEON) 390 int32_t len = (x2 - x1 - 1) >> 1; 391 if(len > 0) { 392 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 393 x1 += len << 1; 394 out += len << 1; 395 } 396#endif 397 398 while(x1 != x2) { 399 ConvolveOneF2(info, x1, out, py0, py1, py2, cp->mFp); 400 out++; 401 x1++; 402 } 403 } 404} 405void RsdCpuScriptIntrinsicConvolve3x3::kernelF1(const RsExpandKernelDriverInfo *info, 406 uint32_t xstart, uint32_t xend, 407 uint32_t outstep) { 408 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr; 409 410 if (!cp->mAlloc.get()) { 411 ALOGE("Convolve3x3 executed without input, skipping"); 412 return; 413 } 414 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 415 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 416 417 uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1)); 418 uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0); 419 const float *py0 = (const float *)(pin + stride * y2); 420 const float *py1 = (const float *)(pin + stride * info->current.y); 421 const float *py2 = (const float *)(pin + stride * y1); 422 423 float *out = (float *)info->outPtr[0]; 424 uint32_t x1 = xstart; 425 uint32_t x2 = xend; 426 if(x1 == 0) { 427 ConvolveOneF1(info, 0, out, py0, py1, py2, cp->mFp); 428 x1 ++; 429 out++; 430 } 431 432 if(x2 > x1) { 433#if 0//defined(ARCH_ARM_HAVE_NEON) 434 int32_t len = (x2 - x1 - 1) >> 1; 435 if(len > 0) { 436 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 437 x1 += len << 1; 438 out += len << 1; 439 } 440#endif 441 442 while(x1 != x2) { 443 ConvolveOneF1(info, x1, out, py0, py1, py2, cp->mFp); 444 out++; 445 x1++; 446 } 447 } 448} 449 450RsdCpuScriptIntrinsicConvolve3x3::RsdCpuScriptIntrinsicConvolve3x3( 451 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) 452 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3) { 453 454 if (e->getType() == RS_TYPE_FLOAT_32) { 455 switch(e->getVectorSize()) { 456 case 1: 457 mRootPtr = &kernelF1; 458 break; 459 case 2: 460 mRootPtr = &kernelF2; 461 break; 462 case 3: 463 case 4: 464 mRootPtr = &kernelF4; 465 break; 466 } 467 } else { 468 switch(e->getVectorSize()) { 469 case 1: 470 mRootPtr = &kernelU1; 471 break; 472 case 2: 473 mRootPtr = &kernelU2; 474 break; 475 case 3: 476 case 4: 477 mRootPtr = &kernelU4; 478 break; 479 } 480 } 481 for(int ct=0; ct < 9; ct++) { 482 mFp[ct] = 1.f / 9.f; 483 mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f); 484 } 485} 486 487RsdCpuScriptIntrinsicConvolve3x3::~RsdCpuScriptIntrinsicConvolve3x3() { 488} 489 490void RsdCpuScriptIntrinsicConvolve3x3::populateScript(Script *s) { 491 s->mHal.info.exportedVariableCount = 2; 492} 493 494void RsdCpuScriptIntrinsicConvolve3x3::invokeFreeChildren() { 495 mAlloc.clear(); 496} 497 498RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) { 499 500 return new RsdCpuScriptIntrinsicConvolve3x3(ctx, s, e); 501} 502 503} // namespace renderscript 504} // namespace android 505