// rsCpuScriptGroup2.cpp revision bd0af2d161e36e52e6782ccb2d15dd5a36467704
1#include "rsCpuScriptGroup2.h" 2 3#include <dlfcn.h> 4#include <stdio.h> 5#include <stdlib.h> 6#include <unistd.h> 7 8#include <set> 9#include <sstream> 10#include <string> 11#include <vector> 12 13#ifndef RS_COMPATIBILITY_LIB 14#include "bcc/Config/Config.h" 15#endif 16 17#include "cpu_ref/rsCpuCore.h" 18#include "rsClosure.h" 19#include "rsContext.h" 20#include "rsCpuCore.h" 21#include "rsCpuExecutable.h" 22#include "rsCpuScript.h" 23#include "rsScript.h" 24#include "rsScriptGroup2.h" 25#include "rsScriptIntrinsic.h" 26 27using std::string; 28using std::vector; 29 30namespace android { 31namespace renderscript { 32 33namespace { 34 35const size_t DefaultKernelArgCount = 2; 36 37void groupRoot(const RsExpandKernelDriverInfo *kinfo, uint32_t xstart, 38 uint32_t xend, uint32_t outstep) { 39 const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kinfo->usr; 40 RsExpandKernelDriverInfo *mutable_kinfo = const_cast<RsExpandKernelDriverInfo *>(kinfo); 41 42 const size_t oldInLen = mutable_kinfo->inLen; 43 44 decltype(mutable_kinfo->inStride) oldInStride; 45 memcpy(&oldInStride, &mutable_kinfo->inStride, sizeof(oldInStride)); 46 47 for (CPUClosure* cpuClosure : closures) { 48 const Closure* closure = cpuClosure->mClosure; 49 50 // There had better be enough space in mutable_kinfo 51 rsAssert(closure->mNumArg <= RS_KERNEL_INPUT_LIMIT); 52 53 for (size_t i = 0; i < closure->mNumArg; i++) { 54 const void* arg = closure->mArgs[i]; 55 const Allocation* a = (const Allocation*)arg; 56 const uint32_t eStride = a->mHal.state.elementSizeBytes; 57 const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) + 58 eStride * xstart; 59 if (kinfo->dim.y > 1) { 60 ptr += a->mHal.drvState.lod[0].stride * kinfo->current.y; 61 } 62 mutable_kinfo->inPtr[i] = ptr; 63 mutable_kinfo->inStride[i] = eStride; 64 } 65 mutable_kinfo->inLen = closure->mNumArg; 66 67 const Allocation* out = closure->mReturnValue; 68 const uint32_t ostep = out->mHal.state.elementSizeBytes; 69 const uint8_t* 
ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) + 70 ostep * xstart; 71 if (kinfo->dim.y > 1) { 72 ptr += out->mHal.drvState.lod[0].stride * kinfo->current.y; 73 } 74 75 rsAssert(kinfo->outLen <= 1); 76 mutable_kinfo->outPtr[0] = const_cast<uint8_t*>(ptr); 77 78 cpuClosure->mFunc(kinfo, xstart, xend, ostep); 79 } 80 81 mutable_kinfo->inLen = oldInLen; 82 memcpy(&mutable_kinfo->inStride, &oldInStride, sizeof(oldInStride)); 83} 84 85} // namespace 86 87Batch::Batch(CpuScriptGroup2Impl* group, const char* name) : 88 mGroup(group), mFunc(nullptr) { 89 mName = strndup(name, strlen(name)); 90} 91 92Batch::~Batch() { 93 for (CPUClosure* c : mClosures) { 94 delete c; 95 } 96 free(mName); 97} 98 99bool Batch::conflict(CPUClosure* cpuClosure) const { 100 if (mClosures.empty()) { 101 return false; 102 } 103 104 const Closure* closure = cpuClosure->mClosure; 105 106 if (!closure->mIsKernel || !mClosures.front()->mClosure->mIsKernel) { 107 // An invoke should be in a batch by itself, so it conflicts with any other 108 // closure. 
109 return true; 110 } 111 112 const auto& globalDeps = closure->mGlobalDeps; 113 const auto& argDeps = closure->mArgDeps; 114 115 for (CPUClosure* c : mClosures) { 116 const Closure* batched = c->mClosure; 117 if (globalDeps.find(batched) != globalDeps.end()) { 118 return true; 119 } 120 const auto& it = argDeps.find(batched); 121 if (it != argDeps.end()) { 122 const auto& args = (*it).second; 123 for (const auto &p1 : *args) { 124 if (p1.second.get() != nullptr) { 125 return true; 126 } 127 } 128 } 129 } 130 131 return false; 132} 133 134CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl, 135 const ScriptGroupBase *sg) : 136 mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)), 137 mExecutable(nullptr), mScriptObj(nullptr) { 138 rsAssert(!mGroup->mClosures.empty()); 139 140 Batch* batch = new Batch(this, "Batch0"); 141 int i = 0; 142 for (Closure* closure: mGroup->mClosures) { 143 CPUClosure* cc; 144 const IDBase* funcID = closure->mFunctionID.get(); 145 RsdCpuScriptImpl* si = 146 (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(funcID->mScript); 147 if (closure->mIsKernel) { 148 MTLaunchStruct mtls; 149 si->forEachKernelSetup(funcID->mSlot, &mtls); 150 cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel); 151 } else { 152 cc = new CPUClosure(closure, si); 153 } 154 155 if (batch->conflict(cc)) { 156 mBatches.push_back(batch); 157 std::stringstream ss; 158 ss << "Batch" << ++i; 159 batch = new Batch(this, ss.str().c_str()); 160 } 161 162 batch->mClosures.push_back(cc); 163 } 164 165 rsAssert(!batch->mClosures.empty()); 166 mBatches.push_back(batch); 167 168#ifndef RS_COMPATIBILITY_LIB 169 compile(mGroup->mCacheDir); 170 if (mScriptObj != nullptr && mExecutable != nullptr) { 171 for (Batch* batch : mBatches) { 172 batch->resolveFuncPtr(mScriptObj); 173 } 174 } 175#endif // RS_COMPATIBILITY_LIB 176} 177 178void Batch::resolveFuncPtr(void* sharedObj) { 179 std::string funcName(mName); 180 if (mClosures.front()->mClosure->mIsKernel) { 
181 funcName.append(".expand"); 182 } 183 mFunc = dlsym(sharedObj, funcName.c_str()); 184 rsAssert (mFunc != nullptr); 185} 186 187CpuScriptGroup2Impl::~CpuScriptGroup2Impl() { 188 for (Batch* batch : mBatches) { 189 delete batch; 190 } 191 delete mExecutable; 192 // TODO: move this dlclose into ~ScriptExecutable(). 193 if (mScriptObj != nullptr) { 194 dlclose(mScriptObj); 195 } 196} 197 198namespace { 199 200#ifndef RS_COMPATIBILITY_LIB 201 202string getCoreLibPath(Context* context, string* coreLibRelaxedPath) { 203 *coreLibRelaxedPath = ""; 204 205 // If we're debugging, use the debug library. 206 if (context->getContextType() == RS_CONTEXT_TYPE_DEBUG) { 207 return SYSLIBPATH"/libclcore_debug.bc"; 208 } 209 210 // Check for a platform specific library 211 212#if defined(ARCH_ARM_HAVE_NEON) && !defined(DISABLE_CLCORE_NEON) 213 // NEON-capable ARMv7a devices can use an accelerated math library 214 // for all reduced precision scripts. 215 // ARMv8 does not use NEON, as ASIMD can be used with all precision 216 // levels. 217 *coreLibRelaxedPath = SYSLIBPATH"/libclcore_neon.bc"; 218#endif 219 220#if defined(__i386__) || defined(__x86_64__) 221 // x86 devices will use an optimized library. 
222 return SYSLIBPATH"/libclcore_x86.bc"; 223#else 224 return SYSLIBPATH"/libclcore.bc"; 225#endif 226} 227 228string getFileName(string path) { 229 unsigned found = path.find_last_of("/\\"); 230 return path.substr(found + 1); 231} 232 233void setupCompileArguments( 234 const vector<string>& inputs, const vector<string>& kernelBatches, 235 const vector<string>& invokeBatches, 236 const string& output_dir, const string& output_filename, 237 const string& coreLibPath, const string& coreLibRelaxedPath, 238 vector<const char*>* args) { 239 args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH); 240 args->push_back("-fPIC"); 241 args->push_back("-embedRSInfo"); 242 args->push_back("-mtriple"); 243 args->push_back(DEFAULT_TARGET_TRIPLE_STRING); 244 args->push_back("-bclib"); 245 args->push_back(coreLibPath.c_str()); 246 args->push_back("-bclib_relaxed"); 247 args->push_back(coreLibRelaxedPath.c_str()); 248 for (const string& input : inputs) { 249 args->push_back(input.c_str()); 250 } 251 for (const string& batch : kernelBatches) { 252 args->push_back("-merge"); 253 args->push_back(batch.c_str()); 254 } 255 for (const string& batch : invokeBatches) { 256 args->push_back("-invoke"); 257 args->push_back(batch.c_str()); 258 } 259 args->push_back("-output_path"); 260 args->push_back(output_dir.c_str()); 261 args->push_back("-o"); 262 args->push_back(output_filename.c_str()); 263 args->push_back(nullptr); 264} 265 266void generateSourceSlot(const Closure& closure, 267 const std::vector<std::string>& inputs, 268 std::stringstream& ss) { 269 const IDBase* funcID = (const IDBase*)closure.mFunctionID.get(); 270 const Script* script = funcID->mScript; 271 272 rsAssert (!script->isIntrinsic()); 273 274 const RsdCpuScriptImpl *cpuScript = 275 (const RsdCpuScriptImpl*)script->mHal.drv; 276 const string& bitcodeFilename = cpuScript->getBitcodeFilePath(); 277 278 const int index = find(inputs.begin(), inputs.end(), bitcodeFilename) - 279 inputs.begin(); 280 281 ss << index << "," << 
funcID->mSlot << "."; 282} 283 284#endif // RS_COMPATIBILTY_LIB 285 286} // anonymous namespace 287 288void CpuScriptGroup2Impl::compile(const char* cacheDir) { 289#ifndef RS_COMPATIBILITY_LIB 290 if (mGroup->mClosures.size() < 2) { 291 return; 292 } 293 294 //===--------------------------------------------------------------------===// 295 // Fuse the input kernels and generate native code in an object file 296 //===--------------------------------------------------------------------===// 297 298 std::set<string> inputSet; 299 for (Closure* closure : mGroup->mClosures) { 300 const Script* script = closure->mFunctionID.get()->mScript; 301 302 // If any script is an intrinsic, give up trying fusing the kernels. 303 if (script->isIntrinsic()) { 304 return; 305 } 306 307 const RsdCpuScriptImpl *cpuScript = 308 (const RsdCpuScriptImpl*)script->mHal.drv; 309 const string& bitcodeFilename = cpuScript->getBitcodeFilePath(); 310 inputSet.insert(bitcodeFilename); 311 } 312 313 std::vector<string> inputs(inputSet.begin(), inputSet.end()); 314 315 std::vector<string> kernelBatches; 316 std::vector<string> invokeBatches; 317 318 int i = 0; 319 for (const auto& batch : mBatches) { 320 rsAssert(batch->size() > 0); 321 322 std::stringstream ss; 323 ss << batch->mName << ":"; 324 325 if (!batch->mClosures.front()->mClosure->mIsKernel) { 326 rsAssert(batch->size() == 1); 327 generateSourceSlot(*batch->mClosures.front()->mClosure, inputs, ss); 328 invokeBatches.push_back(ss.str()); 329 } else { 330 for (const auto& cpuClosure : batch->mClosures) { 331 generateSourceSlot(*cpuClosure->mClosure, inputs, ss); 332 } 333 kernelBatches.push_back(ss.str()); 334 } 335 } 336 337 rsAssert(cacheDir != nullptr); 338 string objFilePath(cacheDir); 339 objFilePath.append("/fusedXXXXXX.o"); 340 // Find unique object file name, to make following file names unique. 
341 int tempfd = mkstemps(&objFilePath[0], 2); 342 if (tempfd == -1) { 343 return; 344 } 345 TEMP_FAILURE_RETRY(close(tempfd)); 346 347 string outputFileName = getFileName(objFilePath.substr(0, objFilePath.size() - 2)); 348 string coreLibRelaxedPath; 349 const string& coreLibPath = getCoreLibPath(getCpuRefImpl()->getContext(), 350 &coreLibRelaxedPath); 351 vector<const char*> arguments; 352 string output_dir(cacheDir); 353 setupCompileArguments(inputs, kernelBatches, invokeBatches, output_dir, 354 outputFileName, coreLibPath, coreLibRelaxedPath, &arguments); 355 356 bool compiled = rsuExecuteCommand(RsdCpuScriptImpl::BCC_EXE_PATH, 357 arguments.size()-1, 358 arguments.data()); 359 if (!compiled) { 360 unlink(objFilePath.c_str()); 361 return; 362 } 363 364 //===--------------------------------------------------------------------===// 365 // Create and load the shared lib 366 //===--------------------------------------------------------------------===// 367 368 const char* resName = outputFileName.c_str(); 369 370 if (!SharedLibraryUtils::createSharedLibrary(cacheDir, resName)) { 371 ALOGE("Failed to link object file '%s'", resName); 372 return; 373 } 374 375 mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName); 376 if (mScriptObj == nullptr) { 377 ALOGE("Unable to load '%s'", resName); 378 return; 379 } 380 381 mExecutable = ScriptExecutable::createFromSharedObject( 382 getCpuRefImpl()->getContext(), 383 mScriptObj); 384 385#endif // RS_COMPATIBILITY_LIB 386} 387 388void CpuScriptGroup2Impl::execute() { 389 for (auto batch : mBatches) { 390 batch->setGlobalsForBatch(); 391 batch->run(); 392 } 393} 394 395void Batch::setGlobalsForBatch() { 396 for (CPUClosure* cpuClosure : mClosures) { 397 const Closure* closure = cpuClosure->mClosure; 398 const IDBase* funcID = closure->mFunctionID.get(); 399 Script* s = funcID->mScript;; 400 for (const auto& p : closure->mGlobals) { 401 const void* value = p.second.first; 402 int size = p.second.second; 403 if 
(value == nullptr && size == 0) { 404 // This indicates the current closure depends on another closure for a 405 // global in their shared module (script). In this case we don't need to 406 // copy the value. For example, an invoke intializes a global variable 407 // which a kernel later reads. 408 continue; 409 } 410 rsAssert(p.first != nullptr); 411 Script* script = p.first->mScript; 412 const RsdCpuScriptImpl *cpuScript = 413 (const RsdCpuScriptImpl*)script->mHal.drv; 414 int slot = p.first->mSlot; 415 ScriptExecutable* exec = mGroup->getExecutable(); 416 if (exec != nullptr) { 417 const char* varName = cpuScript->getFieldName(slot); 418 void* addr = exec->getFieldAddress(varName); 419 if (size < 0) { 420 rsrSetObject(mGroup->getCpuRefImpl()->getContext(), 421 (rs_object_base*)addr, (ObjectBase*)value); 422 } else { 423 memcpy(addr, (const void*)&value, size); 424 } 425 } else { 426 // We use -1 size to indicate an ObjectBase rather than a primitive type 427 if (size < 0) { 428 s->setVarObj(slot, (ObjectBase*)value); 429 } else { 430 s->setVar(slot, (const void*)&value, size); 431 } 432 } 433 } 434 } 435} 436 437void Batch::run() { 438 if (!mClosures.front()->mClosure->mIsKernel) { 439 rsAssert(mClosures.size() == 1); 440 441 // This batch contains a single closure for an invoke function 442 CPUClosure* cc = mClosures.front(); 443 const Closure* c = cc->mClosure; 444 445 if (mFunc != nullptr) { 446 // TODO: Need align pointers for x86_64. 
447 // See RsdCpuScriptImpl::invokeFunction in rsCpuScript.cpp 448 ((InvokeFuncTy)mFunc)(c->mParams, c->mParamLength); 449 } else { 450 const ScriptInvokeID* invokeID = (const ScriptInvokeID*)c->mFunctionID.get(); 451 rsAssert(invokeID != nullptr); 452 cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength); 453 } 454 455 return; 456 } 457 458 if (mFunc != nullptr) { 459 MTLaunchStruct mtls; 460 const CPUClosure* firstCpuClosure = mClosures.front(); 461 const CPUClosure* lastCpuClosure = mClosures.back(); 462 463 firstCpuClosure->mSi->forEachMtlsSetup( 464 (const Allocation**)firstCpuClosure->mClosure->mArgs, 465 firstCpuClosure->mClosure->mNumArg, 466 lastCpuClosure->mClosure->mReturnValue, 467 nullptr, 0, nullptr, &mtls); 468 469 mtls.script = nullptr; 470 mtls.fep.usr = nullptr; 471 mtls.kernel = (ForEachFunc_t)mFunc; 472 473 mGroup->getCpuRefImpl()->launchThreads( 474 (const Allocation**)firstCpuClosure->mClosure->mArgs, 475 firstCpuClosure->mClosure->mNumArg, 476 lastCpuClosure->mClosure->mReturnValue, 477 nullptr, &mtls); 478 479 return; 480 } 481 482 for (CPUClosure* cpuClosure : mClosures) { 483 const Closure* closure = cpuClosure->mClosure; 484 const ScriptKernelID* kernelID = 485 (const ScriptKernelID*)closure->mFunctionID.get(); 486 cpuClosure->mSi->preLaunch(kernelID->mSlot, 487 (const Allocation**)closure->mArgs, 488 closure->mNumArg, closure->mReturnValue, 489 nullptr, 0, nullptr); 490 } 491 492 const CPUClosure* cpuClosure = mClosures.front(); 493 const Closure* closure = cpuClosure->mClosure; 494 MTLaunchStruct mtls; 495 496 if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs, 497 closure->mNumArg, 498 closure->mReturnValue, 499 nullptr, 0, nullptr, &mtls)) { 500 501 mtls.script = nullptr; 502 mtls.kernel = (void (*)())&groupRoot; 503 mtls.fep.usr = &mClosures; 504 505 mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls); 506 } 507 508 for (CPUClosure* cpuClosure : mClosures) { 509 const 
Closure* closure = cpuClosure->mClosure; 510 const ScriptKernelID* kernelID = 511 (const ScriptKernelID*)closure->mFunctionID.get(); 512 cpuClosure->mSi->postLaunch(kernelID->mSlot, 513 (const Allocation**)closure->mArgs, 514 closure->mNumArg, closure->mReturnValue, 515 nullptr, 0, nullptr); 516 } 517} 518 519} // namespace renderscript 520} // namespace android 521