rsCpuScriptGroup2.cpp revision ff2bb54ebf593b1d19d3a2e4cfa70a8ea4432c0d
1#include "rsCpuScriptGroup2.h" 2 3#include <dlfcn.h> 4 5#include <string> 6#include <vector> 7 8#ifndef RS_COMPATIBILITY_LIB 9#include "bcc/Config/Config.h" 10#include <sys/wait.h> 11#endif 12 13#include "cpu_ref/rsCpuCore.h" 14#include "rsClosure.h" 15#include "rsContext.h" 16#include "rsCpuCore.h" 17#include "rsCpuScript.h" 18#include "rsScript.h" 19#include "rsScriptGroup2.h" 20#include "rsScriptIntrinsic.h" 21 22using std::string; 23using std::vector; 24 25namespace android { 26namespace renderscript { 27 28namespace { 29 30const size_t DefaultKernelArgCount = 2; 31 32void groupRoot(const RsExpandKernelParams *kparams, uint32_t xstart, 33 uint32_t xend, uint32_t outstep) { 34 const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kparams->usr; 35 RsExpandKernelParams *mutable_kparams = (RsExpandKernelParams *)kparams; 36 const void **oldIns = kparams->ins; 37 uint32_t *oldStrides = kparams->inEStrides; 38 39 std::vector<const void*> ins(DefaultKernelArgCount); 40 std::vector<uint32_t> strides(DefaultKernelArgCount); 41 42 for (CPUClosure* cpuClosure : closures) { 43 const Closure* closure = cpuClosure->mClosure; 44 45 auto in_iter = ins.begin(); 46 auto stride_iter = strides.begin(); 47 48 for (size_t i = 0; i < closure->mNumArg; i++) { 49 const void* arg = closure->mArgs[i]; 50 const Allocation* a = (const Allocation*)arg; 51 const uint32_t eStride = a->mHal.state.elementSizeBytes; 52 const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) + 53 eStride * xstart; 54 if (kparams->dimY > 1) { 55 ptr += a->mHal.drvState.lod[0].stride * kparams->y; 56 } 57 *in_iter++ = ptr; 58 *stride_iter++ = eStride; 59 } 60 61 mutable_kparams->ins = &ins[0]; 62 mutable_kparams->inEStrides = &strides[0]; 63 64 const Allocation* out = closure->mReturnValue; 65 const uint32_t ostep = out->mHal.state.elementSizeBytes; 66 const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) + 67 ostep * xstart; 68 if (kparams->dimY > 1) { 69 ptr += out->mHal.drvState.lod[0].stride * kparams->y; 70 } 71 72 mutable_kparams->out = (void*)ptr; 73 74 mutable_kparams->usr = cpuClosure->mUsrPtr; 75 76 cpuClosure->mFunc(kparams, xstart, xend, ostep); 77 } 78 79 mutable_kparams->ins = oldIns; 80 mutable_kparams->inEStrides = oldStrides; 81 mutable_kparams->usr = &closures; 82} 83 84} // namespace 85 86Batch::~Batch() { 87 for (CPUClosure* c : mClosures) { 88 delete c; 89 } 90 if (mScriptObj) { 91 dlclose(mScriptObj); 92 } 93} 94 95bool Batch::conflict(CPUClosure* cpuClosure) const { 96 if (mClosures.empty()) { 97 return false; 98 } 99 100 const Closure* closure = cpuClosure->mClosure; 101 102 if (closure->mKernelID.get() == nullptr || 103 mClosures.front()->mClosure->mKernelID.get() == nullptr) { 104 // An invoke should be in a batch by itself, so it conflicts with any other 105 // closure. 106 return true; 107 } 108 109 const auto& globalDeps = closure->mGlobalDeps; 110 const auto& argDeps = closure->mArgDeps; 111 112 for (CPUClosure* c : mClosures) { 113 const Closure* batched = c->mClosure; 114 if (globalDeps.find(batched) != globalDeps.end()) { 115 return true; 116 } 117 const auto& it = argDeps.find(batched); 118 if (it != argDeps.end()) { 119 const auto& args = (*it).second; 120 for (const auto &p1 : *args) { 121 if (p1.second->get() != nullptr) { 122 return true; 123 } 124 } 125 } 126 } 127 128 return false; 129} 130 131CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl, 132 const ScriptGroupBase *sg) : 133 mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)) { 134 rsAssert(!mGroup->mClosures.empty()); 135 136 Batch* batch = new Batch(this); 137 for (Closure* closure: mGroup->mClosures) { 138 const ScriptKernelID* kernelID = closure->mKernelID.get(); 139 RsdCpuScriptImpl* si; 140 CPUClosure* cc; 141 if (kernelID != nullptr) { 142 si = (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(kernelID->mScript); 143 MTLaunchStruct mtls; 144 si->forEachKernelSetup(kernelID->mSlot, &mtls); 145 // TODO: Is mtls.fep.usrLen ever used? 146 cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel, 147 mtls.fep.usr, mtls.fep.usrLen); 148 } else { 149 si = (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript( 150 closure->mInvokeID->mScript); 151 cc = new CPUClosure(closure, si); 152 } 153 154 if (batch->conflict(cc)) { 155 mBatches.push_back(batch); 156 batch = new Batch(this); 157 } 158 159 batch->mClosures.push_back(cc); 160 } 161 162 rsAssert(!batch->mClosures.empty()); 163 mBatches.push_back(batch); 164 165#ifndef RS_COMPATIBILITY_LIB 166 for (Batch* batch : mBatches) { 167 batch->tryToCreateFusedKernel(mGroup->mCacheDir); 168 } 169#endif 170} 171 172CpuScriptGroup2Impl::~CpuScriptGroup2Impl() { 173 for (Batch* batch : mBatches) { 174 delete batch; 175 } 176} 177 178namespace { 179 180#ifndef RS_COMPATIBILITY_LIB 181 182string getFileName(string path) { 183 unsigned found = path.find_last_of("/\\"); 184 return path.substr(found + 1); 185} 186 187void setupCompileArguments( 188 const vector<string>& inputs, const vector<int>& kernels, 189 const string& output_dir, const string& output_filename, 190 const string& rsLib, vector<const char*>* args) { 191 args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH); 192 args->push_back("-fPIC"); 193 args->push_back("-embedRSInfo"); 194 args->push_back("-mtriple"); 195 args->push_back(DEFAULT_TARGET_TRIPLE_STRING); 196 args->push_back("-bclib"); 197 args->push_back(rsLib.c_str()); 198 for (const string& input : inputs) { 199 args->push_back(input.c_str()); 200 } 201 for (int kernel : kernels) { 202 args->push_back("-k"); 203 string strKernel = std::to_string(kernel); 204 args->push_back(strKernel.c_str()); 205 } 206 args->push_back("-output_path"); 207 args->push_back(output_dir.c_str()); 208 args->push_back("-o"); 209 args->push_back(output_filename.c_str()); 210 args->push_back(nullptr); 211} 212 213string convertListToString(int n, const char* const* strs) { 214 string ret; 215 ret.append(strs[0]); 216 for (int i = 1; i < n; i++) { 217 ret.append(" "); 218 ret.append(strs[i]); 219 } 220 return ret; 221} 222 223bool fuseAndCompile(const char** arguments, 224 const string& commandLine) { 225 const pid_t pid = fork(); 226 227 if (pid == -1) { 228 ALOGE("Couldn't fork for bcc execution"); 229 return false; 230 } 231 232 if (pid == 0) { 233 // Child process 234 ALOGV("Invoking BCC with: %s", commandLine.c_str()); 235 execv(RsdCpuScriptImpl::BCC_EXE_PATH, (char* const*)arguments); 236 237 ALOGE("execv() failed: %s", strerror(errno)); 238 abort(); 239 return false; 240 } 241 242 // Parent process 243 int status = 0; 244 const pid_t w = waitpid(pid, &status, 0); 245 if (w == -1) { 246 return false; 247 } 248 249 if (!WIFEXITED(status) || WEXITSTATUS(status) != 0 ) { 250 ALOGE("bcc terminated unexpectedly"); 251 return false; 252 } 253 254 return true; 255} 256#endif 257 258} // anonymous namespace 259 260void Batch::tryToCreateFusedKernel(const char *cacheDir) { 261#ifndef RS_COMPATIBILITY_LIB 262 if (mClosures.size() < 2) { 263 return; 264 } 265 266 //===--------------------------------------------------------------------===// 267 // Fuse the input kernels and generate native code in an object file 268 //===--------------------------------------------------------------------===// 269 270 std::vector<string> inputFiles; 271 std::vector<int> slots; 272 273 for (CPUClosure* cpuClosure : mClosures) { 274 const Closure* closure = cpuClosure->mClosure; 275 const ScriptKernelID* kernelID = closure->mKernelID.get(); 276 const Script* script = kernelID->mScript; 277 278 if (script->isIntrinsic()) { 279 return; 280 } 281 282 const RsdCpuScriptImpl *cpuScript = 283 (const RsdCpuScriptImpl*)script->mHal.drv; 284 285 const string& bitcodeFilename = cpuScript->getBitcodeFilePath(); 286 287 inputFiles.push_back(bitcodeFilename); 288 slots.push_back(kernelID->mSlot); 289 } 290 291 string outputPath(tempnam(cacheDir, "fused")); 292 string outputFileName = getFileName(outputPath); 293 string objFilePath(outputPath); 294 objFilePath.append(".o"); 295 string rsLibPath(SYSLIBPATH"/libclcore.bc"); 296 vector<const char*> arguments; 297 setupCompileArguments(inputFiles, slots, cacheDir, outputFileName, rsLibPath, 298 &arguments); 299 string commandLine = 300 convertListToString(arguments.size() - 1, arguments.data()); 301 302 if (!fuseAndCompile(arguments.data(), commandLine)) { 303 return; 304 } 305 306 //===--------------------------------------------------------------------===// 307 // Create and load the shared lib 308 //===--------------------------------------------------------------------===// 309 310 const char* resName = outputFileName.c_str(); 311 312 if (!SharedLibraryUtils::createSharedLibrary(cacheDir, resName)) { 313 ALOGE("Failed to link object file '%s'", resName); 314 return; 315 } 316 317 void* mSharedObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName); 318 if (mSharedObj == nullptr) { 319 ALOGE("Unable to load '%s'", resName); 320 return; 321 } 322 323 mExecutable = ScriptExecutable::createFromSharedObject( 324 nullptr, // RS context. Unused. 325 mSharedObj); 326 327#endif // RS_COMPATIBILITY_LIB 328} 329 330void CpuScriptGroup2Impl::execute() { 331 for (auto batch : mBatches) { 332 batch->setGlobalsForBatch(); 333 batch->run(); 334 } 335} 336 337void Batch::setGlobalsForBatch() { 338 for (CPUClosure* cpuClosure : mClosures) { 339 const Closure* closure = cpuClosure->mClosure; 340 const ScriptKernelID* kernelID = closure->mKernelID.get(); 341 Script* s; 342 if (kernelID != nullptr) { 343 s = kernelID->mScript; 344 } else { 345 s = cpuClosure->mClosure->mInvokeID->mScript; 346 } 347 for (const auto& p : closure->mGlobals) { 348 const void* value = p.second.first; 349 int size = p.second.second; 350 if (value == nullptr && size == 0) { 351 // This indicates the current closure depends on another closure for a 352 // global in their shared module (script). In this case we don't need to 353 // copy the value. For example, an invoke intializes a global variable 354 // which a kernel later reads. 355 continue; 356 } 357 rsAssert(p.first != nullptr); 358 ALOGV("Evaluating closure %p, setting field %p (Script %p, slot: %d)", 359 closure, p.first, p.first->mScript, p.first->mSlot); 360 // We use -1 size to indicate an ObjectBase rather than a primitive type 361 if (size < 0) { 362 s->setVarObj(p.first->mSlot, (ObjectBase*)value); 363 } else { 364 s->setVar(p.first->mSlot, (const void*)&value, size); 365 } 366 } 367 } 368} 369 370void Batch::run() { 371 if (mExecutable != nullptr) { 372 MTLaunchStruct mtls; 373 const CPUClosure* firstCpuClosure = mClosures.front(); 374 const CPUClosure* lastCpuClosure = mClosures.back(); 375 376 firstCpuClosure->mSi->forEachMtlsSetup( 377 (const Allocation**)firstCpuClosure->mClosure->mArgs, 378 firstCpuClosure->mClosure->mNumArg, 379 lastCpuClosure->mClosure->mReturnValue, 380 nullptr, 0, nullptr, &mtls); 381 382 mtls.script = nullptr; 383 mtls.fep.usr = nullptr; 384 mtls.kernel = mExecutable->getForEachFunction(0); 385 386 mGroup->getCpuRefImpl()->launchThreads( 387 (const Allocation**)firstCpuClosure->mClosure->mArgs, 388 firstCpuClosure->mClosure->mNumArg, 389 lastCpuClosure->mClosure->mReturnValue, 390 nullptr, &mtls); 391 392 return; 393 } 394 395 if (mClosures.size() == 1 && 396 mClosures.front()->mClosure->mKernelID.get() == nullptr) { 397 // This closure is for an invoke function 398 CPUClosure* cc = mClosures.front(); 399 const Closure* c = cc->mClosure; 400 const ScriptInvokeID* invokeID = c->mInvokeID; 401 rsAssert(invokeID != nullptr); 402 cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength); 403 return; 404 } 405 406 for (CPUClosure* cpuClosure : mClosures) { 407 const Closure* closure = cpuClosure->mClosure; 408 const ScriptKernelID* kernelID = closure->mKernelID.get(); 409 cpuClosure->mSi->preLaunch(kernelID->mSlot, 410 (const Allocation**)closure->mArgs, 411 closure->mNumArg, closure->mReturnValue, 412 cpuClosure->mUsrPtr, cpuClosure->mUsrSize, 413 nullptr); 414 } 415 416 const CPUClosure* cpuClosure = mClosures.front(); 417 const Closure* closure = cpuClosure->mClosure; 418 MTLaunchStruct mtls; 419 420 if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs, 421 closure->mNumArg, 422 closure->mReturnValue, 423 nullptr, 0, nullptr, &mtls)) { 424 425 mtls.script = nullptr; 426 mtls.kernel = (void (*)())&groupRoot; 427 mtls.fep.usr = &mClosures; 428 429 mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls); 430 } 431 432 for (CPUClosure* cpuClosure : mClosures) { 433 const Closure* closure = cpuClosure->mClosure; 434 const ScriptKernelID* kernelID = closure->mKernelID.get(); 435 cpuClosure->mSi->postLaunch(kernelID->mSlot, 436 (const Allocation**)closure->mArgs, 437 closure->mNumArg, closure->mReturnValue, 438 nullptr, 0, nullptr); 439 } 440} 441 442} // namespace renderscript 443} // namespace android 444