rsCpuScriptGroup2.cpp revision 2abfcc6d129fe3defddef4540aa95cc445c03a7a
1#include "rsCpuScriptGroup2.h" 2 3#include <dlfcn.h> 4 5#include <string> 6#include <vector> 7 8#ifndef RS_COMPATIBILITY_LIB 9#include "bcc/Config/Config.h" 10#include <sys/wait.h> 11#endif 12 13#include "cpu_ref/rsCpuCore.h" 14#include "rsClosure.h" 15#include "rsContext.h" 16#include "rsCpuCore.h" 17#include "rsCpuExecutable.h" 18#include "rsCpuScript.h" 19#include "rsScript.h" 20#include "rsScriptGroup2.h" 21#include "rsScriptIntrinsic.h" 22 23using std::string; 24using std::vector; 25 26namespace android { 27namespace renderscript { 28 29namespace { 30 31const size_t DefaultKernelArgCount = 2; 32 33void groupRoot(const RsExpandKernelParams *kparams, uint32_t xstart, 34 uint32_t xend, uint32_t outstep) { 35 const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kparams->usr; 36 RsExpandKernelParams *mutable_kparams = (RsExpandKernelParams *)kparams; 37 const void **oldIns = kparams->ins; 38 uint32_t *oldStrides = kparams->inEStrides; 39 40 std::vector<const void*> ins(DefaultKernelArgCount); 41 std::vector<uint32_t> strides(DefaultKernelArgCount); 42 43 for (CPUClosure* cpuClosure : closures) { 44 const Closure* closure = cpuClosure->mClosure; 45 46 auto in_iter = ins.begin(); 47 auto stride_iter = strides.begin(); 48 49 for (size_t i = 0; i < closure->mNumArg; i++) { 50 const void* arg = closure->mArgs[i]; 51 const Allocation* a = (const Allocation*)arg; 52 const uint32_t eStride = a->mHal.state.elementSizeBytes; 53 const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) + 54 eStride * xstart; 55 if (kparams->dimY > 1) { 56 ptr += a->mHal.drvState.lod[0].stride * kparams->y; 57 } 58 *in_iter++ = ptr; 59 *stride_iter++ = eStride; 60 } 61 62 mutable_kparams->ins = &ins[0]; 63 mutable_kparams->inEStrides = &strides[0]; 64 65 const Allocation* out = closure->mReturnValue; 66 const uint32_t ostep = out->mHal.state.elementSizeBytes; 67 const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) + 68 ostep * xstart; 69 if (kparams->dimY > 1) { 70 ptr += out->mHal.drvState.lod[0].stride * kparams->y; 71 } 72 73 mutable_kparams->out = (void*)ptr; 74 75 mutable_kparams->usr = cpuClosure->mUsrPtr; 76 77 cpuClosure->mFunc(kparams, xstart, xend, ostep); 78 } 79 80 mutable_kparams->ins = oldIns; 81 mutable_kparams->inEStrides = oldStrides; 82 mutable_kparams->usr = &closures; 83} 84 85} // namespace 86 87Batch::~Batch() { 88 for (CPUClosure* c : mClosures) { 89 delete c; 90 } 91 if (mScriptObj) { 92 dlclose(mScriptObj); 93 } 94} 95 96bool Batch::conflict(CPUClosure* cpuClosure) const { 97 if (mClosures.empty()) { 98 return false; 99 } 100 101 const Closure* closure = cpuClosure->mClosure; 102 103 if (closure->mKernelID.get() == nullptr || 104 mClosures.front()->mClosure->mKernelID.get() == nullptr) { 105 // An invoke should be in a batch by itself, so it conflicts with any other 106 // closure. 107 return true; 108 } 109 110 const auto& globalDeps = closure->mGlobalDeps; 111 const auto& argDeps = closure->mArgDeps; 112 113 for (CPUClosure* c : mClosures) { 114 const Closure* batched = c->mClosure; 115 if (globalDeps.find(batched) != globalDeps.end()) { 116 return true; 117 } 118 const auto& it = argDeps.find(batched); 119 if (it != argDeps.end()) { 120 const auto& args = (*it).second; 121 for (const auto &p1 : *args) { 122 if (p1.second->get() != nullptr) { 123 return true; 124 } 125 } 126 } 127 } 128 129 return false; 130} 131 132CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl, 133 const ScriptGroupBase *sg) : 134 mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)) { 135 rsAssert(!mGroup->mClosures.empty()); 136 137 Batch* batch = new Batch(this); 138 for (Closure* closure: mGroup->mClosures) { 139 const ScriptKernelID* kernelID = closure->mKernelID.get(); 140 RsdCpuScriptImpl* si; 141 CPUClosure* cc; 142 if (kernelID != nullptr) { 143 si = (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(kernelID->mScript); 144 MTLaunchStruct mtls; 145 si->forEachKernelSetup(kernelID->mSlot, &mtls); 146 // TODO: Is mtls.fep.usrLen ever used? 147 cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel, 148 mtls.fep.usr, mtls.fep.usrLen); 149 } else { 150 si = (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript( 151 closure->mInvokeID->mScript); 152 cc = new CPUClosure(closure, si); 153 } 154 155 if (batch->conflict(cc)) { 156 mBatches.push_back(batch); 157 batch = new Batch(this); 158 } 159 160 batch->mClosures.push_back(cc); 161 } 162 163 rsAssert(!batch->mClosures.empty()); 164 mBatches.push_back(batch); 165 166#ifndef RS_COMPATIBILITY_LIB 167 for (Batch* batch : mBatches) { 168 batch->tryToCreateFusedKernel(mGroup->mCacheDir); 169 } 170#endif 171} 172 173CpuScriptGroup2Impl::~CpuScriptGroup2Impl() { 174 for (Batch* batch : mBatches) { 175 delete batch; 176 } 177} 178 179namespace { 180 181#ifndef RS_COMPATIBILITY_LIB 182 183string getFileName(string path) { 184 unsigned found = path.find_last_of("/\\"); 185 return path.substr(found + 1); 186} 187 188void setupCompileArguments( 189 const vector<string>& inputs, const vector<int>& kernels, 190 const string& output_dir, const string& output_filename, 191 const string& rsLib, vector<const char*>* args) { 192 args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH); 193 args->push_back("-fPIC"); 194 args->push_back("-embedRSInfo"); 195 args->push_back("-mtriple"); 196 args->push_back(DEFAULT_TARGET_TRIPLE_STRING); 197 args->push_back("-bclib"); 198 args->push_back(rsLib.c_str()); 199 for (const string& input : inputs) { 200 args->push_back(input.c_str()); 201 } 202 for (int kernel : kernels) { 203 args->push_back("-k"); 204 string strKernel = std::to_string(kernel); 205 args->push_back(strKernel.c_str()); 206 } 207 args->push_back("-output_path"); 208 args->push_back(output_dir.c_str()); 209 args->push_back("-o"); 210 args->push_back(output_filename.c_str()); 211 args->push_back(nullptr); 212} 213 214bool fuseAndCompile(const char** arguments, 215 const string& commandLine) { 216 const pid_t pid = fork(); 217 218 if (pid == -1) { 219 ALOGE("Couldn't fork for bcc execution"); 220 return false; 221 } 222 223 if (pid == 0) { 224 // Child process 225 ALOGV("Invoking BCC with: %s", commandLine.c_str()); 226 execv(RsdCpuScriptImpl::BCC_EXE_PATH, (char* const*)arguments); 227 228 ALOGE("execv() failed: %s", strerror(errno)); 229 abort(); 230 return false; 231 } 232 233 // Parent process 234 int status = 0; 235 const pid_t w = waitpid(pid, &status, 0); 236 if (w == -1) { 237 return false; 238 } 239 240 if (!WIFEXITED(status) || WEXITSTATUS(status) != 0 ) { 241 ALOGE("bcc terminated unexpectedly"); 242 return false; 243 } 244 245 return true; 246} 247#endif 248 249} // anonymous namespace 250 251void Batch::tryToCreateFusedKernel(const char *cacheDir) { 252#ifndef RS_COMPATIBILITY_LIB 253 if (mClosures.size() < 2) { 254 return; 255 } 256 257 //===--------------------------------------------------------------------===// 258 // Fuse the input kernels and generate native code in an object file 259 //===--------------------------------------------------------------------===// 260 261 std::vector<string> inputFiles; 262 std::vector<int> slots; 263 264 for (CPUClosure* cpuClosure : mClosures) { 265 const Closure* closure = cpuClosure->mClosure; 266 const ScriptKernelID* kernelID = closure->mKernelID.get(); 267 const Script* script = kernelID->mScript; 268 269 if (script->isIntrinsic()) { 270 return; 271 } 272 273 const RsdCpuScriptImpl *cpuScript = 274 (const RsdCpuScriptImpl*)script->mHal.drv; 275 276 const string& bitcodeFilename = cpuScript->getBitcodeFilePath(); 277 278 inputFiles.push_back(bitcodeFilename); 279 slots.push_back(kernelID->mSlot); 280 } 281 282 string outputPath(tempnam(cacheDir, "fused")); 283 string outputFileName = getFileName(outputPath); 284 string objFilePath(outputPath); 285 objFilePath.append(".o"); 286 string rsLibPath(SYSLIBPATH"/libclcore.bc"); 287 vector<const char*> arguments; 288 setupCompileArguments(inputFiles, slots, cacheDir, outputFileName, rsLibPath, 289 &arguments); 290 std::unique_ptr<const char> joined( 291 rsuJoinStrings(arguments.size() - 1, arguments.data())); 292 string commandLine (joined.get()); 293 294 if (!fuseAndCompile(arguments.data(), commandLine)) { 295 return; 296 } 297 298 //===--------------------------------------------------------------------===// 299 // Create and load the shared lib 300 //===--------------------------------------------------------------------===// 301 302 const char* resName = outputFileName.c_str(); 303 304 if (!SharedLibraryUtils::createSharedLibrary(cacheDir, resName)) { 305 ALOGE("Failed to link object file '%s'", resName); 306 return; 307 } 308 309 void* mSharedObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName); 310 if (mSharedObj == nullptr) { 311 ALOGE("Unable to load '%s'", resName); 312 return; 313 } 314 315 mExecutable = ScriptExecutable::createFromSharedObject( 316 nullptr, // RS context. Unused. 317 mSharedObj); 318 319#endif // RS_COMPATIBILITY_LIB 320} 321 322void CpuScriptGroup2Impl::execute() { 323 for (auto batch : mBatches) { 324 batch->setGlobalsForBatch(); 325 batch->run(); 326 } 327} 328 329void Batch::setGlobalsForBatch() { 330 for (CPUClosure* cpuClosure : mClosures) { 331 const Closure* closure = cpuClosure->mClosure; 332 const ScriptKernelID* kernelID = closure->mKernelID.get(); 333 Script* s; 334 if (kernelID != nullptr) { 335 s = kernelID->mScript; 336 } else { 337 s = cpuClosure->mClosure->mInvokeID->mScript; 338 } 339 for (const auto& p : closure->mGlobals) { 340 const void* value = p.second.first; 341 int size = p.second.second; 342 if (value == nullptr && size == 0) { 343 // This indicates the current closure depends on another closure for a 344 // global in their shared module (script). In this case we don't need to 345 // copy the value. For example, an invoke intializes a global variable 346 // which a kernel later reads. 347 continue; 348 } 349 rsAssert(p.first != nullptr); 350 ALOGV("Evaluating closure %p, setting field %p (Script %p, slot: %d)", 351 closure, p.first, p.first->mScript, p.first->mSlot); 352 // We use -1 size to indicate an ObjectBase rather than a primitive type 353 if (size < 0) { 354 s->setVarObj(p.first->mSlot, (ObjectBase*)value); 355 } else { 356 s->setVar(p.first->mSlot, (const void*)&value, size); 357 } 358 } 359 } 360} 361 362void Batch::run() { 363 if (mExecutable != nullptr) { 364 MTLaunchStruct mtls; 365 const CPUClosure* firstCpuClosure = mClosures.front(); 366 const CPUClosure* lastCpuClosure = mClosures.back(); 367 368 firstCpuClosure->mSi->forEachMtlsSetup( 369 (const Allocation**)firstCpuClosure->mClosure->mArgs, 370 firstCpuClosure->mClosure->mNumArg, 371 lastCpuClosure->mClosure->mReturnValue, 372 nullptr, 0, nullptr, &mtls); 373 374 mtls.script = nullptr; 375 mtls.fep.usr = nullptr; 376 mtls.kernel = mExecutable->getForEachFunction(0); 377 378 mGroup->getCpuRefImpl()->launchThreads( 379 (const Allocation**)firstCpuClosure->mClosure->mArgs, 380 firstCpuClosure->mClosure->mNumArg, 381 lastCpuClosure->mClosure->mReturnValue, 382 nullptr, &mtls); 383 384 return; 385 } 386 387 if (mClosures.size() == 1 && 388 mClosures.front()->mClosure->mKernelID.get() == nullptr) { 389 // This closure is for an invoke function 390 CPUClosure* cc = mClosures.front(); 391 const Closure* c = cc->mClosure; 392 const ScriptInvokeID* invokeID = c->mInvokeID; 393 rsAssert(invokeID != nullptr); 394 cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength); 395 return; 396 } 397 398 for (CPUClosure* cpuClosure : mClosures) { 399 const Closure* closure = cpuClosure->mClosure; 400 const ScriptKernelID* kernelID = closure->mKernelID.get(); 401 cpuClosure->mSi->preLaunch(kernelID->mSlot, 402 (const Allocation**)closure->mArgs, 403 closure->mNumArg, closure->mReturnValue, 404 cpuClosure->mUsrPtr, cpuClosure->mUsrSize, 405 nullptr); 406 } 407 408 const CPUClosure* cpuClosure = mClosures.front(); 409 const Closure* closure = cpuClosure->mClosure; 410 MTLaunchStruct mtls; 411 412 if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs, 413 closure->mNumArg, 414 closure->mReturnValue, 415 nullptr, 0, nullptr, &mtls)) { 416 417 mtls.script = nullptr; 418 mtls.kernel = (void (*)())&groupRoot; 419 mtls.fep.usr = &mClosures; 420 421 mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls); 422 } 423 424 for (CPUClosure* cpuClosure : mClosures) { 425 const Closure* closure = cpuClosure->mClosure; 426 const ScriptKernelID* kernelID = closure->mKernelID.get(); 427 cpuClosure->mSi->postLaunch(kernelID->mSlot, 428 (const Allocation**)closure->mArgs, 429 closure->mNumArg, closure->mReturnValue, 430 nullptr, 0, nullptr); 431 } 432} 433 434} // namespace renderscript 435} // namespace android 436