1# Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15"""Library for getting system information during TensorFlow tests.""" 16 17from __future__ import absolute_import 18from __future__ import division 19from __future__ import print_function 20 21import ctypes as ct 22import platform 23 24from tensorflow.core.util import test_log_pb2 25from tensorflow.python.framework import errors 26from tensorflow.python.platform import gfile 27 28 29def _gather_gpu_devices_proc(): 30 """Try to gather NVidia GPU device information via /proc/driver.""" 31 dev_info = [] 32 for f in gfile.Glob("/proc/driver/nvidia/gpus/*/information"): 33 bus_id = f.split("/")[5] 34 key_values = dict(line.rstrip().replace("\t", "").split(":", 1) 35 for line in gfile.GFile(f, "r")) 36 key_values = dict((k.lower(), v.strip(" ").rstrip(" ")) 37 for (k, v) in key_values.items()) 38 info = test_log_pb2.GPUInfo() 39 info.model = key_values.get("model", "Unknown") 40 info.uuid = key_values.get("gpu uuid", "Unknown") 41 info.bus_id = bus_id 42 dev_info.append(info) 43 return dev_info 44 45 46class CUDADeviceProperties(ct.Structure): 47 # See $CUDA_HOME/include/cuda_runtime_api.h for the definition of 48 # the cudaDeviceProp struct. 49 _fields_ = [ 50 ("name", ct.c_char * 256), 51 ("totalGlobalMem", ct.c_size_t), 52 ("sharedMemPerBlock", ct.c_size_t), 53 ("regsPerBlock", ct.c_int), 54 ("warpSize", ct.c_int), 55 ("memPitch", ct.c_size_t), 56 ("maxThreadsPerBlock", ct.c_int), 57 ("maxThreadsDim", ct.c_int * 3), 58 ("maxGridSize", ct.c_int * 3), 59 ("clockRate", ct.c_int), 60 ("totalConstMem", ct.c_size_t), 61 ("major", ct.c_int), 62 ("minor", ct.c_int), 63 ("textureAlignment", ct.c_size_t), 64 ("texturePitchAlignment", ct.c_size_t), 65 ("deviceOverlap", ct.c_int), 66 ("multiProcessorCount", ct.c_int), 67 ("kernelExecTimeoutEnabled", ct.c_int), 68 ("integrated", ct.c_int), 69 ("canMapHostMemory", ct.c_int), 70 ("computeMode", ct.c_int), 71 ("maxTexture1D", ct.c_int), 72 ("maxTexture1DMipmap", ct.c_int), 73 ("maxTexture1DLinear", ct.c_int), 74 ("maxTexture2D", ct.c_int * 2), 75 ("maxTexture2DMipmap", ct.c_int * 2), 76 ("maxTexture2DLinear", ct.c_int * 3), 77 ("maxTexture2DGather", ct.c_int * 2), 78 ("maxTexture3D", ct.c_int * 3), 79 ("maxTexture3DAlt", ct.c_int * 3), 80 ("maxTextureCubemap", ct.c_int), 81 ("maxTexture1DLayered", ct.c_int * 2), 82 ("maxTexture2DLayered", ct.c_int * 3), 83 ("maxTextureCubemapLayered", ct.c_int * 2), 84 ("maxSurface1D", ct.c_int), 85 ("maxSurface2D", ct.c_int * 2), 86 ("maxSurface3D", ct.c_int * 3), 87 ("maxSurface1DLayered", ct.c_int * 2), 88 ("maxSurface2DLayered", ct.c_int * 3), 89 ("maxSurfaceCubemap", ct.c_int), 90 ("maxSurfaceCubemapLayered", ct.c_int * 2), 91 ("surfaceAlignment", ct.c_size_t), 92 ("concurrentKernels", ct.c_int), 93 ("ECCEnabled", ct.c_int), 94 ("pciBusID", ct.c_int), 95 ("pciDeviceID", ct.c_int), 96 ("pciDomainID", ct.c_int), 97 ("tccDriver", ct.c_int), 98 ("asyncEngineCount", ct.c_int), 99 ("unifiedAddressing", ct.c_int), 100 ("memoryClockRate", ct.c_int), 101 ("memoryBusWidth", ct.c_int), 102 ("l2CacheSize", ct.c_int), 103 ("maxThreadsPerMultiProcessor", ct.c_int), 104 ("streamPrioritiesSupported", ct.c_int), 105 ("globalL1CacheSupported", ct.c_int), 106 ("localL1CacheSupported", ct.c_int), 107 ("sharedMemPerMultiprocessor", ct.c_size_t), 108 ("regsPerMultiprocessor", ct.c_int), 109 ("managedMemSupported", ct.c_int), 110 ("isMultiGpuBoard", ct.c_int), 111 ("multiGpuBoardGroupID", ct.c_int), 112 # Pad with extra space to avoid dereference crashes if future 113 # versions of CUDA extend the size of this struct. 114 ("__future_buffer", ct.c_char * 4096) 115 ] 116 117 118def _gather_gpu_devices_cudart(): 119 """Try to gather NVidia GPU device information via libcudart.""" 120 dev_info = [] 121 122 system = platform.system() 123 if system == "Linux": 124 libcudart = ct.cdll.LoadLibrary("libcudart.so") 125 elif system == "Darwin": 126 libcudart = ct.cdll.LoadLibrary("libcudart.dylib") 127 elif system == "Windows": 128 libcudart = ct.windll.LoadLibrary("libcudart.dll") 129 else: 130 raise NotImplementedError("Cannot identify system.") 131 132 version = ct.c_int() 133 rc = libcudart.cudaRuntimeGetVersion(ct.byref(version)) 134 if rc != 0: 135 raise ValueError("Could not get version") 136 if version.value < 6050: 137 raise NotImplementedError("CUDA version must be between >= 6.5") 138 139 device_count = ct.c_int() 140 libcudart.cudaGetDeviceCount(ct.byref(device_count)) 141 142 for i in range(device_count.value): 143 properties = CUDADeviceProperties() 144 rc = libcudart.cudaGetDeviceProperties(ct.byref(properties), i) 145 if rc != 0: 146 raise ValueError("Could not get device properties") 147 pci_bus_id = " " * 13 148 rc = libcudart.cudaDeviceGetPCIBusId(ct.c_char_p(pci_bus_id), 13, i) 149 if rc != 0: 150 raise ValueError("Could not get device PCI bus id") 151 152 info = test_log_pb2.GPUInfo() # No UUID available 153 info.model = properties.name 154 info.bus_id = pci_bus_id 155 dev_info.append(info) 156 157 del properties 158 159 return dev_info 160 161 162def gather_gpu_devices(): 163 """Gather gpu device info. 164 165 Returns: 166 A list of test_log_pb2.GPUInfo messages. 167 """ 168 try: 169 # Prefer using /proc if possible, it provides the UUID. 170 dev_info = _gather_gpu_devices_proc() 171 if not dev_info: 172 raise ValueError("No devices found") 173 return dev_info 174 except (IOError, ValueError, errors.OpError): 175 pass 176 177 try: 178 # Fall back on using libcudart 179 return _gather_gpu_devices_cudart() 180 except (OSError, ValueError, NotImplementedError, errors.OpError): 181 return [] 182