1# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Library for getting system information during TensorFlow tests."""
16
17from __future__ import absolute_import
18from __future__ import division
19from __future__ import print_function
20
21import ctypes as ct
22import platform
23
24from tensorflow.core.util import test_log_pb2
25from tensorflow.python.framework import errors
26from tensorflow.python.platform import gfile
27
28
def _gather_gpu_devices_proc():
  """Try to gather NVidia GPU device information via /proc/driver.

  Reads every `/proc/driver/nvidia/gpus/<bus id>/information` file and parses
  its `Key: value` lines.

  Returns:
    A list of test_log_pb2.GPUInfo messages, one per information file found.
    `model` and `uuid` fall back to "Unknown" when the file has no "Model" or
    "GPU UUID" entry. The list is empty if no file matches the glob.
  """
  dev_info = []
  for f in gfile.Glob("/proc/driver/nvidia/gpus/*/information"):
    # The path is /proc/driver/nvidia/gpus/<bus id>/information; splitting on
    # "/" yields an empty first component, so the bus id sits at index 5.
    bus_id = f.split("/")[5]
    key_values = {}
    # Use a context manager so the file handle is closed deterministically.
    with gfile.GFile(f, "r") as info_file:
      for line in info_file:
        key, sep, value = line.rstrip().replace("\t", "").partition(":")
        if not sep:
          # Blank or malformed line: skip it rather than letting one bad line
          # abort parsing of the whole file.
          continue
        key_values[key.lower()] = value.strip(" ")
    info = test_log_pb2.GPUInfo()
    info.model = key_values.get("model", "Unknown")
    info.uuid = key_values.get("gpu uuid", "Unknown")
    info.bus_id = bus_id
    dev_info.append(info)
  return dev_info
44
45
class CUDADeviceProperties(ct.Structure):
  """ctypes layout used to receive the CUDA runtime's cudaDeviceProp struct.

  An instance is passed by reference to `cudaGetDeviceProperties`, which fills
  it in. ctypes lays the fields out in the order listed, so the order and C
  types below must stay in sync with the C definition; do not reorder, insert
  or remove entries without checking the CUDA header.

  The final `__future_buffer` entry is not a real CUDA field: it is trailing
  padding so that a runtime whose struct is larger than the fields listed
  here still writes into memory owned by this object.
  """
  # See $CUDA_HOME/include/cuda_runtime_api.h for the definition of
  # the cudaDeviceProp struct.
  _fields_ = [
      ("name", ct.c_char * 256),
      ("totalGlobalMem", ct.c_size_t),
      ("sharedMemPerBlock", ct.c_size_t),
      ("regsPerBlock", ct.c_int),
      ("warpSize", ct.c_int),
      ("memPitch", ct.c_size_t),
      ("maxThreadsPerBlock", ct.c_int),
      ("maxThreadsDim", ct.c_int * 3),
      ("maxGridSize", ct.c_int * 3),
      ("clockRate", ct.c_int),
      ("totalConstMem", ct.c_size_t),
      ("major", ct.c_int),
      ("minor", ct.c_int),
      ("textureAlignment", ct.c_size_t),
      ("texturePitchAlignment", ct.c_size_t),
      ("deviceOverlap", ct.c_int),
      ("multiProcessorCount", ct.c_int),
      ("kernelExecTimeoutEnabled", ct.c_int),
      ("integrated", ct.c_int),
      ("canMapHostMemory", ct.c_int),
      ("computeMode", ct.c_int),
      ("maxTexture1D", ct.c_int),
      ("maxTexture1DMipmap", ct.c_int),
      ("maxTexture1DLinear", ct.c_int),
      ("maxTexture2D", ct.c_int * 2),
      ("maxTexture2DMipmap", ct.c_int * 2),
      ("maxTexture2DLinear", ct.c_int * 3),
      ("maxTexture2DGather", ct.c_int * 2),
      ("maxTexture3D", ct.c_int * 3),
      ("maxTexture3DAlt", ct.c_int * 3),
      ("maxTextureCubemap", ct.c_int),
      ("maxTexture1DLayered", ct.c_int * 2),
      ("maxTexture2DLayered", ct.c_int * 3),
      ("maxTextureCubemapLayered", ct.c_int * 2),
      ("maxSurface1D", ct.c_int),
      ("maxSurface2D", ct.c_int * 2),
      ("maxSurface3D", ct.c_int * 3),
      ("maxSurface1DLayered", ct.c_int * 2),
      ("maxSurface2DLayered", ct.c_int * 3),
      ("maxSurfaceCubemap", ct.c_int),
      ("maxSurfaceCubemapLayered", ct.c_int * 2),
      ("surfaceAlignment", ct.c_size_t),
      ("concurrentKernels", ct.c_int),
      ("ECCEnabled", ct.c_int),
      ("pciBusID", ct.c_int),
      ("pciDeviceID", ct.c_int),
      ("pciDomainID", ct.c_int),
      ("tccDriver", ct.c_int),
      ("asyncEngineCount", ct.c_int),
      ("unifiedAddressing", ct.c_int),
      ("memoryClockRate", ct.c_int),
      ("memoryBusWidth", ct.c_int),
      ("l2CacheSize", ct.c_int),
      ("maxThreadsPerMultiProcessor", ct.c_int),
      ("streamPrioritiesSupported", ct.c_int),
      ("globalL1CacheSupported", ct.c_int),
      ("localL1CacheSupported", ct.c_int),
      ("sharedMemPerMultiprocessor", ct.c_size_t),
      ("regsPerMultiprocessor", ct.c_int),
      ("managedMemSupported", ct.c_int),
      ("isMultiGpuBoard", ct.c_int),
      ("multiGpuBoardGroupID", ct.c_int),
      # Pad with extra space to avoid dereference crashes if future
      # versions of CUDA extend the size of this struct.
      ("__future_buffer", ct.c_char * 4096)
  ]
116
117
def _gather_gpu_devices_cudart():
  """Try to gather NVidia GPU device information via libcudart.

  Returns:
    A list of test_log_pb2.GPUInfo messages, one per device reported by
    cudaGetDeviceCount. Only `model` and `bus_id` are populated; no UUID is
    available through this path.

  Raises:
    OSError: If the CUDA runtime library cannot be loaded (raised by ctypes).
    NotImplementedError: If the platform is not Linux, Darwin or Windows, or
      the CUDA runtime reports a version older than 6.5.
    ValueError: If a CUDA runtime query returns a non-zero status.
  """
  # Size of the output buffer handed to cudaDeviceGetPCIBusId, including the
  # terminating NUL.
  pci_bus_id_len = 13

  system = platform.system()
  if system == "Linux":
    libcudart = ct.cdll.LoadLibrary("libcudart.so")
  elif system == "Darwin":
    libcudart = ct.cdll.LoadLibrary("libcudart.dylib")
  elif system == "Windows":
    libcudart = ct.windll.LoadLibrary("libcudart.dll")
  else:
    raise NotImplementedError("Cannot identify system.")

  version = ct.c_int()
  rc = libcudart.cudaRuntimeGetVersion(ct.byref(version))
  if rc != 0:
    raise ValueError("Could not get version")
  if version.value < 6050:
    raise NotImplementedError("CUDA version must be >= 6.5")

  # The status of this call is deliberately not checked (unchanged from the
  # previous behavior): the counter starts at 0, so a failed query normally
  # just yields an empty result.
  device_count = ct.c_int()
  libcudart.cudaGetDeviceCount(ct.byref(device_count))

  dev_info = []
  for i in range(device_count.value):
    properties = CUDADeviceProperties()
    rc = libcudart.cudaGetDeviceProperties(ct.byref(properties), i)
    if rc != 0:
      raise ValueError("Could not get device properties")

    # The runtime writes into this buffer, so it must be a mutable ctypes
    # buffer. Passing a Python string via c_char_p would scribble over an
    # immutable object (and raises TypeError on Python 3).
    pci_bus_id = ct.create_string_buffer(pci_bus_id_len)
    rc = libcudart.cudaDeviceGetPCIBusId(pci_bus_id, pci_bus_id_len, i)
    if rc != 0:
      raise ValueError("Could not get device PCI bus id")

    info = test_log_pb2.GPUInfo()  # No UUID available
    # `.name` and `.value` yield the bytes up to the first NUL; decode them so
    # the proto string fields receive text on both Python 2 and Python 3.
    info.model = properties.name.decode("utf-8", "replace")
    info.bus_id = pci_bus_id.value.decode("utf-8", "replace")
    dev_info.append(info)

  return dev_info
160
161
def gather_gpu_devices():
  """Collect information about the GPUs visible on this machine.

  Two sources are tried in order: the NVidia /proc/driver files first, then
  libcudart. Failures from either source are swallowed, so this function is
  best-effort.

  Returns:
    A list of test_log_pb2.GPUInfo messages. The list is empty when the
    libcudart fallback fails with one of the handled errors.
  """
  # /proc is the preferred source because it also reports the GPU UUID.
  try:
    proc_devices = _gather_gpu_devices_proc()
  except (IOError, ValueError, errors.OpError):
    proc_devices = None
  if proc_devices:
    return proc_devices

  # Nothing usable from /proc (error or no devices): ask libcudart instead.
  try:
    cudart_devices = _gather_gpu_devices_cudart()
  except (OSError, ValueError, NotImplementedError, errors.OpError):
    cudart_devices = []
  return cudart_devices
182