19ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 29ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 39ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing CaiLicensed under the Apache License, Version 2.0 (the "License"); 49ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Caiyou may not use this file except in compliance with the License. 59ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing CaiYou may obtain a copy of the License at 69ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 79ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai http://www.apache.org/licenses/LICENSE-2.0 89ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 99ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing CaiUnless required by applicable law or agreed to in writing, software 109ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Caidistributed under the License is distributed on an "AS IS" BASIS, 119ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing CaiWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 129ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing CaiSee the License for the specific language governing permissions and 139ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cailimitations under the License. 149ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai==============================================================================*/ 159ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 169ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include "tensorflow/core/debug/debug_io_utils.h" 179ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 183c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai#include <stddef.h> 193c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai#include <string.h> 203c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai#include <cmath> 213c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai#include <limits> 223c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai#include <utility> 239ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include <vector> 249ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 2541803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS 26ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai#include "grpc++/create_channel.h" 2741803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#else 281cb96893a64f59b7265f9def9968f7bed1e57662Andrew Harp// winsock2.h is used in grpc, so Ws2_32.lib is needed 29c902ec6bf03aa1612c81ae65beb4cc3eef190ff4A. Unique TensorFlower#pragma comment(lib, "Ws2_32.lib") 3041803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#endif // #ifndef PLATFORM_WINDOWS 311cb96893a64f59b7265f9def9968f7bed1e57662Andrew Harp 325ce3523bcc844217b47e7f862c1bed894cbaa34eA. Unique TensorFlower#include "tensorflow/core/debug/debug_callback_registry.h" 331e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower#include "tensorflow/core/debug/debugger_event_metadata.pb.h" 34e85d3df92deb9d717befdf173966a2913ac2aea0Geoffrey Irving#include "tensorflow/core/framework/graph.pb.h" 359ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include "tensorflow/core/framework/summary.pb.h" 36c902ec6bf03aa1612c81ae65beb4cc3eef190ff4A. Unique TensorFlower#include "tensorflow/core/framework/tensor_shape.pb.h" 373f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai#include "tensorflow/core/lib/core/bits.h" 3812ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai#include "tensorflow/core/lib/hash/hash.h" 399ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include "tensorflow/core/lib/io/path.h" 409ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include "tensorflow/core/lib/strings/str_util.h" 419142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai#include "tensorflow/core/lib/strings/stringprintf.h" 421e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower#include "tensorflow/core/platform/protobuf.h" 439ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include "tensorflow/core/util/event.pb.h" 449ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 4541803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#define GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR \ 4641803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai return errors::Unimplemented( \ 4741803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai kGrpcURLScheme, " debug URL scheme is not implemented on Windows yet.") 48d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai 499ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cainamespace tensorflow { 509ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 519ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cainamespace { 529ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 533f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Creates an Event proto representing a chunk of a Tensor. This method only 543f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// populates the field of the Event proto that represent the envelope 553f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// informaion (e.g., timestmap, device_name, num_chunks, chunk_index, dtype, 563f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// shape). It does not set the value.tensor field, which should be set by the 573f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// caller separately. 583f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing CaiEvent PrepareChunkEventProto(const DebugNodeKey& debug_node_key, 593f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const uint64 wall_time_us, const size_t num_chunks, 603f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t chunk_index, 613f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const DataType& tensor_dtype, 623f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const TensorShapeProto& tensor_shape) { 639ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai Event event; 649ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai event.set_wall_time(static_cast<double>(wall_time_us)); 653f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai Summary::Value* value = event.mutable_summary()->add_value(); 669ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 679ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai // Create the debug node_name in the Summary proto. 689ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai // For example, if tensor_name = "foo/node_a:0", and the debug_op is 699ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai // "DebugIdentity", the debug node_name in the Summary proto will be 709ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai // "foo/node_a:0:DebugIdentity". 713f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai value->set_node_name(debug_node_key.debug_node_name); 721e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower 733f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai // Tag by the node name. This allows TensorBoard to quickly fetch data 743f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai // per op. 753f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai value->set_tag(debug_node_key.node_name); 761e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower 771e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower // Store data within debugger metadata to be stored for each event. 781e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower third_party::tensorflow::core::debug::DebuggerEventMetadata metadata; 791e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower metadata.set_device(debug_node_key.device_name); 801e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower metadata.set_output_slot(debug_node_key.output_slot); 813f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai metadata.set_num_chunks(num_chunks); 823f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai metadata.set_chunk_index(chunk_index); 831e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower 841e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower // Encode the data in JSON. 851e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower string json_output; 861e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower tensorflow::protobuf::util::JsonPrintOptions json_options; 871e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower json_options.always_print_primitive_fields = true; 881e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower auto status = tensorflow::protobuf::util::MessageToJsonString( 891e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower metadata, &json_output, json_options); 901e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower if (status.ok()) { 911e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower // Store summary metadata. Set the plugin to use this data as "debugger". 921e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower SummaryMetadata::PluginData* plugin_data = 934c60c96257bfd54a036d15af979e90fc0b4e400dA. Unique TensorFlower value->mutable_metadata()->mutable_plugin_data(); 943f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai plugin_data->set_plugin_name(DebugIO::kDebuggerPluginName); 951e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower plugin_data->set_content(json_output); 961e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower } else { 971e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower LOG(WARNING) << "Failed to convert DebuggerEventMetadata proto to JSON. " 981e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower << "The debug_node_name is " << debug_node_key.debug_node_name 991e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower << "."; 1001e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower } 1019ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 1023f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai value->mutable_tensor()->set_dtype(tensor_dtype); 1033f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai *value->mutable_tensor()->mutable_tensor_shape() = tensor_shape; 1043f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1053f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai return event; 1063f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai} 1073f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1083f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Translates the length of a string to number of bytes when the string is 1093f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// encoded as bytes in protobuf. Note that this makes a conservative estimate 1103f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// (i.e., an estimate that is usually too large, but never too small under the 1113f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// gRPC message size limit) of the Varint-encoded length, to workaround the lack 1123f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// of a portable length function. 1133f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Caiconst size_t StringValMaxBytesInProto(const string& str) { 1143f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai#if defined(PLATFORM_GOOGLE) 1153f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai return str.size() + DebugGrpcIO::kGrpcMaxVarintLengthSize; 1163f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai#else 1173f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai return str.size(); 1183f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai#endif 1193f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai} 1203f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1213f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Breaks a string Tensor (represented as a TensorProto) as a vector of Event 1223f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// protos. 1233f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing CaiStatus WrapStringTensorAsEvents(const DebugNodeKey& debug_node_key, 1243f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const uint64 wall_time_us, 1253f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t chunk_size_limit, 1263f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai TensorProto* tensor_proto, 1273f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai std::vector<Event>* events) { 1283f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const protobuf::RepeatedPtrField<string>& strs = tensor_proto->string_val(); 1293f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t num_strs = strs.size(); 1303f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t chunk_size_ub = chunk_size_limit > 0 1313f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai ? chunk_size_limit 1323f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai : std::numeric_limits<size_t>::max(); 1333f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1343f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai // E.g., if cutoffs is {j, k, l}, the chunks will have index ranges: 1353f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai // [0:a), [a:b), [c:<end>]. 1363f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai std::vector<size_t> cutoffs; 1373f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai size_t chunk_size = 0; 1383f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai for (size_t i = 0; i < num_strs; ++i) { 1393f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai // Take into account the extra bytes in proto buffer. 1403f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai if (StringValMaxBytesInProto(strs[i]) > chunk_size_ub) { 1413f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai return errors::FailedPrecondition( 1423f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai "string value at index ", i, " from debug node ", 1433f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai debug_node_key.debug_node_name, 1443f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai " does not fit gRPC message size limit (", chunk_size_ub, ")"); 1453f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai } 1463f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai if (chunk_size + StringValMaxBytesInProto(strs[i]) > chunk_size_ub) { 1473f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai cutoffs.push_back(i); 1483f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai chunk_size = 0; 1493f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai } 1503f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai chunk_size += StringValMaxBytesInProto(strs[i]); 1513f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai } 1523f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai cutoffs.push_back(num_strs); 1533f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t num_chunks = cutoffs.size(); 1543f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1553f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai for (size_t i = 0; i < num_chunks; ++i) { 1563f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai Event event = PrepareChunkEventProto(debug_node_key, wall_time_us, 1573f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai num_chunks, i, tensor_proto->dtype(), 1583f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai tensor_proto->tensor_shape()); 1593f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai Summary::Value* value = event.mutable_summary()->mutable_value(0); 1603f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1613f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai if (cutoffs.size() == 1) { 1623f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai value->mutable_tensor()->mutable_string_val()->Swap( 1633f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai tensor_proto->mutable_string_val()); 1643f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai } else { 1653f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t begin = (i == 0) ? 0 : cutoffs[i - 1]; 1663f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t end = cutoffs[i]; 1673f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai for (size_t j = begin; j < end; ++j) { 1683f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai value->mutable_tensor()->add_string_val(strs[j]); 1693f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai } 1703f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai } 1713f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1723f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai events->push_back(std::move(event)); 1733f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai } 1743f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1753f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai return Status::OK(); 1763f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai} 1773f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1783f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Encapsulates the tensor value inside a vector of Event protos. Large tensors 1793f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// are broken up to multiple protos to fit the chunk_size_limit. In each Event 1803f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// proto the field summary.tensor carries the content of the tensor. 1813f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// If chunk_size_limit <= 0, the tensor will not be broken into chunks, i.e., a 1823f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// length-1 vector will be returned, regardless of the size of the tensor. 1833f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing CaiStatus WrapTensorAsEvents(const DebugNodeKey& debug_node_key, 1843f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const Tensor& tensor, const uint64 wall_time_us, 1853f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t chunk_size_limit, 1863f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai std::vector<Event>* events) { 1873f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai TensorProto tensor_proto; 1889ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai if (tensor.dtype() == DT_STRING) { 1893f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai // Treat DT_STRING specially, so that tensor_util.MakeNdarray in Python can 1903f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai // convert the TensorProto to string-type numpy array. MakeNdarray does not 1913f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai // work with strings encoded by AsProtoTensorContent() in tensor_content. 1923f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai tensor.AsProtoField(&tensor_proto); 1933f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1943f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai TF_RETURN_IF_ERROR(WrapStringTensorAsEvents( 1953f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai debug_node_key, wall_time_us, chunk_size_limit, &tensor_proto, events)); 1969ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } else { 1973f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai tensor.AsProtoTensorContent(&tensor_proto); 1983f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1993f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t total_length = tensor_proto.tensor_content().size(); 2003f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t chunk_size_ub = 2013f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai chunk_size_limit > 0 ? chunk_size_limit : total_length; 2023f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t num_chunks = 2033f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai (total_length == 0) 2043f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai ? 1 2053f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai : (total_length + chunk_size_ub - 1) / chunk_size_ub; 2063f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai for (size_t i = 0; i < num_chunks; ++i) { 2073f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t pos = i * chunk_size_ub; 2083f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t len = 2093f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai (i == num_chunks - 1) ? (total_length - pos) : chunk_size_ub; 2103f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai Event event = PrepareChunkEventProto(debug_node_key, wall_time_us, 2113f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai num_chunks, i, tensor_proto.dtype(), 2123f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai tensor_proto.tensor_shape()); 2133f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai event.mutable_summary() 2143f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai ->mutable_value(0) 2153f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai ->mutable_tensor() 2163f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai ->set_tensor_content(tensor_proto.tensor_content().substr(pos, len)); 2173f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai events->push_back(std::move(event)); 2183f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai } 2199ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 2209ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 2213f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai return Status::OK(); 2229ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai} 2239ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 2243f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Appends an underscore and a timestamp to a file path. If the path already 2259142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai// exists on the file system, append a hyphen and a 1-up index. Consecutive 2269142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai// values of the index will be tried until the first unused one is found. 2279142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai// TOCTOU race condition is not of concern here due to the fact that tfdbg 2289142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai// sets parallel_iterations attribute of all while_loops to 1 to prevent 2299142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai// the same node from between executed multiple times concurrently. 2309142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Caistring AppendTimestampToFilePath(const string& in, const uint64 timestamp) { 2319142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai string out = strings::StrCat(in, "_", timestamp); 2329142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai 2339142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai uint64 i = 1; 2349142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai while (Env::Default()->FileExists(out).ok()) { 2359142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai out = strings::StrCat(in, "_", timestamp, "-", i); 2369142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai ++i; 2379142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai } 2389142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai return out; 2399142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai} 2409142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai 24141803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS 2423f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Publishes encoded GraphDef through a gRPC debugger stream, in chunks, 2433f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// conforming to the gRPC message size limit. 24412ac2f34fadc8802121382c64588d9f9c2f58390Shanqing CaiStatus PublishEncodedGraphDefInChunks(const string& encoded_graph_def, 24512ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai const string& device_name, 24612ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai const int64 wall_time, 24712ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai const string& debug_url) { 24812ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai const uint64 hash = ::tensorflow::Hash64(encoded_graph_def); 24912ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai const size_t total_length = encoded_graph_def.size(); 2503f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t num_chunks = 2513f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai static_cast<size_t>(std::ceil(static_cast<float>(total_length) / 2523f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai DebugGrpcIO::kGrpcMessageSizeLimitBytes)); 25312ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai for (size_t i = 0; i < num_chunks; ++i) { 2543f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t pos = i * DebugGrpcIO::kGrpcMessageSizeLimitBytes; 2553f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t len = (i == num_chunks - 1) 2563f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai ? (total_length - pos) 2573f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai : DebugGrpcIO::kGrpcMessageSizeLimitBytes; 25812ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai Event event; 25912ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai event.set_wall_time(static_cast<double>(wall_time)); 26012ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai // Prefix the chunk with 26112ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai // <hash64>,<device_name>,<wall_time>|<index>|<num_chunks>|. 2623f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai // TODO(cais): Use DebuggerEventMetadata to store device_name, num_chunks 2633f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai // and chunk_index, instead. 26412ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai event.set_graph_def(strings::StrCat(hash, ",", device_name, ",", wall_time, 26512ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai "|", i, "|", num_chunks, "|", 26612ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai encoded_graph_def.substr(pos, len))); 267ecae82d1343df293fa36e67949e5404111817110Shanqing Cai const Status s = DebugGrpcIO::SendEventProtoThroughGrpcStream( 268ecae82d1343df293fa36e67949e5404111817110Shanqing Cai event, debug_url, num_chunks - 1 == i); 269ecae82d1343df293fa36e67949e5404111817110Shanqing Cai if (!s.ok()) { 27012ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai return errors::FailedPrecondition( 27112ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai "Failed to send chunk ", i, " of ", num_chunks, 272ecae82d1343df293fa36e67949e5404111817110Shanqing Cai " of encoded GraphDef of size ", encoded_graph_def.size(), " bytes, ", 273ecae82d1343df293fa36e67949e5404111817110Shanqing Cai "due to: ", s.error_message()); 27412ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai } 27512ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai } 27612ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai return Status::OK(); 27712ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai} 27841803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#endif // #ifndef PLATFORM_WINDOWS 27912ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai 2809ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai} // namespace 2819ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 2823f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Caiconst char* const DebugIO::kDebuggerPluginName = "debugger"; 2833f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 284cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Caiconst char* const DebugIO::kCoreMetadataTag = "core_metadata_"; 285cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai 286cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Caiconst char* const DebugIO::kGraphTag = "graph_"; 287cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai 288525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Caiconst char* const DebugIO::kHashTag = "hash"; 289525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Cai 290ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing CaiStatus ReadEventFromFile(const string& dump_file_path, Event* event) { 291ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai Env* env(Env::Default()); 292ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 293ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai string content; 294ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai uint64 file_size = 0; 295ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 296ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai Status s = env->GetFileSize(dump_file_path, &file_size); 297ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai if (!s.ok()) { 298ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return s; 299ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 300ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 301ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai content.resize(file_size); 302ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 303ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai std::unique_ptr<RandomAccessFile> file; 304ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai s = env->NewRandomAccessFile(dump_file_path, &file); 305ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai if (!s.ok()) { 306ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return s; 307ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 308ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 309ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai StringPiece result; 310ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai s = file->Read(0, file_size, &result, &(content)[0]); 311ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai if (!s.ok()) { 312ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return s; 313ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 314ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 315ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai event->ParseFromString(content); 316ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return Status::OK(); 317ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai} 318ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 3199ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Caiconst char* const DebugIO::kFileURLScheme = "file://"; 3209ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Caiconst char* const DebugIO::kGrpcURLScheme = "grpc://"; 3215ce3523bcc844217b47e7f862c1bed894cbaa34eA. Unique TensorFlowerconst char* const DebugIO::kMemoryURLScheme = "memcbk://"; 3229ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 3233f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Publishes debug metadata to a set of debug URLs. 324aabc7972b94af5a678550427534d4fba7fda327cShanqing CaiStatus DebugIO::PublishDebugMetadata( 325258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const int64 global_step, const int64 session_run_index, 326258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const int64 executor_step_index, const std::vector<string>& input_names, 327aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai const std::vector<string>& output_names, 328aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai const std::vector<string>& target_nodes, 329aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai const std::unordered_set<string>& debug_urls) { 330aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai std::ostringstream oss; 331aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai 332aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai // Construct a JSON string to carry the metadata. 333aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "{"; 334aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "\"global_step\":" << global_step << ","; 335258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai oss << "\"session_run_index\":" << session_run_index << ","; 336258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai oss << "\"executor_step_index\":" << executor_step_index << ","; 337aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "\"input_names\":["; 338aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai for (size_t i = 0; i < input_names.size(); ++i) { 339aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "\"" << input_names[i] << "\""; 340aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai if (i < input_names.size() - 1) { 341aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << ","; 342aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai } 343aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai } 344aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "],"; 345aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "\"output_names\":["; 346aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai for (size_t i = 0; i < output_names.size(); ++i) { 347aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "\"" << output_names[i] << "\""; 348aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai if (i < output_names.size() - 1) { 349aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << ","; 350aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai } 351aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai } 352aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "],"; 353aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "\"target_nodes\":["; 354aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai for (size_t i = 0; i < target_nodes.size(); ++i) { 355aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "\"" << target_nodes[i] << "\""; 356aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai if (i < target_nodes.size() - 1) { 357aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << ","; 358aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai } 359aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai } 360aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "]"; 361aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "}"; 362aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai 363aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai const string json_metadata = oss.str(); 364aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai Event event; 365aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai event.set_wall_time(static_cast<double>(Env::Default()->NowMicros())); 366aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai LogMessage* log_message = event.mutable_log_message(); 367aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai log_message->set_message(json_metadata); 368aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai 369aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai Status status; 370aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai for (const string& url : debug_urls) { 371aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai if (str_util::Lowercase(url).find(kGrpcURLScheme) == 0) { 37241803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS 373aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai Event grpc_event; 374aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai 375aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai // Determine the path (if any) in the grpc:// URL, and add it as a field 376aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai // of the JSON string. 377aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai const string address = url.substr(strlen(DebugIO::kFileURLScheme)); 378aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai const string path = address.find("/") == string::npos 379aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai ? "" 380aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai : address.substr(address.find("/")); 381aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai grpc_event.set_wall_time(event.wall_time()); 382aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai LogMessage* log_message_grpc = grpc_event.mutable_log_message(); 383aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai log_message_grpc->set_message( 384aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai strings::StrCat(json_metadata.substr(0, json_metadata.size() - 1), 385aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai ",\"grpc_path\":\"", path, "\"}")); 386aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai 387aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai status.Update( 388ecae82d1343df293fa36e67949e5404111817110Shanqing Cai DebugGrpcIO::SendEventProtoThroughGrpcStream(grpc_event, url, true)); 389d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#else 39041803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR; 391d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#endif 392aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai } else if (str_util::Lowercase(url).find(kFileURLScheme) == 0) { 393aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai const string dump_root_dir = url.substr(strlen(kFileURLScheme)); 3949142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai const string core_metadata_path = AppendTimestampToFilePath( 3959142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai io::JoinPath( 3969142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai dump_root_dir, 3975ce3523bcc844217b47e7f862c1bed894cbaa34eA. Unique TensorFlower strings::StrCat(DebugNodeKey::kMetadataFilePrefix, 398cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai DebugIO::kCoreMetadataTag, "sessionrun", 399258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai strings::Printf("%.14lld", session_run_index))), 4009142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai Env::Default()->NowMicros()); 4019142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai status.Update(DebugFileIO::DumpEventProtoToFile( 4029142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai event, io::Dirname(core_metadata_path).ToString(), 4039142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai io::Basename(core_metadata_path).ToString())); 404aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai } 405aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai } 406aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai 407aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai return status; 408aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai} 409aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai 410258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing CaiStatus DebugIO::PublishDebugTensor(const DebugNodeKey& debug_node_key, 411258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const Tensor& tensor, 4129ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai const uint64 wall_time_us, 413ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai const gtl::ArraySlice<string>& debug_urls, 414ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai const bool gated_grpc) { 4158b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai int32 num_failed_urls = 0; 416ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai std::vector<Status> fail_statuses; 4179ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai for (const string& url : debug_urls) { 4189ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai if (str_util::Lowercase(url).find(kFileURLScheme) == 0) { 4199ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai const string dump_root_dir = url.substr(strlen(kFileURLScheme)); 4209ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 421258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai Status s = DebugFileIO::DumpTensorToDir( 422258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai debug_node_key, tensor, wall_time_us, dump_root_dir, nullptr); 4239ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai if (!s.ok()) { 4249ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai num_failed_urls++; 425ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai fail_statuses.push_back(s); 4269ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 4279ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } else if (str_util::Lowercase(url).find(kGrpcURLScheme) == 0) { 42841803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS 429ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai Status s = DebugGrpcIO::SendTensorThroughGrpcStream( 430258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai debug_node_key, tensor, wall_time_us, url, gated_grpc); 431ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 432ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai if (!s.ok()) { 433ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai num_failed_urls++; 434ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai fail_statuses.push_back(s); 435ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 436d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#else 43741803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR; 438d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#endif 4395ce3523bcc844217b47e7f862c1bed894cbaa34eA. Unique TensorFlower } else if (str_util::Lowercase(url).find(kMemoryURLScheme) == 0) { 4405ce3523bcc844217b47e7f862c1bed894cbaa34eA. Unique TensorFlower const string dump_root_dir = url.substr(strlen(kMemoryURLScheme)); 4415ce3523bcc844217b47e7f862c1bed894cbaa34eA. Unique TensorFlower auto* callback_registry = DebugCallbackRegistry::singleton(); 4425ce3523bcc844217b47e7f862c1bed894cbaa34eA. Unique TensorFlower auto* callback = callback_registry->GetCallback(dump_root_dir); 4435ce3523bcc844217b47e7f862c1bed894cbaa34eA. Unique TensorFlower CHECK(callback) << "No callback registered for: " << dump_root_dir; 4445ce3523bcc844217b47e7f862c1bed894cbaa34eA. Unique TensorFlower (*callback)(debug_node_key, tensor); 4459ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } else { 4469ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai return Status(error::UNAVAILABLE, 4479ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai strings::StrCat("Invalid debug target URL: ", url)); 4489ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 4499ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 4509ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 4519ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai if (num_failed_urls == 0) { 4529ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai return Status::OK(); 4539ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } else { 454ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai string error_message = strings::StrCat( 455ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai "Publishing to ", num_failed_urls, " of ", debug_urls.size(), 456ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai " debug target URLs failed, due to the following errors:"); 457ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai for (Status& status : fail_statuses) { 458ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai error_message = 459ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai strings::StrCat(error_message, " ", status.error_message(), ";"); 460ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 461ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 462ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return Status(error::INTERNAL, error_message); 4639ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 4649ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai} 4659ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 466258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing CaiStatus DebugIO::PublishDebugTensor(const DebugNodeKey& debug_node_key, 467258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const Tensor& tensor, 468ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai const uint64 wall_time_us, 469ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai const gtl::ArraySlice<string>& debug_urls) { 470258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai return PublishDebugTensor(debug_node_key, tensor, wall_time_us, debug_urls, 471258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai false); 472ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai} 473ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 47412ac2f34fadc8802121382c64588d9f9c2f58390Shanqing CaiStatus DebugIO::PublishGraph(const Graph& graph, const string& device_name, 4754a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai const std::unordered_set<string>& debug_urls) { 4764a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai GraphDef graph_def; 4774a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai graph.ToGraphDef(&graph_def); 4784a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai 4794a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai string buf; 4804a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai graph_def.SerializeToString(&buf); 4814a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai 4824a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai const int64 now_micros = Env::Default()->NowMicros(); 4834a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai Event event; 4844a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai event.set_wall_time(static_cast<double>(now_micros)); 4854a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai event.set_graph_def(buf); 4864a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai 4874a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai Status status = Status::OK(); 4884a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai for (const string& debug_url : debug_urls) { 4894a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai if (debug_url.find(kFileURLScheme) == 0) { 490cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai const string dump_root_dir = 491cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai io::JoinPath(debug_url.substr(strlen(kFileURLScheme)), 492cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai DebugNodeKey::DeviceNameToDevicePath(device_name)); 493525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Cai const uint64 graph_hash = ::tensorflow::Hash64(buf); 494525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Cai const string file_name = 4955ce3523bcc844217b47e7f862c1bed894cbaa34eA. Unique TensorFlower strings::StrCat(DebugNodeKey::kMetadataFilePrefix, DebugIO::kGraphTag, 496525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Cai DebugIO::kHashTag, graph_hash, "_", now_micros); 4974a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai 4984a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai status.Update( 4994a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai DebugFileIO::DumpEventProtoToFile(event, dump_root_dir, file_name)); 5004a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai } else if (debug_url.find(kGrpcURLScheme) == 0) { 50141803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS 50212ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai status.Update(PublishEncodedGraphDefInChunks(buf, device_name, now_micros, 50312ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai debug_url)); 504d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#else 50541803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR; 506d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#endif 5074a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai } 5084a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai } 5094a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai 5104a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai return status; 5114a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai} 5124a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai 513617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Caibool DebugIO::IsCopyNodeGateOpen( 514617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai const std::vector<DebugWatchAndURLSpec>& specs) { 51541803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS 516617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai for (const DebugWatchAndURLSpec& spec : specs) { 517617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai if (!spec.gated_grpc || spec.url.compare(0, strlen(DebugIO::kGrpcURLScheme), 518617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai DebugIO::kGrpcURLScheme)) { 519617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai return true; 520617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai } else { 5213c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai if (DebugGrpcIO::IsReadGateOpen(spec.url, spec.watch_key)) { 522617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai return true; 523617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai } 524617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai } 525617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai } 526617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai return false; 527617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai#else 528617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai return true; 529617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai#endif 530617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai} 531617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai 532ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Caibool DebugIO::IsDebugNodeGateOpen(const string& watch_key, 533ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai const std::vector<string>& debug_urls) { 53441803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS 535ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai for (const string& debug_url : debug_urls) { 536617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai if (debug_url.compare(0, strlen(DebugIO::kGrpcURLScheme), 537617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai DebugIO::kGrpcURLScheme)) { 538ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai return true; 539ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } else { 5403c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai if (DebugGrpcIO::IsReadGateOpen(debug_url, watch_key)) { 541ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai return true; 542ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 543ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 544ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 545ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai return false; 546ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai#else 547ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai return true; 548ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai#endif 549ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai} 550ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 551ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Caibool DebugIO::IsDebugURLGateOpen(const string& watch_key, 552ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai const string& debug_url) { 55341803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS 554ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai if (debug_url.find(kGrpcURLScheme) != 0) { 555ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai return true; 556ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } else { 5573c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai return DebugGrpcIO::IsReadGateOpen(debug_url, watch_key); 558ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 559ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai#else 560ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai return true; 561ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai#endif 562ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai} 563ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 564ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing CaiStatus DebugIO::CloseDebugURL(const string& debug_url) { 565ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai if (debug_url.find(DebugIO::kGrpcURLScheme) == 0) { 56641803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS 567aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai return DebugGrpcIO::CloseGrpcStream(debug_url); 568d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#else 56941803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR; 570d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#endif 571ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } else { 572ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai // No-op for non-gRPC URLs. 573ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return Status::OK(); 574ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 575ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai} 576ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 577258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing CaiStatus DebugFileIO::DumpTensorToDir(const DebugNodeKey& debug_node_key, 578258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const Tensor& tensor, 579258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const uint64 wall_time_us, 580258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const string& dump_root_dir, 581258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai string* dump_file_path) { 582258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const string file_path = 583258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai GetDumpFilePath(dump_root_dir, debug_node_key, wall_time_us); 5849ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 5859ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai if (dump_file_path != nullptr) { 5869ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai *dump_file_path = file_path; 5879ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 5889ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 589258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai return DumpTensorToEventFile(debug_node_key, tensor, wall_time_us, file_path); 5909ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai} 5919ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 5929ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Caistring DebugFileIO::GetDumpFilePath(const string& dump_root_dir, 593258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const DebugNodeKey& debug_node_key, 5949ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai const uint64 wall_time_us) { 5959142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai return AppendTimestampToFilePath( 596cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai io::JoinPath(dump_root_dir, debug_node_key.device_path, 597258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai strings::StrCat(debug_node_key.node_name, "_", 598258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai debug_node_key.output_slot, "_", 599258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai debug_node_key.debug_op)), 6009142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai wall_time_us); 6019ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai} 6029ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 6034a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing CaiStatus DebugFileIO::DumpEventProtoToFile(const Event& event_proto, 6044a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai const string& dir_name, 6054a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai const string& file_name) { 6069ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai Env* env(Env::Default()); 6079ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 6084a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai Status s = RecursiveCreateDir(env, dir_name); 6099ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai if (!s.ok()) { 6109ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai return Status(error::FAILED_PRECONDITION, 6114a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai strings::StrCat("Failed to create directory ", dir_name, 6129ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai ", due to: ", s.error_message())); 6139ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 6149ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 6154a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai const string file_path = io::JoinPath(dir_name, file_name); 6169ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 6179ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai string event_str; 6184a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai event_proto.SerializeToString(&event_str); 6199ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 6209ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai std::unique_ptr<WritableFile> f = nullptr; 6219ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai TF_CHECK_OK(env->NewWritableFile(file_path, &f)); 622bc225bfaa534acc25047fe844f19edc333b7a76aPeter Hawkins f->Append(event_str).IgnoreError(); 6239ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai TF_CHECK_OK(f->Close()); 6249ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 6259ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai return Status::OK(); 6269ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai} 6279ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 628258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing CaiStatus DebugFileIO::DumpTensorToEventFile(const DebugNodeKey& debug_node_key, 629258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const Tensor& tensor, 630258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const uint64 wall_time_us, 631258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const string& file_path) { 6323f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai std::vector<Event> events; 6333f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai TF_RETURN_IF_ERROR( 6343f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai WrapTensorAsEvents(debug_node_key, tensor, wall_time_us, 0, &events)); 6353f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai return DumpEventProtoToFile(events[0], io::Dirname(file_path).ToString(), 6363f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai io::Basename(file_path).ToString()); 6374a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai} 6384a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai 6399ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing CaiStatus DebugFileIO::RecursiveCreateDir(Env* env, const string& dir) { 640879e0accd1c833771c8058d3eb5f2d4f06f895d4Jonathan Hseu if (env->FileExists(dir).ok() && env->IsDirectory(dir).ok()) { 6419ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai // The path already exists as a directory. Return OK right away. 6429ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai return Status::OK(); 6439ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 6449ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 6459ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai string parent_dir = io::Dirname(dir).ToString(); 646879e0accd1c833771c8058d3eb5f2d4f06f895d4Jonathan Hseu if (!env->FileExists(parent_dir).ok()) { 6479ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai // The parent path does not exist yet, create it first. 6489ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai Status s = RecursiveCreateDir(env, parent_dir); // Recursive call 6499ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai if (!s.ok()) { 6509ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai return Status( 6519ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai error::FAILED_PRECONDITION, 6529ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai strings::StrCat("Failed to create directory ", parent_dir)); 6539ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 654879e0accd1c833771c8058d3eb5f2d4f06f895d4Jonathan Hseu } else if (env->FileExists(parent_dir).ok() && 6559ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai !env->IsDirectory(parent_dir).ok()) { 6569ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai // The path exists, but it is a file. 6579ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai return Status(error::FAILED_PRECONDITION, 6589ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai strings::StrCat("Failed to create directory ", parent_dir, 6599ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai " because the path exists as a file ")); 6609ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 6619ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 662bc225bfaa534acc25047fe844f19edc333b7a76aPeter Hawkins env->CreateDir(dir).IgnoreError(); 6639ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai // Guard against potential race in creating directories by doing a check 6649ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai // after the CreateDir call. 665879e0accd1c833771c8058d3eb5f2d4f06f895d4Jonathan Hseu if (env->FileExists(dir).ok() && env->IsDirectory(dir).ok()) { 6669ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai return Status::OK(); 6679ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } else { 6689ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai return Status(error::ABORTED, 6699ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai strings::StrCat("Failed to create directory ", parent_dir)); 6709ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 6719ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai} 6729ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 67341803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS 674ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing CaiDebugGrpcChannel::DebugGrpcChannel(const string& server_stream_addr) 6758b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai : server_stream_addr_(server_stream_addr), 6768b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai url_(strings::StrCat(DebugIO::kGrpcURLScheme, server_stream_addr)) {} 6778b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai 6788b219918214f779b0f4c7785ae93feffa6e492c3Shanqing CaiStatus DebugGrpcChannel::Connect(const int64 timeout_micros) { 6798b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai ::grpc::ChannelArguments args; 6808b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max()); 6818b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai // Avoid problems where default reconnect backoff is too long (e.g., 20 s). 6828b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000); 68359ae0c0f9ac654bd668fb633feef3dbe26bae8eeShanqing Cai channel_ = ::grpc::CreateCustomChannel( 68459ae0c0f9ac654bd668fb633feef3dbe26bae8eeShanqing Cai server_stream_addr_, ::grpc::InsecureChannelCredentials(), args); 6858b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai if (!channel_->WaitForConnected( 6868b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai gpr_time_add(gpr_now(GPR_CLOCK_REALTIME), 6878b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai gpr_time_from_micros(timeout_micros, GPR_TIMESPAN)))) { 6888b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai return errors::FailedPrecondition( 6898b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai "Failed to connect to gRPC channel at ", server_stream_addr_, 6908b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai " within a timeout of ", timeout_micros / 1e6, " s."); 6918b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai } 6928b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai stub_ = EventListener::NewStub(channel_); 6938b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai reader_writer_ = stub_->SendEvents(&ctx_); 694258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai 6958b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai return Status::OK(); 696ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai} 697ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 698ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Caibool DebugGrpcChannel::WriteEvent(const Event& event) { 699ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai mutex_lock l(mu_); 700ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return reader_writer_->Write(event); 701ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai} 702ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 7033c482c66b5a1f74875969e96834ff7564e829668Shanqing Caibool DebugGrpcChannel::ReadEventReply(EventReply* event_reply) { 704639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai mutex_lock l(mu_); 7053c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai return reader_writer_->Read(event_reply); 7063c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai} 7073c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai 708ecae82d1343df293fa36e67949e5404111817110Shanqing Caivoid DebugGrpcChannel::ReceiveAndProcessEventReplies(const size_t max_replies) { 709ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai EventReply event_reply; 710ecae82d1343df293fa36e67949e5404111817110Shanqing Cai size_t num_replies = 0; 711ecae82d1343df293fa36e67949e5404111817110Shanqing Cai while ((max_replies == 0 || ++num_replies <= max_replies) && 712ecae82d1343df293fa36e67949e5404111817110Shanqing Cai ReadEventReply(&event_reply)) { 713ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai for (const EventReply::DebugOpStateChange& debug_op_state_change : 714ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai event_reply.debug_op_state_changes()) { 715ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai string watch_key = strings::StrCat(debug_op_state_change.node_name(), ":", 716ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai debug_op_state_change.output_slot(), 717ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai ":", debug_op_state_change.debug_op()); 7183c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai DebugGrpcIO::SetDebugNodeKeyGrpcState(url_, watch_key, 7193c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai debug_op_state_change.state()); 720ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 721ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 722ecae82d1343df293fa36e67949e5404111817110Shanqing Cai} 723ecae82d1343df293fa36e67949e5404111817110Shanqing Cai 724ecae82d1343df293fa36e67949e5404111817110Shanqing CaiStatus DebugGrpcChannel::ReceiveServerRepliesAndClose() { 725ecae82d1343df293fa36e67949e5404111817110Shanqing Cai reader_writer_->WritesDone(); 726ecae82d1343df293fa36e67949e5404111817110Shanqing Cai // Read all EventReply messages (if any) from the server. 727ecae82d1343df293fa36e67949e5404111817110Shanqing Cai ReceiveAndProcessEventReplies(0); 728ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 729ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai if (reader_writer_->Finish().ok()) { 730ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return Status::OK(); 731ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } else { 732ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return Status(error::FAILED_PRECONDITION, 733ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai "Failed to close debug GRPC stream."); 734ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 735ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai} 736ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 737f93c48dc061d23495a4425fcad17d55159cb02b1A. Unique TensorFlowermutex DebugGrpcIO::streams_mu(LINKER_INITIALIZED); 738ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 7398b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Caiint64 DebugGrpcIO::channel_connection_timeout_micros = 900 * 1000 * 1000; 7408b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai// TODO(cais): Make this configurable? 7418b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai 7423f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Caiconst size_t DebugGrpcIO::kGrpcMessageSizeLimitBytes = 4000 * 1024; 7433f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 7443f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Caiconst size_t DebugGrpcIO::kGrpcMaxVarintLengthSize = 6; 7453f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 746639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Caistd::unordered_map<string, std::unique_ptr<DebugGrpcChannel>>* 747ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing CaiDebugGrpcIO::GetStreamChannels() { 748639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai static std::unordered_map<string, std::unique_ptr<DebugGrpcChannel>>* 749ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai stream_channels = 750639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai new std::unordered_map<string, std::unique_ptr<DebugGrpcChannel>>(); 751ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai return stream_channels; 752ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai} 753ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 754ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing CaiStatus DebugGrpcIO::SendTensorThroughGrpcStream( 755258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const DebugNodeKey& debug_node_key, const Tensor& tensor, 756258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const uint64 wall_time_us, const string& grpc_stream_url, 757258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const bool gated) { 7583c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai if (gated && 7593c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai !IsReadGateOpen(grpc_stream_url, debug_node_key.debug_node_name)) { 760ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai return Status::OK(); 761ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } else { 7623f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai std::vector<Event> events; 7633f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai TF_RETURN_IF_ERROR(WrapTensorAsEvents(debug_node_key, tensor, wall_time_us, 7643f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai kGrpcMessageSizeLimitBytes, &events)); 7653f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai for (const Event& event : events) { 7663f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai TF_RETURN_IF_ERROR( 7673f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai SendEventProtoThroughGrpcStream(event, grpc_stream_url)); 7683f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai } 7693c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai if (IsWriteGateOpen(grpc_stream_url, debug_node_key.debug_node_name)) { 770639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai DebugGrpcChannel* debug_grpc_channel = nullptr; 771639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai TF_RETURN_IF_ERROR( 772639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai GetOrCreateDebugGrpcChannel(grpc_stream_url, &debug_grpc_channel)); 773639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai debug_grpc_channel->ReceiveAndProcessEventReplies(1); 7743c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai // TODO(cais): Support new tensor value carried in the EventReply for 7753c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai // overriding the value of the tensor being published. 7763c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai } 7773c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai return Status::OK(); 7783c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai } 7793c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai} 7803c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai 7813c482c66b5a1f74875969e96834ff7564e829668Shanqing CaiStatus DebugGrpcIO::ReceiveEventReplyProtoThroughGrpcStream( 7823c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai EventReply* event_reply, const string& grpc_stream_url) { 783639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai DebugGrpcChannel* debug_grpc_channel = nullptr; 784639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai TF_RETURN_IF_ERROR( 785639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai GetOrCreateDebugGrpcChannel(grpc_stream_url, &debug_grpc_channel)); 7863c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai if (debug_grpc_channel->ReadEventReply(event_reply)) { 7873f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai return Status::OK(); 7883c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai } else { 7893c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai return errors::Cancelled(strings::StrCat( 7903c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai "Reading EventReply from stream URL ", grpc_stream_url, " failed.")); 791ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 7924a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai} 7934a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai 794639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing CaiStatus DebugGrpcIO::GetOrCreateDebugGrpcChannel( 795639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai const string& grpc_stream_url, DebugGrpcChannel** debug_grpc_channel) { 796aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai const string addr_with_path = 797258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai grpc_stream_url.find(DebugIO::kGrpcURLScheme) == 0 798258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai ? grpc_stream_url.substr(strlen(DebugIO::kGrpcURLScheme)) 799258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai : grpc_stream_url; 800aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai const string server_stream_addr = 801aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai addr_with_path.substr(0, addr_with_path.find('/')); 802ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai { 803ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai mutex_lock l(streams_mu); 804639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai std::unordered_map<string, std::unique_ptr<DebugGrpcChannel>>* 805ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai stream_channels = GetStreamChannels(); 806ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai if (stream_channels->find(grpc_stream_url) == stream_channels->end()) { 807639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai std::unique_ptr<DebugGrpcChannel> channel( 808639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai new DebugGrpcChannel(server_stream_addr)); 809639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai TF_RETURN_IF_ERROR(channel->Connect(channel_connection_timeout_micros)); 810639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai stream_channels->insert( 811639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai std::make_pair(grpc_stream_url, std::move(channel))); 812ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 813639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai *debug_grpc_channel = (*stream_channels)[grpc_stream_url].get(); 814ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 815639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai return Status::OK(); 816639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai} 817639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai 818639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing CaiStatus DebugGrpcIO::SendEventProtoThroughGrpcStream( 819639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai const Event& event_proto, const string& grpc_stream_url, 820639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai const bool receive_reply) { 821639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai DebugGrpcChannel* debug_grpc_channel; 822639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai TF_RETURN_IF_ERROR( 823639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai GetOrCreateDebugGrpcChannel(grpc_stream_url, &debug_grpc_channel)); 824ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 8254a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai bool write_ok = debug_grpc_channel->WriteEvent(event_proto); 826ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai if (!write_ok) { 827ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return errors::Cancelled(strings::StrCat("Write event to stream URL ", 8288b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai grpc_stream_url, " failed.")); 829ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 830ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 831ecae82d1343df293fa36e67949e5404111817110Shanqing Cai if (receive_reply) { 832ecae82d1343df293fa36e67949e5404111817110Shanqing Cai debug_grpc_channel->ReceiveAndProcessEventReplies(1); 833ecae82d1343df293fa36e67949e5404111817110Shanqing Cai } 834ecae82d1343df293fa36e67949e5404111817110Shanqing Cai 835ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return Status::OK(); 836ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai} 837ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 8383c482c66b5a1f74875969e96834ff7564e829668Shanqing Caibool DebugGrpcIO::IsReadGateOpen(const string& grpc_debug_url, 8393c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai const string& watch_key) { 8403c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai const DebugNodeName2State* enabled_node_to_state = 8413c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai GetEnabledDebugOpStatesAtUrl(grpc_debug_url); 8423c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai return enabled_node_to_state->find(watch_key) != enabled_node_to_state->end(); 8433c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai} 8443c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai 8453c482c66b5a1f74875969e96834ff7564e829668Shanqing Caibool DebugGrpcIO::IsWriteGateOpen(const string& grpc_debug_url, 8463c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai const string& watch_key) { 8473c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai const DebugNodeName2State* enabled_node_to_state = 8483c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai GetEnabledDebugOpStatesAtUrl(grpc_debug_url); 8493c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai auto it = enabled_node_to_state->find(watch_key); 8503c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai if (it == enabled_node_to_state->end()) { 851ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai return false; 852ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } else { 8533c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai return it->second == EventReply::DebugOpStateChange::READ_WRITE; 854ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 855ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai} 856ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 857aabc7972b94af5a678550427534d4fba7fda327cShanqing CaiStatus DebugGrpcIO::CloseGrpcStream(const string& grpc_stream_url) { 858ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai mutex_lock l(streams_mu); 859ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 860639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai std::unordered_map<string, std::unique_ptr<DebugGrpcChannel>>* 861ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai stream_channels = GetStreamChannels(); 862ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai if (stream_channels->find(grpc_stream_url) != stream_channels->end()) { 863ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai // Stream of the specified address exists. Close it and remove it from 864ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai // record. 865639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai Status s = 866639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai (*stream_channels)[grpc_stream_url]->ReceiveServerRepliesAndClose(); 867ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai (*stream_channels).erase(grpc_stream_url); 868ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return s; 869ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } else { 870ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai // Stream of the specified address does not exist. No action. 871ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return Status::OK(); 872ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 873ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai} 874ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 8753c482c66b5a1f74875969e96834ff7564e829668Shanqing Caistd::unordered_map<string, DebugGrpcIO::DebugNodeName2State>* 8763c482c66b5a1f74875969e96834ff7564e829668Shanqing CaiDebugGrpcIO::GetEnabledDebugOpStates() { 8773c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai static std::unordered_map<string, DebugNodeName2State>* 8783c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai enabled_debug_op_states = 8793c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai new std::unordered_map<string, DebugNodeName2State>(); 8803c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai return enabled_debug_op_states; 881ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai} 882ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 8833c482c66b5a1f74875969e96834ff7564e829668Shanqing CaiDebugGrpcIO::DebugNodeName2State* DebugGrpcIO::GetEnabledDebugOpStatesAtUrl( 8843c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai const string& grpc_debug_url) { 885c1f69be22e151e2d051f41fccf436767eee4a26aShanqing Cai static mutex* debug_ops_state_mu = new mutex(); 8863c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai std::unordered_map<string, DebugNodeName2State>* states = 8873c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai GetEnabledDebugOpStates(); 888c1f69be22e151e2d051f41fccf436767eee4a26aShanqing Cai 889c1f69be22e151e2d051f41fccf436767eee4a26aShanqing Cai mutex_lock l(*debug_ops_state_mu); 8903c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai if (states->find(grpc_debug_url) == states->end()) { 8913c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai DebugNodeName2State url_enabled_debug_op_states; 8923c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai (*states)[grpc_debug_url] = url_enabled_debug_op_states; 893ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 8943c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai return &(*states)[grpc_debug_url]; 895ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai} 896ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 8973c482c66b5a1f74875969e96834ff7564e829668Shanqing Caivoid DebugGrpcIO::SetDebugNodeKeyGrpcState( 8983c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai const string& grpc_debug_url, const string& watch_key, 8993c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai const EventReply::DebugOpStateChange::State new_state) { 9003c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai DebugNodeName2State* states = GetEnabledDebugOpStatesAtUrl(grpc_debug_url); 9013c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai if (new_state == EventReply::DebugOpStateChange::DISABLED) { 9023c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai if (states->find(watch_key) == states->end()) { 9033c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai LOG(ERROR) << "Attempt to disable a watch key that is not currently " 9043c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai << "enabled at " << grpc_debug_url << ": " << watch_key; 905ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } else { 9063c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai states->erase(watch_key); 907ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 9083c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai } else if (new_state != EventReply::DebugOpStateChange::STATE_UNSPECIFIED) { 9093c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai (*states)[watch_key] = new_state; 910ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 911ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai} 912ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 9133c482c66b5a1f74875969e96834ff7564e829668Shanqing Caivoid DebugGrpcIO::ClearEnabledWatchKeys() { 9143c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai GetEnabledDebugOpStates()->clear(); 915ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai} 916ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 91741803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#endif // #ifndef PLATFORM_WINDOWS 918ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 9199ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai} // namespace tensorflow 920