debug_io_utils.cc revision c1f69be22e151e2d051f41fccf436767eee4a26a
19ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 29ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 39ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing CaiLicensed under the Apache License, Version 2.0 (the "License"); 49ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Caiyou may not use this file except in compliance with the License. 59ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing CaiYou may obtain a copy of the License at 69ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 79ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai http://www.apache.org/licenses/LICENSE-2.0 89ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 99ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing CaiUnless required by applicable law or agreed to in writing, software 109ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Caidistributed under the License is distributed on an "AS IS" BASIS, 119ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing CaiWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 129ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing CaiSee the License for the specific language governing permissions and 139ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cailimitations under the License. 
149ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai==============================================================================*/ 159ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 169ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include "tensorflow/core/debug/debug_io_utils.h" 179ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 183c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai#include <stddef.h> 193c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai#include <string.h> 203c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai#include <cmath> 213c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai#include <limits> 223c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai#include <utility> 239ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include <vector> 249ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 2541803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS 26ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai#include "grpc++/create_channel.h" 2741803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#else 281cb96893a64f59b7265f9def9968f7bed1e57662Andrew Harp// winsock2.h is used in grpc, so Ws2_32.lib is needed 291cb96893a64f59b7265f9def9968f7bed1e57662Andrew Harp#pragma comment(lib,"Ws2_32.lib") 3041803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#endif // #ifndef PLATFORM_WINDOWS 311cb96893a64f59b7265f9def9968f7bed1e57662Andrew Harp 321e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. 
Unique TensorFlower#include "tensorflow/core/debug/debugger_event_metadata.pb.h" 33e85d3df92deb9d717befdf173966a2913ac2aea0Geoffrey Irving#include "tensorflow/core/framework/graph.pb.h" 349ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include "tensorflow/core/framework/summary.pb.h" 353f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai#include "tensorflow/core/lib/core/bits.h" 3612ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai#include "tensorflow/core/lib/hash/hash.h" 379ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include "tensorflow/core/lib/io/path.h" 389ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include "tensorflow/core/lib/strings/str_util.h" 399142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai#include "tensorflow/core/lib/strings/stringprintf.h" 401e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower#include "tensorflow/core/platform/protobuf.h" 419ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include "tensorflow/core/util/event.pb.h" 429ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 4341803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#define GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR \ 4441803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai return errors::Unimplemented( \ 4541803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai kGrpcURLScheme, " debug URL scheme is not implemented on Windows yet.") 46d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai 479ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cainamespace tensorflow { 489ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 499ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cainamespace { 509ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 513f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Creates an Event proto representing a chunk of a Tensor. 
This method only 523f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// populates the field of the Event proto that represent the envelope 533f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// informaion (e.g., timestmap, device_name, num_chunks, chunk_index, dtype, 543f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// shape). It does not set the value.tensor field, which should be set by the 553f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// caller separately. 563f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing CaiEvent PrepareChunkEventProto(const DebugNodeKey& debug_node_key, 573f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const uint64 wall_time_us, const size_t num_chunks, 583f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t chunk_index, 593f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const DataType& tensor_dtype, 603f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const TensorShapeProto& tensor_shape) { 619ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai Event event; 629ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai event.set_wall_time(static_cast<double>(wall_time_us)); 633f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai Summary::Value* value = event.mutable_summary()->add_value(); 649ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 659ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai // Create the debug node_name in the Summary proto. 669ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai // For example, if tensor_name = "foo/node_a:0", and the debug_op is 679ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai // "DebugIdentity", the debug node_name in the Summary proto will be 689ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai // "foo/node_a:0:DebugIdentity". 693f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai value->set_node_name(debug_node_key.debug_node_name); 701e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. 
Unique TensorFlower 713f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai // Tag by the node name. This allows TensorBoard to quickly fetch data 723f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai // per op. 733f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai value->set_tag(debug_node_key.node_name); 741e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower 751e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower // Store data within debugger metadata to be stored for each event. 761e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower third_party::tensorflow::core::debug::DebuggerEventMetadata metadata; 771e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower metadata.set_device(debug_node_key.device_name); 781e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower metadata.set_output_slot(debug_node_key.output_slot); 793f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai metadata.set_num_chunks(num_chunks); 803f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai metadata.set_chunk_index(chunk_index); 811e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower 821e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower // Encode the data in JSON. 831e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower string json_output; 841e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower tensorflow::protobuf::util::JsonPrintOptions json_options; 851e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower json_options.always_print_primitive_fields = true; 861e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower auto status = tensorflow::protobuf::util::MessageToJsonString( 871e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower metadata, &json_output, json_options); 881e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower if (status.ok()) { 891e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower // Store summary metadata. Set the plugin to use this data as "debugger". 
901e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower SummaryMetadata::PluginData* plugin_data = 914c60c96257bfd54a036d15af979e90fc0b4e400dA. Unique TensorFlower value->mutable_metadata()->mutable_plugin_data(); 923f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai plugin_data->set_plugin_name(DebugIO::kDebuggerPluginName); 931e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower plugin_data->set_content(json_output); 941e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower } else { 951e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower LOG(WARNING) << "Failed to convert DebuggerEventMetadata proto to JSON. " 961e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower << "The debug_node_name is " << debug_node_key.debug_node_name 971e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower << "."; 981e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower } 999ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 1003f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai value->mutable_tensor()->set_dtype(tensor_dtype); 1013f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai *value->mutable_tensor()->mutable_tensor_shape() = tensor_shape; 1023f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1033f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai return event; 1043f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai} 1053f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1063f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Translates the length of a string to number of bytes when the string is 1073f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// encoded as bytes in protobuf. 
// Note that this makes a conservative estimate
// (i.e., an estimate that is usually too large, but never too small under the
// gRPC message size limit) of the Varint-encoded length, to workaround the lack
// of a portable length function.
//
// On PLATFORM_GOOGLE builds the result is str.size() plus
// DebugGrpcIO::kGrpcMaxVarintLengthSize; on all other builds no slack is added
// and the result is exactly str.size().
size_t StringValMaxBytesInProto(const string& str) {
#if defined(PLATFORM_GOOGLE)
  return str.size() + DebugGrpcIO::kGrpcMaxVarintLengthSize;
#else
  return str.size();
#endif
}

// Breaks a string Tensor (represented as a TensorProto) as a vector of Event
// protos.
1213f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing CaiStatus WrapStringTensorAsEvents(const DebugNodeKey& debug_node_key, 1223f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const uint64 wall_time_us, 1233f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t chunk_size_limit, 1243f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai TensorProto* tensor_proto, 1253f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai std::vector<Event>* events) { 1263f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const protobuf::RepeatedPtrField<string>& strs = tensor_proto->string_val(); 1273f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t num_strs = strs.size(); 1283f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t chunk_size_ub = chunk_size_limit > 0 1293f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai ? chunk_size_limit 1303f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai : std::numeric_limits<size_t>::max(); 1313f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1323f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai // E.g., if cutoffs is {j, k, l}, the chunks will have index ranges: 1333f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai // [0:a), [a:b), [c:<end>]. 1343f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai std::vector<size_t> cutoffs; 1353f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai size_t chunk_size = 0; 1363f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai for (size_t i = 0; i < num_strs; ++i) { 1373f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai // Take into account the extra bytes in proto buffer. 
1383f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai if (StringValMaxBytesInProto(strs[i]) > chunk_size_ub) { 1393f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai return errors::FailedPrecondition( 1403f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai "string value at index ", i, " from debug node ", 1413f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai debug_node_key.debug_node_name, 1423f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai " does not fit gRPC message size limit (", chunk_size_ub, ")"); 1433f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai } 1443f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai if (chunk_size + StringValMaxBytesInProto(strs[i]) > chunk_size_ub) { 1453f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai cutoffs.push_back(i); 1463f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai chunk_size = 0; 1473f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai } 1483f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai chunk_size += StringValMaxBytesInProto(strs[i]); 1493f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai } 1503f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai cutoffs.push_back(num_strs); 1513f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t num_chunks = cutoffs.size(); 1523f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1533f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai for (size_t i = 0; i < num_chunks; ++i) { 1543f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai Event event = PrepareChunkEventProto(debug_node_key, wall_time_us, 1553f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai num_chunks, i, tensor_proto->dtype(), 1563f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai tensor_proto->tensor_shape()); 1573f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai Summary::Value* value = event.mutable_summary()->mutable_value(0); 1583f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1593f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai if (cutoffs.size() == 1) { 
1603f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai value->mutable_tensor()->mutable_string_val()->Swap( 1613f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai tensor_proto->mutable_string_val()); 1623f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai } else { 1633f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t begin = (i == 0) ? 0 : cutoffs[i - 1]; 1643f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t end = cutoffs[i]; 1653f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai for (size_t j = begin; j < end; ++j) { 1663f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai value->mutable_tensor()->add_string_val(strs[j]); 1673f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai } 1683f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai } 1693f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1703f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai events->push_back(std::move(event)); 1713f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai } 1723f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1733f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai return Status::OK(); 1743f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai} 1753f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1763f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Encapsulates the tensor value inside a vector of Event protos. Large tensors 1773f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// are broken up to multiple protos to fit the chunk_size_limit. In each Event 1783f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// proto the field summary.tensor carries the content of the tensor. 1793f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// If chunk_size_limit <= 0, the tensor will not be broken into chunks, i.e., a 1803f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// length-1 vector will be returned, regardless of the size of the tensor. 
1813f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing CaiStatus WrapTensorAsEvents(const DebugNodeKey& debug_node_key, 1823f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const Tensor& tensor, const uint64 wall_time_us, 1833f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t chunk_size_limit, 1843f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai std::vector<Event>* events) { 1853f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai TensorProto tensor_proto; 1869ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai if (tensor.dtype() == DT_STRING) { 1873f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai // Treat DT_STRING specially, so that tensor_util.MakeNdarray in Python can 1883f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai // convert the TensorProto to string-type numpy array. MakeNdarray does not 1893f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai // work with strings encoded by AsProtoTensorContent() in tensor_content. 1903f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai tensor.AsProtoField(&tensor_proto); 1913f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1923f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai TF_RETURN_IF_ERROR(WrapStringTensorAsEvents( 1933f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai debug_node_key, wall_time_us, chunk_size_limit, &tensor_proto, events)); 1949ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } else { 1953f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai tensor.AsProtoTensorContent(&tensor_proto); 1963f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 1973f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t total_length = tensor_proto.tensor_content().size(); 1983f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t chunk_size_ub = 1993f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai chunk_size_limit > 0 ? 
chunk_size_limit : total_length; 2003f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t num_chunks = 2013f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai (total_length == 0) 2023f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai ? 1 2033f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai : (total_length + chunk_size_ub - 1) / chunk_size_ub; 2043f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai for (size_t i = 0; i < num_chunks; ++i) { 2053f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t pos = i * chunk_size_ub; 2063f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t len = 2073f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai (i == num_chunks - 1) ? (total_length - pos) : chunk_size_ub; 2083f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai Event event = PrepareChunkEventProto(debug_node_key, wall_time_us, 2093f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai num_chunks, i, tensor_proto.dtype(), 2103f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai tensor_proto.tensor_shape()); 2113f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai event.mutable_summary() 2123f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai ->mutable_value(0) 2133f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai ->mutable_tensor() 2143f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai ->set_tensor_content(tensor_proto.tensor_content().substr(pos, len)); 2153f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai events->push_back(std::move(event)); 2163f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai } 2179ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 2189ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 2193f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai return Status::OK(); 2209ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai} 2219ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 2223f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Appends an underscore and a timestamp to a file path. 
If the path already 2239142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai// exists on the file system, append a hyphen and a 1-up index. Consecutive 2249142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai// values of the index will be tried until the first unused one is found. 2259142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai// TOCTOU race condition is not of concern here due to the fact that tfdbg 2269142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai// sets parallel_iterations attribute of all while_loops to 1 to prevent 2279142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai// the same node from between executed multiple times concurrently. 2289142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Caistring AppendTimestampToFilePath(const string& in, const uint64 timestamp) { 2299142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai string out = strings::StrCat(in, "_", timestamp); 2309142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai 2319142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai uint64 i = 1; 2329142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai while (Env::Default()->FileExists(out).ok()) { 2339142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai out = strings::StrCat(in, "_", timestamp, "-", i); 2349142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai ++i; 2359142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai } 2369142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai return out; 2379142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai} 2389142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai 23941803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS 2403f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Publishes encoded GraphDef through a gRPC debugger stream, in chunks, 2413f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// conforming to the gRPC message size limit. 
24212ac2f34fadc8802121382c64588d9f9c2f58390Shanqing CaiStatus PublishEncodedGraphDefInChunks(const string& encoded_graph_def, 24312ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai const string& device_name, 24412ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai const int64 wall_time, 24512ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai const string& debug_url) { 24612ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai const uint64 hash = ::tensorflow::Hash64(encoded_graph_def); 24712ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai const size_t total_length = encoded_graph_def.size(); 2483f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t num_chunks = 2493f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai static_cast<size_t>(std::ceil(static_cast<float>(total_length) / 2503f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai DebugGrpcIO::kGrpcMessageSizeLimitBytes)); 25112ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai for (size_t i = 0; i < num_chunks; ++i) { 2523f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t pos = i * DebugGrpcIO::kGrpcMessageSizeLimitBytes; 2533f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai const size_t len = (i == num_chunks - 1) 2543f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai ? (total_length - pos) 2553f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai : DebugGrpcIO::kGrpcMessageSizeLimitBytes; 25612ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai Event event; 25712ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai event.set_wall_time(static_cast<double>(wall_time)); 25812ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai // Prefix the chunk with 25912ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai // <hash64>,<device_name>,<wall_time>|<index>|<num_chunks>|. 2603f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai // TODO(cais): Use DebuggerEventMetadata to store device_name, num_chunks 2613f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai // and chunk_index, instead. 
26212ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai event.set_graph_def(strings::StrCat(hash, ",", device_name, ",", wall_time, 26312ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai "|", i, "|", num_chunks, "|", 26412ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai encoded_graph_def.substr(pos, len))); 26512ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai if (!DebugGrpcIO::SendEventProtoThroughGrpcStream(event, debug_url).ok()) { 26612ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai return errors::FailedPrecondition( 26712ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai "Failed to send chunk ", i, " of ", num_chunks, 26812ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai " of encoded GraphDef of size ", encoded_graph_def.size(), " bytes"); 26912ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai } 27012ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai } 27112ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai return Status::OK(); 27212ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai} 27341803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#endif // #ifndef PLATFORM_WINDOWS 27412ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai 2759ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai} // namespace 2769ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 277cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai// static 2783f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Caiconst char* const DebugIO::kDebuggerPluginName = "debugger"; 2793f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 2803f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// static 281cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Caiconst char* const DebugIO::kMetadataFilePrefix = "_tfdbg_"; 282cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai 283cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai// static 284cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Caiconst char* const DebugIO::kCoreMetadataTag = "core_metadata_"; 285cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai 
286cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai// static 287cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Caiconst char* const DebugIO::kDeviceTag = "device_"; 288cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai 289cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai// static 290cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Caiconst char* const DebugIO::kGraphTag = "graph_"; 291cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai 292525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Cai// static 293525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Caiconst char* const DebugIO::kHashTag = "hash"; 294525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Cai 295258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing CaiDebugNodeKey::DebugNodeKey(const string& device_name, const string& node_name, 296258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const int32 output_slot, const string& debug_op) 297258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai : device_name(device_name), 298258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai node_name(node_name), 299258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai output_slot(output_slot), 300258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai debug_op(debug_op), 301258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai debug_node_name( 302cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai strings::StrCat(node_name, ":", output_slot, ":", debug_op)), 303cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai device_path(DeviceNameToDevicePath(device_name)) {} 304258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai 3053c482c66b5a1f74875969e96834ff7564e829668Shanqing Caibool DebugNodeKey::operator==(const DebugNodeKey& other) const { 3063c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai return (device_name == other.device_name && node_name == other.node_name && 3073c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai output_slot == other.output_slot && debug_op == other.debug_op); 
3083c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai} 3093c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai 3103c482c66b5a1f74875969e96834ff7564e829668Shanqing Caibool DebugNodeKey::operator!=(const DebugNodeKey& other) const { 3113c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai return !((*this) == other); 3123c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai} 3133c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai 314ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing CaiStatus ReadEventFromFile(const string& dump_file_path, Event* event) { 315ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai Env* env(Env::Default()); 316ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 317ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai string content; 318ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai uint64 file_size = 0; 319ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 320ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai Status s = env->GetFileSize(dump_file_path, &file_size); 321ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai if (!s.ok()) { 322ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return s; 323ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 324ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 325ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai content.resize(file_size); 326ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 327ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai std::unique_ptr<RandomAccessFile> file; 328ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai s = env->NewRandomAccessFile(dump_file_path, &file); 329ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai if (!s.ok()) { 330ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return s; 331ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 332ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 333ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai StringPiece result; 334ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai s = 
file->Read(0, file_size, &result, &(content)[0]); 335ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai if (!s.ok()) { 336ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return s; 337ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 338ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 339ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai event->ParseFromString(content); 340ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return Status::OK(); 341ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai} 342ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 3439ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai// static 344cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Caiconst string DebugNodeKey::DeviceNameToDevicePath(const string& device_name) { 345cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai return strings::StrCat( 346cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai DebugIO::kMetadataFilePrefix, DebugIO::kDeviceTag, 347cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai str_util::StringReplace( 348cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai str_util::StringReplace(device_name, ":", "_", true), "/", ",", 349cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai true)); 350cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai} 351cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai 352cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai// static 3539ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Caiconst char* const DebugIO::kFileURLScheme = "file://"; 3549ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai// static 3559ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Caiconst char* const DebugIO::kGrpcURLScheme = "grpc://"; 3569ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 3573f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Publishes debug metadata to a set of debug URLs. 
3584a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai// static 359aabc7972b94af5a678550427534d4fba7fda327cShanqing CaiStatus DebugIO::PublishDebugMetadata( 360258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const int64 global_step, const int64 session_run_index, 361258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const int64 executor_step_index, const std::vector<string>& input_names, 362aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai const std::vector<string>& output_names, 363aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai const std::vector<string>& target_nodes, 364aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai const std::unordered_set<string>& debug_urls) { 365aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai std::ostringstream oss; 366aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai 367aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai // Construct a JSON string to carry the metadata. 368aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "{"; 369aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "\"global_step\":" << global_step << ","; 370258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai oss << "\"session_run_index\":" << session_run_index << ","; 371258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai oss << "\"executor_step_index\":" << executor_step_index << ","; 372aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "\"input_names\":["; 373aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai for (size_t i = 0; i < input_names.size(); ++i) { 374aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "\"" << input_names[i] << "\""; 375aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai if (i < input_names.size() - 1) { 376aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << ","; 377aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai } 378aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai } 379aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "],"; 
380aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "\"output_names\":["; 381aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai for (size_t i = 0; i < output_names.size(); ++i) { 382aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "\"" << output_names[i] << "\""; 383aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai if (i < output_names.size() - 1) { 384aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << ","; 385aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai } 386aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai } 387aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "],"; 388aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "\"target_nodes\":["; 389aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai for (size_t i = 0; i < target_nodes.size(); ++i) { 390aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "\"" << target_nodes[i] << "\""; 391aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai if (i < target_nodes.size() - 1) { 392aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << ","; 393aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai } 394aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai } 395aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "]"; 396aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai oss << "}"; 397aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai 398aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai const string json_metadata = oss.str(); 399aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai Event event; 400aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai event.set_wall_time(static_cast<double>(Env::Default()->NowMicros())); 401aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai LogMessage* log_message = event.mutable_log_message(); 402aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai log_message->set_message(json_metadata); 403aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai 404aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai 
Status status; 405aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai for (const string& url : debug_urls) { 406aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai if (str_util::Lowercase(url).find(kGrpcURLScheme) == 0) { 40741803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS 408aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai Event grpc_event; 409aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai 410aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai // Determine the path (if any) in the grpc:// URL, and add it as a field 411aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai // of the JSON string. 412aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai const string address = url.substr(strlen(DebugIO::kFileURLScheme)); 413aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai const string path = address.find("/") == string::npos 414aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai ? "" 415aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai : address.substr(address.find("/")); 416aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai grpc_event.set_wall_time(event.wall_time()); 417aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai LogMessage* log_message_grpc = grpc_event.mutable_log_message(); 418aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai log_message_grpc->set_message( 419aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai strings::StrCat(json_metadata.substr(0, json_metadata.size() - 1), 420aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai ",\"grpc_path\":\"", path, "\"}")); 421aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai 422aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai status.Update( 423aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai DebugGrpcIO::SendEventProtoThroughGrpcStream(grpc_event, url)); 424d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#else 42541803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR; 
426d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#endif 427aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai } else if (str_util::Lowercase(url).find(kFileURLScheme) == 0) { 428aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai const string dump_root_dir = url.substr(strlen(kFileURLScheme)); 4299142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai const string core_metadata_path = AppendTimestampToFilePath( 4309142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai io::JoinPath( 4319142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai dump_root_dir, 432cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai strings::StrCat(DebugIO::kMetadataFilePrefix, 433cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai DebugIO::kCoreMetadataTag, "sessionrun", 434258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai strings::Printf("%.14lld", session_run_index))), 4359142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai Env::Default()->NowMicros()); 4369142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai status.Update(DebugFileIO::DumpEventProtoToFile( 4379142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai event, io::Dirname(core_metadata_path).ToString(), 4389142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai io::Basename(core_metadata_path).ToString())); 439aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai } 440aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai } 441aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai 442aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai return status; 443aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai} 444aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai 445aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai// static 446258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing CaiStatus DebugIO::PublishDebugTensor(const DebugNodeKey& debug_node_key, 447258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const Tensor& tensor, 4489ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai const uint64 wall_time_us, 
449ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai const gtl::ArraySlice<string>& debug_urls, 450ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai const bool gated_grpc) { 4518b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai int32 num_failed_urls = 0; 452ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai std::vector<Status> fail_statuses; 4539ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai for (const string& url : debug_urls) { 4549ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai if (str_util::Lowercase(url).find(kFileURLScheme) == 0) { 4559ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai const string dump_root_dir = url.substr(strlen(kFileURLScheme)); 4569ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 457258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai Status s = DebugFileIO::DumpTensorToDir( 458258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai debug_node_key, tensor, wall_time_us, dump_root_dir, nullptr); 4599ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai if (!s.ok()) { 4609ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai num_failed_urls++; 461ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai fail_statuses.push_back(s); 4629ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 4639ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } else if (str_util::Lowercase(url).find(kGrpcURLScheme) == 0) { 46441803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS 465ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai Status s = DebugGrpcIO::SendTensorThroughGrpcStream( 466258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai debug_node_key, tensor, wall_time_us, url, gated_grpc); 467ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 468ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai if (!s.ok()) { 469ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai num_failed_urls++; 470ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai fail_statuses.push_back(s); 
471ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 472d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#else 47341803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR; 474d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#endif 4759ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } else { 4769ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai return Status(error::UNAVAILABLE, 4779ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai strings::StrCat("Invalid debug target URL: ", url)); 4789ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 4799ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 4809ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 4819ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai if (num_failed_urls == 0) { 4829ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai return Status::OK(); 4839ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } else { 484ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai string error_message = strings::StrCat( 485ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai "Publishing to ", num_failed_urls, " of ", debug_urls.size(), 486ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai " debug target URLs failed, due to the following errors:"); 487ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai for (Status& status : fail_statuses) { 488ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai error_message = 489ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai strings::StrCat(error_message, " ", status.error_message(), ";"); 490ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 491ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 492ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return Status(error::INTERNAL, error_message); 4939ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 4949ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai} 4959ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 4964a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing 
Cai// static 497258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing CaiStatus DebugIO::PublishDebugTensor(const DebugNodeKey& debug_node_key, 498258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const Tensor& tensor, 499ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai const uint64 wall_time_us, 500ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai const gtl::ArraySlice<string>& debug_urls) { 501258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai return PublishDebugTensor(debug_node_key, tensor, wall_time_us, debug_urls, 502258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai false); 503ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai} 504ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 505ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai// static 50612ac2f34fadc8802121382c64588d9f9c2f58390Shanqing CaiStatus DebugIO::PublishGraph(const Graph& graph, const string& device_name, 5074a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai const std::unordered_set<string>& debug_urls) { 5084a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai GraphDef graph_def; 5094a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai graph.ToGraphDef(&graph_def); 5104a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai 5114a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai string buf; 5124a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai graph_def.SerializeToString(&buf); 5134a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai 5144a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai const int64 now_micros = Env::Default()->NowMicros(); 5154a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai Event event; 5164a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai event.set_wall_time(static_cast<double>(now_micros)); 5174a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai event.set_graph_def(buf); 5184a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai 5194a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai Status status = Status::OK(); 
5204a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai for (const string& debug_url : debug_urls) { 5214a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai if (debug_url.find(kFileURLScheme) == 0) { 522cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai const string dump_root_dir = 523cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai io::JoinPath(debug_url.substr(strlen(kFileURLScheme)), 524cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai DebugNodeKey::DeviceNameToDevicePath(device_name)); 525525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Cai const uint64 graph_hash = ::tensorflow::Hash64(buf); 526525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Cai const string file_name = 527525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Cai strings::StrCat(DebugIO::kMetadataFilePrefix, DebugIO::kGraphTag, 528525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Cai DebugIO::kHashTag, graph_hash, "_", now_micros); 5294a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai 5304a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai status.Update( 5314a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai DebugFileIO::DumpEventProtoToFile(event, dump_root_dir, file_name)); 5324a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai } else if (debug_url.find(kGrpcURLScheme) == 0) { 53341803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS 53412ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai status.Update(PublishEncodedGraphDefInChunks(buf, device_name, now_micros, 53512ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai debug_url)); 536d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#else 53741803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR; 538d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#endif 5394a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai } 5404a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai } 5414a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai 5424a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing 
Cai return status; 5434a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai} 5444a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai 5454a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai// static 546617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Caibool DebugIO::IsCopyNodeGateOpen( 547617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai const std::vector<DebugWatchAndURLSpec>& specs) { 54841803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS 549617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai for (const DebugWatchAndURLSpec& spec : specs) { 550617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai if (!spec.gated_grpc || spec.url.compare(0, strlen(DebugIO::kGrpcURLScheme), 551617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai DebugIO::kGrpcURLScheme)) { 552617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai return true; 553617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai } else { 5543c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai if (DebugGrpcIO::IsReadGateOpen(spec.url, spec.watch_key)) { 555617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai return true; 556617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai } 557617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai } 558617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai } 559617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai return false; 560617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai#else 561617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai return true; 562617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai#endif 563617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai} 564617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai 565617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai// static 566ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Caibool DebugIO::IsDebugNodeGateOpen(const string& watch_key, 567ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai const std::vector<string>& debug_urls) { 56841803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing 
Cai#ifndef PLATFORM_WINDOWS 569ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai for (const string& debug_url : debug_urls) { 570617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai if (debug_url.compare(0, strlen(DebugIO::kGrpcURLScheme), 571617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai DebugIO::kGrpcURLScheme)) { 572ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai return true; 573ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } else { 5743c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai if (DebugGrpcIO::IsReadGateOpen(debug_url, watch_key)) { 575ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai return true; 576ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 577ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 578ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 579ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai return false; 580ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai#else 581ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai return true; 582ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai#endif 583ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai} 584ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 585ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai// static 586ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Caibool DebugIO::IsDebugURLGateOpen(const string& watch_key, 587ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai const string& debug_url) { 58841803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS 589ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai if (debug_url.find(kGrpcURLScheme) != 0) { 590ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai return true; 591ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } else { 5923c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai return DebugGrpcIO::IsReadGateOpen(debug_url, watch_key); 593ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 594ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai#else 
595ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai return true; 596ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai#endif 597ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai} 598ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 599ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai// static 600ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing CaiStatus DebugIO::CloseDebugURL(const string& debug_url) { 601ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai if (debug_url.find(DebugIO::kGrpcURLScheme) == 0) { 60241803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS 603aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai return DebugGrpcIO::CloseGrpcStream(debug_url); 604d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#else 60541803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR; 606d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#endif 607ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } else { 608ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai // No-op for non-gRPC URLs. 
609ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return Status::OK(); 610ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 611ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai} 612ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 613ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai// static 614ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Caistatic Status CloseDebugURL(const string& debug_url) { return Status::OK(); } 615ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 6169ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai// static 617258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing CaiStatus DebugFileIO::DumpTensorToDir(const DebugNodeKey& debug_node_key, 618258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const Tensor& tensor, 619258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const uint64 wall_time_us, 620258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const string& dump_root_dir, 621258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai string* dump_file_path) { 622258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const string file_path = 623258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai GetDumpFilePath(dump_root_dir, debug_node_key, wall_time_us); 6249ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 6259ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai if (dump_file_path != nullptr) { 6269ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai *dump_file_path = file_path; 6279ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 6289ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 629258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai return DumpTensorToEventFile(debug_node_key, tensor, wall_time_us, file_path); 6309ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai} 6319ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 6329ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai// static 6339ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Caistring DebugFileIO::GetDumpFilePath(const string& dump_root_dir, 
634258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const DebugNodeKey& debug_node_key, 6359ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai const uint64 wall_time_us) { 6369142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai return AppendTimestampToFilePath( 637cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai io::JoinPath(dump_root_dir, debug_node_key.device_path, 638258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai strings::StrCat(debug_node_key.node_name, "_", 639258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai debug_node_key.output_slot, "_", 640258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai debug_node_key.debug_op)), 6419142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai wall_time_us); 6429ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai} 6439ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 6449ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai// static 6454a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing CaiStatus DebugFileIO::DumpEventProtoToFile(const Event& event_proto, 6464a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai const string& dir_name, 6474a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai const string& file_name) { 6489ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai Env* env(Env::Default()); 6499ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 6504a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai Status s = RecursiveCreateDir(env, dir_name); 6519ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai if (!s.ok()) { 6529ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai return Status(error::FAILED_PRECONDITION, 6534a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai strings::StrCat("Failed to create directory ", dir_name, 6549ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai ", due to: ", s.error_message())); 6559ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 6569ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 6574a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai const string file_path = 
io::JoinPath(dir_name, file_name); 6589ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 6599ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai string event_str; 6604a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai event_proto.SerializeToString(&event_str); 6619ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 6629ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai std::unique_ptr<WritableFile> f = nullptr; 6639ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai TF_CHECK_OK(env->NewWritableFile(file_path, &f)); 664bc225bfaa534acc25047fe844f19edc333b7a76aPeter Hawkins f->Append(event_str).IgnoreError(); 6659ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai TF_CHECK_OK(f->Close()); 6669ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 6679ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai return Status::OK(); 6689ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai} 6699ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 6709ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai// static 671258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing CaiStatus DebugFileIO::DumpTensorToEventFile(const DebugNodeKey& debug_node_key, 672258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const Tensor& tensor, 673258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const uint64 wall_time_us, 674258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const string& file_path) { 6753f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai std::vector<Event> events; 6763f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai TF_RETURN_IF_ERROR( 6773f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai WrapTensorAsEvents(debug_node_key, tensor, wall_time_us, 0, &events)); 6783f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai return DumpEventProtoToFile(events[0], io::Dirname(file_path).ToString(), 6793f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai io::Basename(file_path).ToString()); 6804a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai} 
6814a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai 6824a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai// static 6839ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing CaiStatus DebugFileIO::RecursiveCreateDir(Env* env, const string& dir) { 684879e0accd1c833771c8058d3eb5f2d4f06f895d4Jonathan Hseu if (env->FileExists(dir).ok() && env->IsDirectory(dir).ok()) { 6859ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai // The path already exists as a directory. Return OK right away. 6869ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai return Status::OK(); 6879ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 6889ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 6899ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai string parent_dir = io::Dirname(dir).ToString(); 690879e0accd1c833771c8058d3eb5f2d4f06f895d4Jonathan Hseu if (!env->FileExists(parent_dir).ok()) { 6919ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai // The parent path does not exist yet, create it first. 6929ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai Status s = RecursiveCreateDir(env, parent_dir); // Recursive call 6939ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai if (!s.ok()) { 6949ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai return Status( 6959ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai error::FAILED_PRECONDITION, 6969ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai strings::StrCat("Failed to create directory ", parent_dir)); 6979ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 698879e0accd1c833771c8058d3eb5f2d4f06f895d4Jonathan Hseu } else if (env->FileExists(parent_dir).ok() && 6999ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai !env->IsDirectory(parent_dir).ok()) { 7009ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai // The path exists, but it is a file. 
7019ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai return Status(error::FAILED_PRECONDITION, 7029ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai strings::StrCat("Failed to create directory ", parent_dir, 7039ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai " because the path exists as a file ")); 7049ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 7059ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 706bc225bfaa534acc25047fe844f19edc333b7a76aPeter Hawkins env->CreateDir(dir).IgnoreError(); 7079ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai // Guard against potential race in creating directories by doing a check 7089ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai // after the CreateDir call. 709879e0accd1c833771c8058d3eb5f2d4f06f895d4Jonathan Hseu if (env->FileExists(dir).ok() && env->IsDirectory(dir).ok()) { 7109ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai return Status::OK(); 7119ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } else { 7129ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai return Status(error::ABORTED, 7139ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai strings::StrCat("Failed to create directory ", parent_dir)); 7149ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai } 7159ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai} 7169ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai 71741803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS 718ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing CaiDebugGrpcChannel::DebugGrpcChannel(const string& server_stream_addr) 7198b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai : server_stream_addr_(server_stream_addr), 7208b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai url_(strings::StrCat(DebugIO::kGrpcURLScheme, server_stream_addr)) {} 7218b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai 7228b219918214f779b0f4c7785ae93feffa6e492c3Shanqing CaiStatus DebugGrpcChannel::Connect(const int64 timeout_micros) { 
7238b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai ::grpc::ChannelArguments args; 7248b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max()); 7258b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai // Avoid problems where default reconnect backoff is too long (e.g., 20 s). 7268b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000); 72759ae0c0f9ac654bd668fb633feef3dbe26bae8eeShanqing Cai channel_ = ::grpc::CreateCustomChannel( 72859ae0c0f9ac654bd668fb633feef3dbe26bae8eeShanqing Cai server_stream_addr_, ::grpc::InsecureChannelCredentials(), args); 7298b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai if (!channel_->WaitForConnected( 7308b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai gpr_time_add(gpr_now(GPR_CLOCK_REALTIME), 7318b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai gpr_time_from_micros(timeout_micros, GPR_TIMESPAN)))) { 7328b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai return errors::FailedPrecondition( 7338b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai "Failed to connect to gRPC channel at ", server_stream_addr_, 7348b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai " within a timeout of ", timeout_micros / 1e6, " s."); 7358b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai } 7368b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai stub_ = EventListener::NewStub(channel_); 7378b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai reader_writer_ = stub_->SendEvents(&ctx_); 738258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai 7398b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai return Status::OK(); 740ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai} 741ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 742ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Caibool DebugGrpcChannel::WriteEvent(const Event& event) { 743ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai mutex_lock l(mu_); 
744ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 745ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return reader_writer_->Write(event); 746ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai} 747ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 7483c482c66b5a1f74875969e96834ff7564e829668Shanqing Caibool DebugGrpcChannel::ReadEventReply(EventReply* event_reply) { 7493c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai mutex_lock l(mu_); 7503c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai 7513c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai return reader_writer_->Read(event_reply); 7523c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai} 7533c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai 754ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing CaiStatus DebugGrpcChannel::ReceiveServerRepliesAndClose() { 755ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai mutex_lock l(mu_); 756ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 757ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai reader_writer_->WritesDone(); 758ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 759ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai // Read all EventReply messages (if any) from the server. 
760ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai EventReply event_reply; 761ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai while (reader_writer_->Read(&event_reply)) { 762ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai for (const EventReply::DebugOpStateChange& debug_op_state_change : 763ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai event_reply.debug_op_state_changes()) { 764ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai string watch_key = strings::StrCat(debug_op_state_change.node_name(), ":", 765ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai debug_op_state_change.output_slot(), 766ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai ":", debug_op_state_change.debug_op()); 7673c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai DebugGrpcIO::SetDebugNodeKeyGrpcState(url_, watch_key, 7683c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai debug_op_state_change.state()); 769ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 770ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 771ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 772ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai if (reader_writer_->Finish().ok()) { 773ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return Status::OK(); 774ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } else { 775ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return Status(error::FAILED_PRECONDITION, 776ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai "Failed to close debug GRPC stream."); 777ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 778ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai} 779ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 780ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai// static 781ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Caimutex DebugGrpcIO::streams_mu; 782ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 7838b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai// static 
7848b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Caiint64 DebugGrpcIO::channel_connection_timeout_micros = 900 * 1000 * 1000; 7858b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai// TODO(cais): Make this configurable? 7868b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai 7878b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai// static 7883f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Caiconst size_t DebugGrpcIO::kGrpcMessageSizeLimitBytes = 4000 * 1024; 7893f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 7903f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// static 7913f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Caiconst size_t DebugGrpcIO::kGrpcMaxVarintLengthSize = 6; 7923f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai 7933f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// static 794ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Caistd::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>* 795ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing CaiDebugGrpcIO::GetStreamChannels() { 796ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai static std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>* 797ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai stream_channels = 798ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai new std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>(); 799ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai return stream_channels; 800ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai} 801ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 802ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai// static 803ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing CaiStatus DebugGrpcIO::SendTensorThroughGrpcStream( 804258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const DebugNodeKey& debug_node_key, const Tensor& tensor, 805258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai const uint64 wall_time_us, const string& grpc_stream_url, 806258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai 
const bool gated) { 8073c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai if (gated && 8083c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai !IsReadGateOpen(grpc_stream_url, debug_node_key.debug_node_name)) { 809ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai return Status::OK(); 810ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } else { 8113f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai std::vector<Event> events; 8123f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai TF_RETURN_IF_ERROR(WrapTensorAsEvents(debug_node_key, tensor, wall_time_us, 8133f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai kGrpcMessageSizeLimitBytes, &events)); 8143f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai for (const Event& event : events) { 8153f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai TF_RETURN_IF_ERROR( 8163f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai SendEventProtoThroughGrpcStream(event, grpc_stream_url)); 8173f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai } 8183c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai if (IsWriteGateOpen(grpc_stream_url, debug_node_key.debug_node_name)) { 8193c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai EventReply event_reply; 8203c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai TF_RETURN_IF_ERROR(ReceiveEventReplyProtoThroughGrpcStream( 8213c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai &event_reply, grpc_stream_url)); 8223c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai // TODO(cais): Support new tensor value carried in the EventReply for 8233c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai // overriding the value of the tensor being published. 
8243c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai } 8253c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai return Status::OK(); 8263c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai } 8273c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai} 8283c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai 8293c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai// static 8303c482c66b5a1f74875969e96834ff7564e829668Shanqing CaiStatus DebugGrpcIO::ReceiveEventReplyProtoThroughGrpcStream( 8313c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai EventReply* event_reply, const string& grpc_stream_url) { 8323c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai std::shared_ptr<DebugGrpcChannel> debug_grpc_channel; 8333c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai { 8343c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai mutex_lock l(streams_mu); 8353c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>* 8363c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai stream_channels = GetStreamChannels(); 8373c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai debug_grpc_channel = (*stream_channels)[grpc_stream_url]; 8383c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai } 8393c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai if (debug_grpc_channel->ReadEventReply(event_reply)) { 8403f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai return Status::OK(); 8413c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai } else { 8423c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai return errors::Cancelled(strings::StrCat( 8433c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai "Reading EventReply from stream URL ", grpc_stream_url, " failed.")); 844ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 8454a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai} 8464a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai 8474a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai// static 
8484a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing CaiStatus DebugGrpcIO::SendEventProtoThroughGrpcStream( 849aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai const Event& event_proto, const string& grpc_stream_url) { 850aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai const string addr_with_path = 851258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai grpc_stream_url.find(DebugIO::kGrpcURLScheme) == 0 852258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai ? grpc_stream_url.substr(strlen(DebugIO::kGrpcURLScheme)) 853258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai : grpc_stream_url; 854aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai const string server_stream_addr = 855aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai addr_with_path.substr(0, addr_with_path.find('/')); 856ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai std::shared_ptr<DebugGrpcChannel> debug_grpc_channel; 857ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai { 858ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai mutex_lock l(streams_mu); 859ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>* 860ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai stream_channels = GetStreamChannels(); 861ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai if (stream_channels->find(grpc_stream_url) == stream_channels->end()) { 862ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai debug_grpc_channel.reset(new DebugGrpcChannel(server_stream_addr)); 8638b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai TF_RETURN_IF_ERROR( 8648b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai debug_grpc_channel->Connect(channel_connection_timeout_micros)); 865ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai (*stream_channels)[grpc_stream_url] = debug_grpc_channel; 866ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } else { 867ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai debug_grpc_channel = 
(*stream_channels)[grpc_stream_url]; 868ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 869ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 870ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 8714a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai bool write_ok = debug_grpc_channel->WriteEvent(event_proto); 872ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai if (!write_ok) { 873ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return errors::Cancelled(strings::StrCat("Write event to stream URL ", 8748b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai grpc_stream_url, " failed.")); 875ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 876ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 877ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return Status::OK(); 878ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai} 879ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 8803c482c66b5a1f74875969e96834ff7564e829668Shanqing Caibool DebugGrpcIO::IsReadGateOpen(const string& grpc_debug_url, 8813c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai const string& watch_key) { 8823c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai const DebugNodeName2State* enabled_node_to_state = 8833c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai GetEnabledDebugOpStatesAtUrl(grpc_debug_url); 8843c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai return enabled_node_to_state->find(watch_key) != enabled_node_to_state->end(); 8853c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai} 8863c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai 8873c482c66b5a1f74875969e96834ff7564e829668Shanqing Caibool DebugGrpcIO::IsWriteGateOpen(const string& grpc_debug_url, 8883c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai const string& watch_key) { 8893c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai const DebugNodeName2State* enabled_node_to_state = 8903c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai GetEnabledDebugOpStatesAtUrl(grpc_debug_url); 
8913c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai auto it = enabled_node_to_state->find(watch_key); 8923c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai if (it == enabled_node_to_state->end()) { 893ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai return false; 894ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } else { 8953c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai return it->second == EventReply::DebugOpStateChange::READ_WRITE; 896ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 897ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai} 898ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 899ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai// static 900aabc7972b94af5a678550427534d4fba7fda327cShanqing CaiStatus DebugGrpcIO::CloseGrpcStream(const string& grpc_stream_url) { 901ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai mutex_lock l(streams_mu); 902ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 903ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>* 904ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai stream_channels = GetStreamChannels(); 905ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai if (stream_channels->find(grpc_stream_url) != stream_channels->end()) { 906ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai // Stream of the specified address exists. Close it and remove it from 907ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai // record. 
908ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai Status s; 909ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai s = (*stream_channels)[grpc_stream_url]->ReceiveServerRepliesAndClose(); 910ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai (*stream_channels).erase(grpc_stream_url); 911ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return s; 912ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } else { 913ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai // Stream of the specified address does not exist. No action. 914ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai return Status::OK(); 915ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai } 916ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai} 917ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 918ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai// static 9193c482c66b5a1f74875969e96834ff7564e829668Shanqing Caistd::unordered_map<string, DebugGrpcIO::DebugNodeName2State>* 9203c482c66b5a1f74875969e96834ff7564e829668Shanqing CaiDebugGrpcIO::GetEnabledDebugOpStates() { 9213c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai static std::unordered_map<string, DebugNodeName2State>* 9223c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai enabled_debug_op_states = 9233c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai new std::unordered_map<string, DebugNodeName2State>(); 9243c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai return enabled_debug_op_states; 925ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai} 926ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 927ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai// static 9283c482c66b5a1f74875969e96834ff7564e829668Shanqing CaiDebugGrpcIO::DebugNodeName2State* DebugGrpcIO::GetEnabledDebugOpStatesAtUrl( 9293c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai const string& grpc_debug_url) { 930c1f69be22e151e2d051f41fccf436767eee4a26aShanqing Cai static mutex* debug_ops_state_mu = new mutex(); 
9313c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai std::unordered_map<string, DebugNodeName2State>* states = 9323c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai GetEnabledDebugOpStates(); 933c1f69be22e151e2d051f41fccf436767eee4a26aShanqing Cai 934c1f69be22e151e2d051f41fccf436767eee4a26aShanqing Cai mutex_lock l(*debug_ops_state_mu); 9353c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai if (states->find(grpc_debug_url) == states->end()) { 9363c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai DebugNodeName2State url_enabled_debug_op_states; 9373c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai (*states)[grpc_debug_url] = url_enabled_debug_op_states; 938ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 9393c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai return &(*states)[grpc_debug_url]; 940ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai} 941ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 942ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai// static 9433c482c66b5a1f74875969e96834ff7564e829668Shanqing Caivoid DebugGrpcIO::SetDebugNodeKeyGrpcState( 9443c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai const string& grpc_debug_url, const string& watch_key, 9453c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai const EventReply::DebugOpStateChange::State new_state) { 9463c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai DebugNodeName2State* states = GetEnabledDebugOpStatesAtUrl(grpc_debug_url); 9473c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai if (new_state == EventReply::DebugOpStateChange::DISABLED) { 9483c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai if (states->find(watch_key) == states->end()) { 9493c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai LOG(ERROR) << "Attempt to disable a watch key that is not currently " 9503c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai << "enabled at " << grpc_debug_url << ": " << watch_key; 951ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } else { 
9523c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai states->erase(watch_key); 953ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 9543c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai } else if (new_state != EventReply::DebugOpStateChange::STATE_UNSPECIFIED) { 9553c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai (*states)[watch_key] = new_state; 956ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai } 957ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai} 958ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 959ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai// static 9603c482c66b5a1f74875969e96834ff7564e829668Shanqing Caivoid DebugGrpcIO::ClearEnabledWatchKeys() { 9613c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai GetEnabledDebugOpStates()->clear(); 962ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai} 963ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai 96441803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#endif // #ifndef PLATFORM_WINDOWS 965ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai 9669ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai} // namespace tensorflow 967