debug_io_utils.cc revision c1f69be22e151e2d051f41fccf436767eee4a26a
19ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
29ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
39ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing CaiLicensed under the Apache License, Version 2.0 (the "License");
49ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Caiyou may not use this file except in compliance with the License.
59ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing CaiYou may obtain a copy of the License at
69ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
79ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    http://www.apache.org/licenses/LICENSE-2.0
89ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
99ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing CaiUnless required by applicable law or agreed to in writing, software
109ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Caidistributed under the License is distributed on an "AS IS" BASIS,
119ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing CaiWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
129ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing CaiSee the License for the specific language governing permissions and
139ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cailimitations under the License.
149ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai==============================================================================*/
159ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
169ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include "tensorflow/core/debug/debug_io_utils.h"
179ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
183c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai#include <stddef.h>
193c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai#include <string.h>
203c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai#include <cmath>
213c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai#include <limits>
223c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai#include <utility>
239ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include <vector>
249ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
2541803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS
26ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai#include "grpc++/create_channel.h"
2741803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#else
281cb96893a64f59b7265f9def9968f7bed1e57662Andrew Harp// winsock2.h is used in grpc, so Ws2_32.lib is needed
291cb96893a64f59b7265f9def9968f7bed1e57662Andrew Harp#pragma comment(lib,"Ws2_32.lib")
3041803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#endif  // #ifndef PLATFORM_WINDOWS
311cb96893a64f59b7265f9def9968f7bed1e57662Andrew Harp
321e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower#include "tensorflow/core/debug/debugger_event_metadata.pb.h"
33e85d3df92deb9d717befdf173966a2913ac2aea0Geoffrey Irving#include "tensorflow/core/framework/graph.pb.h"
349ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include "tensorflow/core/framework/summary.pb.h"
353f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai#include "tensorflow/core/lib/core/bits.h"
3612ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai#include "tensorflow/core/lib/hash/hash.h"
379ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include "tensorflow/core/lib/io/path.h"
389ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include "tensorflow/core/lib/strings/str_util.h"
399142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai#include "tensorflow/core/lib/strings/stringprintf.h"
401e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower#include "tensorflow/core/platform/protobuf.h"
419ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include "tensorflow/core/util/event.pb.h"
429ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
4341803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#define GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR \
4441803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai  return errors::Unimplemented(              \
4541803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai      kGrpcURLScheme, " debug URL scheme is not implemented on Windows yet.")
46d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai
479ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cainamespace tensorflow {
489ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
499ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cainamespace {
509ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
513f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Creates an Event proto representing a chunk of a Tensor. This method only
523f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// populates the field of the Event proto that represent the envelope
533f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// informaion (e.g., timestmap, device_name, num_chunks, chunk_index, dtype,
543f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// shape). It does not set the value.tensor field, which should be set by the
553f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// caller separately.
563f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing CaiEvent PrepareChunkEventProto(const DebugNodeKey& debug_node_key,
573f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                             const uint64 wall_time_us, const size_t num_chunks,
583f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                             const size_t chunk_index,
593f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                             const DataType& tensor_dtype,
603f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                             const TensorShapeProto& tensor_shape) {
619ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  Event event;
629ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  event.set_wall_time(static_cast<double>(wall_time_us));
633f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  Summary::Value* value = event.mutable_summary()->add_value();
649ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
659ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  // Create the debug node_name in the Summary proto.
669ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  // For example, if tensor_name = "foo/node_a:0", and the debug_op is
679ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  // "DebugIdentity", the debug node_name in the Summary proto will be
689ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  // "foo/node_a:0:DebugIdentity".
693f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  value->set_node_name(debug_node_key.debug_node_name);
701e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower
713f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  // Tag by the node name. This allows TensorBoard to quickly fetch data
723f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  // per op.
733f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  value->set_tag(debug_node_key.node_name);
741e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower
751e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  // Store data within debugger metadata to be stored for each event.
761e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  third_party::tensorflow::core::debug::DebuggerEventMetadata metadata;
771e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  metadata.set_device(debug_node_key.device_name);
781e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  metadata.set_output_slot(debug_node_key.output_slot);
793f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  metadata.set_num_chunks(num_chunks);
803f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  metadata.set_chunk_index(chunk_index);
811e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower
821e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  // Encode the data in JSON.
831e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  string json_output;
841e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  tensorflow::protobuf::util::JsonPrintOptions json_options;
851e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  json_options.always_print_primitive_fields = true;
861e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  auto status = tensorflow::protobuf::util::MessageToJsonString(
871e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower      metadata, &json_output, json_options);
881e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  if (status.ok()) {
891e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower    // Store summary metadata. Set the plugin to use this data as "debugger".
901e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower    SummaryMetadata::PluginData* plugin_data =
914c60c96257bfd54a036d15af979e90fc0b4e400dA. Unique TensorFlower        value->mutable_metadata()->mutable_plugin_data();
923f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    plugin_data->set_plugin_name(DebugIO::kDebuggerPluginName);
931e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower    plugin_data->set_content(json_output);
941e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  } else {
951e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower    LOG(WARNING) << "Failed to convert DebuggerEventMetadata proto to JSON. "
961e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower                 << "The debug_node_name is " << debug_node_key.debug_node_name
971e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower                 << ".";
981e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  }
999ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
1003f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  value->mutable_tensor()->set_dtype(tensor_dtype);
1013f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  *value->mutable_tensor()->mutable_tensor_shape() = tensor_shape;
1023f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
1033f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  return event;
1043f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai}
1053f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
// Estimates how many bytes a string occupies when it is encoded as a bytes
// value in a protobuf.
//
// When PLATFORM_GOOGLE is defined, DebugGrpcIO::kGrpcMaxVarintLengthSize is
// added to the string's size to account for the Varint-encoded length. This is
// a conservative estimate (usually too large, but never too small under the
// gRPC message size limit) that works around the lack of a portable length
// function. On other platforms the string's own size is returned unchanged.
const size_t StringValMaxBytesInProto(const string& str) {
  size_t num_bytes = str.size();
#if defined(PLATFORM_GOOGLE)
  num_bytes += DebugGrpcIO::kGrpcMaxVarintLengthSize;
#endif
  return num_bytes;
}
1183f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
1193f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Breaks a string Tensor (represented as a TensorProto) as a vector of Event
1203f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// protos.
1213f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing CaiStatus WrapStringTensorAsEvents(const DebugNodeKey& debug_node_key,
1223f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                const uint64 wall_time_us,
1233f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                const size_t chunk_size_limit,
1243f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                TensorProto* tensor_proto,
1253f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                std::vector<Event>* events) {
1263f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  const protobuf::RepeatedPtrField<string>& strs = tensor_proto->string_val();
1273f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  const size_t num_strs = strs.size();
1283f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  const size_t chunk_size_ub = chunk_size_limit > 0
1293f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                   ? chunk_size_limit
1303f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                   : std::numeric_limits<size_t>::max();
1313f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
1323f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  // E.g., if cutoffs is {j, k, l}, the chunks will have index ranges:
1333f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  //   [0:a), [a:b), [c:<end>].
1343f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  std::vector<size_t> cutoffs;
1353f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  size_t chunk_size = 0;
1363f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  for (size_t i = 0; i < num_strs; ++i) {
1373f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    // Take into account the extra bytes in proto buffer.
1383f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    if (StringValMaxBytesInProto(strs[i]) > chunk_size_ub) {
1393f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      return errors::FailedPrecondition(
1403f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai          "string value at index ", i, " from debug node ",
1413f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai          debug_node_key.debug_node_name,
1423f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai          " does not fit gRPC message size limit (", chunk_size_ub, ")");
1433f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    }
1443f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    if (chunk_size + StringValMaxBytesInProto(strs[i]) > chunk_size_ub) {
1453f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      cutoffs.push_back(i);
1463f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      chunk_size = 0;
1473f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    }
1483f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    chunk_size += StringValMaxBytesInProto(strs[i]);
1493f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  }
1503f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  cutoffs.push_back(num_strs);
1513f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  const size_t num_chunks = cutoffs.size();
1523f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
1533f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  for (size_t i = 0; i < num_chunks; ++i) {
1543f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    Event event = PrepareChunkEventProto(debug_node_key, wall_time_us,
1553f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                         num_chunks, i, tensor_proto->dtype(),
1563f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                         tensor_proto->tensor_shape());
1573f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    Summary::Value* value = event.mutable_summary()->mutable_value(0);
1583f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
1593f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    if (cutoffs.size() == 1) {
1603f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      value->mutable_tensor()->mutable_string_val()->Swap(
1613f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai          tensor_proto->mutable_string_val());
1623f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    } else {
1633f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      const size_t begin = (i == 0) ? 0 : cutoffs[i - 1];
1643f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      const size_t end = cutoffs[i];
1653f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      for (size_t j = begin; j < end; ++j) {
1663f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai        value->mutable_tensor()->add_string_val(strs[j]);
1673f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      }
1683f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    }
1693f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
1703f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    events->push_back(std::move(event));
1713f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  }
1723f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
1733f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  return Status::OK();
1743f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai}
1753f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
1763f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Encapsulates the tensor value inside a vector of Event protos. Large tensors
1773f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// are broken up to multiple protos to fit the chunk_size_limit. In each Event
1783f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// proto the field summary.tensor carries the content of the tensor.
1793f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// If chunk_size_limit <= 0, the tensor will not be broken into chunks, i.e., a
1803f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// length-1 vector will be returned, regardless of the size of the tensor.
1813f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing CaiStatus WrapTensorAsEvents(const DebugNodeKey& debug_node_key,
1823f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                          const Tensor& tensor, const uint64 wall_time_us,
1833f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                          const size_t chunk_size_limit,
1843f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                          std::vector<Event>* events) {
1853f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  TensorProto tensor_proto;
1869ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  if (tensor.dtype() == DT_STRING) {
1873f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    // Treat DT_STRING specially, so that tensor_util.MakeNdarray in Python can
1883f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    // convert the TensorProto to string-type numpy array. MakeNdarray does not
1893f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    // work with strings encoded by AsProtoTensorContent() in tensor_content.
1903f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    tensor.AsProtoField(&tensor_proto);
1913f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
1923f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    TF_RETURN_IF_ERROR(WrapStringTensorAsEvents(
1933f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai        debug_node_key, wall_time_us, chunk_size_limit, &tensor_proto, events));
1949ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  } else {
1953f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    tensor.AsProtoTensorContent(&tensor_proto);
1963f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
1973f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    const size_t total_length = tensor_proto.tensor_content().size();
1983f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    const size_t chunk_size_ub =
1993f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai        chunk_size_limit > 0 ? chunk_size_limit : total_length;
2003f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    const size_t num_chunks =
2013f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai        (total_length == 0)
2023f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai            ? 1
2033f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai            : (total_length + chunk_size_ub - 1) / chunk_size_ub;
2043f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    for (size_t i = 0; i < num_chunks; ++i) {
2053f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      const size_t pos = i * chunk_size_ub;
2063f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      const size_t len =
2073f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai          (i == num_chunks - 1) ? (total_length - pos) : chunk_size_ub;
2083f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      Event event = PrepareChunkEventProto(debug_node_key, wall_time_us,
2093f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                           num_chunks, i, tensor_proto.dtype(),
2103f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                           tensor_proto.tensor_shape());
2113f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      event.mutable_summary()
2123f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai          ->mutable_value(0)
2133f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai          ->mutable_tensor()
2143f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai          ->set_tensor_content(tensor_proto.tensor_content().substr(pos, len));
2153f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      events->push_back(std::move(event));
2163f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    }
2179ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  }
2189ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
2193f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  return Status::OK();
2209ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai}
2219ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
2223f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Appends an underscore and a timestamp to a file path. If the path already
2239142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai// exists on the file system, append a hyphen and a 1-up index. Consecutive
2249142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai// values of the index will be tried until the first unused one is found.
2259142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai// TOCTOU race condition is not of concern here due to the fact that tfdbg
2269142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai// sets parallel_iterations attribute of all while_loops to 1 to prevent
2279142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai// the same node from between executed multiple times concurrently.
2289142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Caistring AppendTimestampToFilePath(const string& in, const uint64 timestamp) {
2299142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai  string out = strings::StrCat(in, "_", timestamp);
2309142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai
2319142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai  uint64 i = 1;
2329142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai  while (Env::Default()->FileExists(out).ok()) {
2339142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai    out = strings::StrCat(in, "_", timestamp, "-", i);
2349142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai    ++i;
2359142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai  }
2369142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai  return out;
2379142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai}
2389142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai
23941803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS
2403f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Publishes encoded GraphDef through a gRPC debugger stream, in chunks,
2413f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// conforming to the gRPC message size limit.
24212ac2f34fadc8802121382c64588d9f9c2f58390Shanqing CaiStatus PublishEncodedGraphDefInChunks(const string& encoded_graph_def,
24312ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai                                      const string& device_name,
24412ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai                                      const int64 wall_time,
24512ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai                                      const string& debug_url) {
24612ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai  const uint64 hash = ::tensorflow::Hash64(encoded_graph_def);
24712ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai  const size_t total_length = encoded_graph_def.size();
2483f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  const size_t num_chunks =
2493f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      static_cast<size_t>(std::ceil(static_cast<float>(total_length) /
2503f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                    DebugGrpcIO::kGrpcMessageSizeLimitBytes));
25112ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai  for (size_t i = 0; i < num_chunks; ++i) {
2523f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    const size_t pos = i * DebugGrpcIO::kGrpcMessageSizeLimitBytes;
2533f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    const size_t len = (i == num_chunks - 1)
2543f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                           ? (total_length - pos)
2553f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                           : DebugGrpcIO::kGrpcMessageSizeLimitBytes;
25612ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai    Event event;
25712ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai    event.set_wall_time(static_cast<double>(wall_time));
25812ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai    // Prefix the chunk with
25912ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai    //   <hash64>,<device_name>,<wall_time>|<index>|<num_chunks>|.
2603f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    // TODO(cais): Use DebuggerEventMetadata to store device_name, num_chunks
2613f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    // and chunk_index, instead.
26212ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai    event.set_graph_def(strings::StrCat(hash, ",", device_name, ",", wall_time,
26312ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai                                        "|", i, "|", num_chunks, "|",
26412ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai                                        encoded_graph_def.substr(pos, len)));
26512ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai    if (!DebugGrpcIO::SendEventProtoThroughGrpcStream(event, debug_url).ok()) {
26612ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai      return errors::FailedPrecondition(
26712ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai          "Failed to send chunk ", i, " of ", num_chunks,
26812ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai          " of encoded GraphDef of size ", encoded_graph_def.size(), " bytes");
26912ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai    }
27012ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai  }
27112ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai  return Status::OK();
27212ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai}
27341803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#endif  // #ifndef PLATFORM_WINDOWS
27412ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai
2759ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai}  // namespace
2769ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
277cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai// static
2783f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Caiconst char* const DebugIO::kDebuggerPluginName = "debugger";
2793f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
2803f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// static
281cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Caiconst char* const DebugIO::kMetadataFilePrefix = "_tfdbg_";
282cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai
283cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai// static
284cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Caiconst char* const DebugIO::kCoreMetadataTag = "core_metadata_";
285cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai
286cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai// static
287cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Caiconst char* const DebugIO::kDeviceTag = "device_";
288cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai
289cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai// static
290cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Caiconst char* const DebugIO::kGraphTag = "graph_";
291cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai
292525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Cai// static
293525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Caiconst char* const DebugIO::kHashTag = "hash";
294525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Cai
295258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing CaiDebugNodeKey::DebugNodeKey(const string& device_name, const string& node_name,
296258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                           const int32 output_slot, const string& debug_op)
297258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai    : device_name(device_name),
298258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai      node_name(node_name),
299258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai      output_slot(output_slot),
300258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai      debug_op(debug_op),
301258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai      debug_node_name(
302cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai          strings::StrCat(node_name, ":", output_slot, ":", debug_op)),
303cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai      device_path(DeviceNameToDevicePath(device_name)) {}
304258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai
3053c482c66b5a1f74875969e96834ff7564e829668Shanqing Caibool DebugNodeKey::operator==(const DebugNodeKey& other) const {
3063c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  return (device_name == other.device_name && node_name == other.node_name &&
3073c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai          output_slot == other.output_slot && debug_op == other.debug_op);
3083c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai}
3093c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai
3103c482c66b5a1f74875969e96834ff7564e829668Shanqing Caibool DebugNodeKey::operator!=(const DebugNodeKey& other) const {
3113c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  return !((*this) == other);
3123c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai}
3133c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai
314ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing CaiStatus ReadEventFromFile(const string& dump_file_path, Event* event) {
315ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  Env* env(Env::Default());
316ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
317ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  string content;
318ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  uint64 file_size = 0;
319ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
320ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  Status s = env->GetFileSize(dump_file_path, &file_size);
321ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  if (!s.ok()) {
322ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    return s;
323ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  }
324ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
325ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  content.resize(file_size);
326ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
327ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  std::unique_ptr<RandomAccessFile> file;
328ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  s = env->NewRandomAccessFile(dump_file_path, &file);
329ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  if (!s.ok()) {
330ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    return s;
331ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  }
332ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
333ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  StringPiece result;
334ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  s = file->Read(0, file_size, &result, &(content)[0]);
335ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  if (!s.ok()) {
336ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    return s;
337ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  }
338ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
339ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  event->ParseFromString(content);
340ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  return Status::OK();
341ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai}
342ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
3439ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai// static
344cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Caiconst string DebugNodeKey::DeviceNameToDevicePath(const string& device_name) {
345cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai  return strings::StrCat(
346cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai      DebugIO::kMetadataFilePrefix, DebugIO::kDeviceTag,
347cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai      str_util::StringReplace(
348cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai          str_util::StringReplace(device_name, ":", "_", true), "/", ",",
349cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai          true));
350cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai}
351cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai
352cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai// static
3539ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Caiconst char* const DebugIO::kFileURLScheme = "file://";
3549ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai// static
3559ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Caiconst char* const DebugIO::kGrpcURLScheme = "grpc://";
3569ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
3573f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Publishes debug metadata to a set of debug URLs.
3584a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai// static
359aabc7972b94af5a678550427534d4fba7fda327cShanqing CaiStatus DebugIO::PublishDebugMetadata(
360258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai    const int64 global_step, const int64 session_run_index,
361258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai    const int64 executor_step_index, const std::vector<string>& input_names,
362aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    const std::vector<string>& output_names,
363aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    const std::vector<string>& target_nodes,
364aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    const std::unordered_set<string>& debug_urls) {
365aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  std::ostringstream oss;
366aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai
367aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  // Construct a JSON string to carry the metadata.
368aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  oss << "{";
369aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  oss << "\"global_step\":" << global_step << ",";
370258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai  oss << "\"session_run_index\":" << session_run_index << ",";
371258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai  oss << "\"executor_step_index\":" << executor_step_index << ",";
372aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  oss << "\"input_names\":[";
373aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  for (size_t i = 0; i < input_names.size(); ++i) {
374aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    oss << "\"" << input_names[i] << "\"";
375aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    if (i < input_names.size() - 1) {
376aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      oss << ",";
377aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    }
378aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  }
379aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  oss << "],";
380aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  oss << "\"output_names\":[";
381aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  for (size_t i = 0; i < output_names.size(); ++i) {
382aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    oss << "\"" << output_names[i] << "\"";
383aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    if (i < output_names.size() - 1) {
384aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      oss << ",";
385aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    }
386aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  }
387aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  oss << "],";
388aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  oss << "\"target_nodes\":[";
389aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  for (size_t i = 0; i < target_nodes.size(); ++i) {
390aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    oss << "\"" << target_nodes[i] << "\"";
391aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    if (i < target_nodes.size() - 1) {
392aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      oss << ",";
393aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    }
394aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  }
395aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  oss << "]";
396aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  oss << "}";
397aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai
398aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  const string json_metadata = oss.str();
399aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  Event event;
400aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  event.set_wall_time(static_cast<double>(Env::Default()->NowMicros()));
401aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  LogMessage* log_message = event.mutable_log_message();
402aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  log_message->set_message(json_metadata);
403aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai
404aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  Status status;
405aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  for (const string& url : debug_urls) {
406aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    if (str_util::Lowercase(url).find(kGrpcURLScheme) == 0) {
40741803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS
408aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      Event grpc_event;
409aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai
410aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      // Determine the path (if any) in the grpc:// URL, and add it as a field
411aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      // of the JSON string.
412aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      const string address = url.substr(strlen(DebugIO::kFileURLScheme));
413aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      const string path = address.find("/") == string::npos
414aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai                              ? ""
415aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai                              : address.substr(address.find("/"));
416aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      grpc_event.set_wall_time(event.wall_time());
417aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      LogMessage* log_message_grpc = grpc_event.mutable_log_message();
418aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      log_message_grpc->set_message(
419aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai          strings::StrCat(json_metadata.substr(0, json_metadata.size() - 1),
420aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai                          ",\"grpc_path\":\"", path, "\"}"));
421aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai
422aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      status.Update(
423aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai          DebugGrpcIO::SendEventProtoThroughGrpcStream(grpc_event, url));
424d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#else
42541803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai      GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR;
426d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#endif
427aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    } else if (str_util::Lowercase(url).find(kFileURLScheme) == 0) {
428aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      const string dump_root_dir = url.substr(strlen(kFileURLScheme));
4299142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai      const string core_metadata_path = AppendTimestampToFilePath(
4309142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai          io::JoinPath(
4319142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai              dump_root_dir,
432cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai              strings::StrCat(DebugIO::kMetadataFilePrefix,
433cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai                              DebugIO::kCoreMetadataTag, "sessionrun",
434258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                              strings::Printf("%.14lld", session_run_index))),
4359142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai          Env::Default()->NowMicros());
4369142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai      status.Update(DebugFileIO::DumpEventProtoToFile(
4379142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai          event, io::Dirname(core_metadata_path).ToString(),
4389142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai          io::Basename(core_metadata_path).ToString()));
439aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    }
440aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  }
441aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai
442aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  return status;
443aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai}
444aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai
445aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai// static
446258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing CaiStatus DebugIO::PublishDebugTensor(const DebugNodeKey& debug_node_key,
447258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                   const Tensor& tensor,
4489ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai                                   const uint64 wall_time_us,
449ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai                                   const gtl::ArraySlice<string>& debug_urls,
450ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai                                   const bool gated_grpc) {
4518b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai  int32 num_failed_urls = 0;
452ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  std::vector<Status> fail_statuses;
4539ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  for (const string& url : debug_urls) {
4549ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    if (str_util::Lowercase(url).find(kFileURLScheme) == 0) {
4559ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai      const string dump_root_dir = url.substr(strlen(kFileURLScheme));
4569ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
457258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai      Status s = DebugFileIO::DumpTensorToDir(
458258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai          debug_node_key, tensor, wall_time_us, dump_root_dir, nullptr);
4599ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai      if (!s.ok()) {
4609ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai        num_failed_urls++;
461ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai        fail_statuses.push_back(s);
4629ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai      }
4639ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    } else if (str_util::Lowercase(url).find(kGrpcURLScheme) == 0) {
46441803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS
465ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai      Status s = DebugGrpcIO::SendTensorThroughGrpcStream(
466258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai          debug_node_key, tensor, wall_time_us, url, gated_grpc);
467ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
468ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai      if (!s.ok()) {
469ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai        num_failed_urls++;
470ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai        fail_statuses.push_back(s);
471ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai      }
472d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#else
47341803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai      GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR;
474d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#endif
4759ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    } else {
4769ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai      return Status(error::UNAVAILABLE,
4779ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai                    strings::StrCat("Invalid debug target URL: ", url));
4789ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    }
4799ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  }
4809ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
4819ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  if (num_failed_urls == 0) {
4829ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    return Status::OK();
4839ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  } else {
484ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    string error_message = strings::StrCat(
485ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai        "Publishing to ", num_failed_urls, " of ", debug_urls.size(),
486ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai        " debug target URLs failed, due to the following errors:");
487ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    for (Status& status : fail_statuses) {
488ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai      error_message =
489ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai          strings::StrCat(error_message, " ", status.error_message(), ";");
490ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    }
491ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
492ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    return Status(error::INTERNAL, error_message);
4939ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  }
4949ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai}
4959ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
4964a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai// static
497258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing CaiStatus DebugIO::PublishDebugTensor(const DebugNodeKey& debug_node_key,
498258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                   const Tensor& tensor,
499ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai                                   const uint64 wall_time_us,
500ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai                                   const gtl::ArraySlice<string>& debug_urls) {
501258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai  return PublishDebugTensor(debug_node_key, tensor, wall_time_us, debug_urls,
502258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                            false);
503ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai}
504ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
505ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai// static
50612ac2f34fadc8802121382c64588d9f9c2f58390Shanqing CaiStatus DebugIO::PublishGraph(const Graph& graph, const string& device_name,
5074a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai                             const std::unordered_set<string>& debug_urls) {
5084a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  GraphDef graph_def;
5094a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  graph.ToGraphDef(&graph_def);
5104a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai
5114a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  string buf;
5124a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  graph_def.SerializeToString(&buf);
5134a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai
5144a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  const int64 now_micros = Env::Default()->NowMicros();
5154a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  Event event;
5164a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  event.set_wall_time(static_cast<double>(now_micros));
5174a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  event.set_graph_def(buf);
5184a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai
5194a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  Status status = Status::OK();
5204a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  for (const string& debug_url : debug_urls) {
5214a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai    if (debug_url.find(kFileURLScheme) == 0) {
522cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai      const string dump_root_dir =
523cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai          io::JoinPath(debug_url.substr(strlen(kFileURLScheme)),
524cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai                       DebugNodeKey::DeviceNameToDevicePath(device_name));
525525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Cai      const uint64 graph_hash = ::tensorflow::Hash64(buf);
526525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Cai      const string file_name =
527525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Cai          strings::StrCat(DebugIO::kMetadataFilePrefix, DebugIO::kGraphTag,
528525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Cai                          DebugIO::kHashTag, graph_hash, "_", now_micros);
5294a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai
5304a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai      status.Update(
5314a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai          DebugFileIO::DumpEventProtoToFile(event, dump_root_dir, file_name));
5324a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai    } else if (debug_url.find(kGrpcURLScheme) == 0) {
53341803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS
53412ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai      status.Update(PublishEncodedGraphDefInChunks(buf, device_name, now_micros,
53512ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai                                                   debug_url));
536d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#else
53741803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai      GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR;
538d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#endif
5394a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai    }
5404a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  }
5414a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai
5424a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  return status;
5434a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai}
5444a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai
5454a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai// static
546617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Caibool DebugIO::IsCopyNodeGateOpen(
547617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai    const std::vector<DebugWatchAndURLSpec>& specs) {
54841803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS
549617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai  for (const DebugWatchAndURLSpec& spec : specs) {
550617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai    if (!spec.gated_grpc || spec.url.compare(0, strlen(DebugIO::kGrpcURLScheme),
551617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai                                             DebugIO::kGrpcURLScheme)) {
552617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai      return true;
553617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai    } else {
5543c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      if (DebugGrpcIO::IsReadGateOpen(spec.url, spec.watch_key)) {
555617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai        return true;
556617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai      }
557617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai    }
558617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai  }
559617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai  return false;
560617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai#else
561617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai  return true;
562617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai#endif
563617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai}
564617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai
565617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai// static
566ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Caibool DebugIO::IsDebugNodeGateOpen(const string& watch_key,
567ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai                                  const std::vector<string>& debug_urls) {
56841803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS
569ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  for (const string& debug_url : debug_urls) {
570617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai    if (debug_url.compare(0, strlen(DebugIO::kGrpcURLScheme),
571617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai                          DebugIO::kGrpcURLScheme)) {
572ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai      return true;
573ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    } else {
5743c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      if (DebugGrpcIO::IsReadGateOpen(debug_url, watch_key)) {
575ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai        return true;
576ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai      }
577ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    }
578ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  }
579ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  return false;
580ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai#else
581ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  return true;
582ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai#endif
583ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai}
584ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
585ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai// static
586ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Caibool DebugIO::IsDebugURLGateOpen(const string& watch_key,
587ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai                                 const string& debug_url) {
58841803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS
589ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  if (debug_url.find(kGrpcURLScheme) != 0) {
590ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    return true;
591ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  } else {
5923c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    return DebugGrpcIO::IsReadGateOpen(debug_url, watch_key);
593ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  }
594ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai#else
595ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  return true;
596ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai#endif
597ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai}
598ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
599ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai// static
600ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing CaiStatus DebugIO::CloseDebugURL(const string& debug_url) {
601ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  if (debug_url.find(DebugIO::kGrpcURLScheme) == 0) {
60241803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS
603aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    return DebugGrpcIO::CloseGrpcStream(debug_url);
604d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#else
60541803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai    GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR;
606d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#endif
607ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  } else {
608ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    // No-op for non-gRPC URLs.
609ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    return Status::OK();
610ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  }
611ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai}
612ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
613ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai// static
614ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Caistatic Status CloseDebugURL(const string& debug_url) { return Status::OK(); }
615ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
6169ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai// static
617258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing CaiStatus DebugFileIO::DumpTensorToDir(const DebugNodeKey& debug_node_key,
618258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                    const Tensor& tensor,
619258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                    const uint64 wall_time_us,
620258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                    const string& dump_root_dir,
621258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                    string* dump_file_path) {
622258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai  const string file_path =
623258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai      GetDumpFilePath(dump_root_dir, debug_node_key, wall_time_us);
6249ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
6259ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  if (dump_file_path != nullptr) {
6269ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    *dump_file_path = file_path;
6279ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  }
6289ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
629258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai  return DumpTensorToEventFile(debug_node_key, tensor, wall_time_us, file_path);
6309ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai}
6319ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
6329ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai// static
6339ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Caistring DebugFileIO::GetDumpFilePath(const string& dump_root_dir,
634258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                    const DebugNodeKey& debug_node_key,
6359ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai                                    const uint64 wall_time_us) {
6369142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai  return AppendTimestampToFilePath(
637cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai      io::JoinPath(dump_root_dir, debug_node_key.device_path,
638258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                   strings::StrCat(debug_node_key.node_name, "_",
639258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                   debug_node_key.output_slot, "_",
640258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                   debug_node_key.debug_op)),
6419142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai      wall_time_us);
6429ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai}
6439ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
6449ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai// static
6454a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing CaiStatus DebugFileIO::DumpEventProtoToFile(const Event& event_proto,
6464a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai                                         const string& dir_name,
6474a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai                                         const string& file_name) {
6489ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  Env* env(Env::Default());
6499ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
6504a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  Status s = RecursiveCreateDir(env, dir_name);
6519ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  if (!s.ok()) {
6529ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    return Status(error::FAILED_PRECONDITION,
6534a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai                  strings::StrCat("Failed to create directory  ", dir_name,
6549ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai                                  ", due to: ", s.error_message()));
6559ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  }
6569ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
6574a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  const string file_path = io::JoinPath(dir_name, file_name);
6589ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
6599ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  string event_str;
6604a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  event_proto.SerializeToString(&event_str);
6619ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
6629ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  std::unique_ptr<WritableFile> f = nullptr;
6639ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  TF_CHECK_OK(env->NewWritableFile(file_path, &f));
664bc225bfaa534acc25047fe844f19edc333b7a76aPeter Hawkins  f->Append(event_str).IgnoreError();
6659ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  TF_CHECK_OK(f->Close());
6669ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
6679ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  return Status::OK();
6689ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai}
6699ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
6709ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai// static
671258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing CaiStatus DebugFileIO::DumpTensorToEventFile(const DebugNodeKey& debug_node_key,
672258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                          const Tensor& tensor,
673258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                          const uint64 wall_time_us,
674258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                          const string& file_path) {
6753f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  std::vector<Event> events;
6763f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  TF_RETURN_IF_ERROR(
6773f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      WrapTensorAsEvents(debug_node_key, tensor, wall_time_us, 0, &events));
6783f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  return DumpEventProtoToFile(events[0], io::Dirname(file_path).ToString(),
6793f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                              io::Basename(file_path).ToString());
6804a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai}
6814a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai
6824a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai// static
6839ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing CaiStatus DebugFileIO::RecursiveCreateDir(Env* env, const string& dir) {
684879e0accd1c833771c8058d3eb5f2d4f06f895d4Jonathan Hseu  if (env->FileExists(dir).ok() && env->IsDirectory(dir).ok()) {
6859ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    // The path already exists as a directory. Return OK right away.
6869ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    return Status::OK();
6879ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  }
6889ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
6899ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  string parent_dir = io::Dirname(dir).ToString();
690879e0accd1c833771c8058d3eb5f2d4f06f895d4Jonathan Hseu  if (!env->FileExists(parent_dir).ok()) {
6919ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    // The parent path does not exist yet, create it first.
6929ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    Status s = RecursiveCreateDir(env, parent_dir);  // Recursive call
6939ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    if (!s.ok()) {
6949ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai      return Status(
6959ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai          error::FAILED_PRECONDITION,
6969ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai          strings::StrCat("Failed to create directory  ", parent_dir));
6979ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    }
698879e0accd1c833771c8058d3eb5f2d4f06f895d4Jonathan Hseu  } else if (env->FileExists(parent_dir).ok() &&
6999ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai             !env->IsDirectory(parent_dir).ok()) {
7009ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    // The path exists, but it is a file.
7019ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    return Status(error::FAILED_PRECONDITION,
7029ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai                  strings::StrCat("Failed to create directory  ", parent_dir,
7039ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai                                  " because the path exists as a file "));
7049ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  }
7059ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
706bc225bfaa534acc25047fe844f19edc333b7a76aPeter Hawkins  env->CreateDir(dir).IgnoreError();
7079ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  // Guard against potential race in creating directories by doing a check
7089ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  // after the CreateDir call.
709879e0accd1c833771c8058d3eb5f2d4f06f895d4Jonathan Hseu  if (env->FileExists(dir).ok() && env->IsDirectory(dir).ok()) {
7109ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    return Status::OK();
7119ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  } else {
7129ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    return Status(error::ABORTED,
7139ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai                  strings::StrCat("Failed to create directory  ", parent_dir));
7149ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  }
7159ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai}
7169ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
71741803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS
718ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing CaiDebugGrpcChannel::DebugGrpcChannel(const string& server_stream_addr)
7198b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai    : server_stream_addr_(server_stream_addr),
7208b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai      url_(strings::StrCat(DebugIO::kGrpcURLScheme, server_stream_addr)) {}
7218b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai
7228b219918214f779b0f4c7785ae93feffa6e492c3Shanqing CaiStatus DebugGrpcChannel::Connect(const int64 timeout_micros) {
7238b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai  ::grpc::ChannelArguments args;
7248b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai  args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
7258b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai  // Avoid problems where default reconnect backoff is too long (e.g., 20 s).
7268b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
72759ae0c0f9ac654bd668fb633feef3dbe26bae8eeShanqing Cai  channel_ = ::grpc::CreateCustomChannel(
72859ae0c0f9ac654bd668fb633feef3dbe26bae8eeShanqing Cai      server_stream_addr_, ::grpc::InsecureChannelCredentials(), args);
7298b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai  if (!channel_->WaitForConnected(
7308b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai          gpr_time_add(gpr_now(GPR_CLOCK_REALTIME),
7318b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai                       gpr_time_from_micros(timeout_micros, GPR_TIMESPAN)))) {
7328b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai    return errors::FailedPrecondition(
7338b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai        "Failed to connect to gRPC channel at ", server_stream_addr_,
7348b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai        " within a timeout of ", timeout_micros / 1e6, " s.");
7358b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai  }
7368b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai  stub_ = EventListener::NewStub(channel_);
7378b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai  reader_writer_ = stub_->SendEvents(&ctx_);
738258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai
7398b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai  return Status::OK();
740ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai}
741ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
742ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Caibool DebugGrpcChannel::WriteEvent(const Event& event) {
743ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  mutex_lock l(mu_);
744ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
745ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  return reader_writer_->Write(event);
746ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai}
747ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
7483c482c66b5a1f74875969e96834ff7564e829668Shanqing Caibool DebugGrpcChannel::ReadEventReply(EventReply* event_reply) {
7493c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  mutex_lock l(mu_);
7503c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai
7513c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  return reader_writer_->Read(event_reply);
7523c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai}
7533c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai
754ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing CaiStatus DebugGrpcChannel::ReceiveServerRepliesAndClose() {
755ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  mutex_lock l(mu_);
756ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
757ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  reader_writer_->WritesDone();
758ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
759ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  // Read all EventReply messages (if any) from the server.
760ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  EventReply event_reply;
761ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  while (reader_writer_->Read(&event_reply)) {
762ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    for (const EventReply::DebugOpStateChange& debug_op_state_change :
763ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai         event_reply.debug_op_state_changes()) {
764ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai      string watch_key = strings::StrCat(debug_op_state_change.node_name(), ":",
765ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai                                         debug_op_state_change.output_slot(),
766ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai                                         ":", debug_op_state_change.debug_op());
7673c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      DebugGrpcIO::SetDebugNodeKeyGrpcState(url_, watch_key,
7683c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai                                            debug_op_state_change.state());
769ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    }
770ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  }
771ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
772ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  if (reader_writer_->Finish().ok()) {
773ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    return Status::OK();
774ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  } else {
775ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    return Status(error::FAILED_PRECONDITION,
776ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai                  "Failed to close debug GRPC stream.");
777ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  }
778ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai}
779ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
780ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai// static
781ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Caimutex DebugGrpcIO::streams_mu;
782ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
7838b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai// static
7848b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Caiint64 DebugGrpcIO::channel_connection_timeout_micros = 900 * 1000 * 1000;
7858b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai// TODO(cais): Make this configurable?
7868b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai
7878b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai// static
7883f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Caiconst size_t DebugGrpcIO::kGrpcMessageSizeLimitBytes = 4000 * 1024;
7893f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
7903f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// static
7913f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Caiconst size_t DebugGrpcIO::kGrpcMaxVarintLengthSize = 6;
7923f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
7933f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// static
794ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Caistd::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>*
795ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing CaiDebugGrpcIO::GetStreamChannels() {
796ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  static std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>*
797ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai      stream_channels =
798ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai          new std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>();
799ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  return stream_channels;
800ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai}
801ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
802ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai// static
803ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing CaiStatus DebugGrpcIO::SendTensorThroughGrpcStream(
804258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai    const DebugNodeKey& debug_node_key, const Tensor& tensor,
805258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai    const uint64 wall_time_us, const string& grpc_stream_url,
806258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai    const bool gated) {
8073c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  if (gated &&
8083c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      !IsReadGateOpen(grpc_stream_url, debug_node_key.debug_node_name)) {
809ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    return Status::OK();
810ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  } else {
8113f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    std::vector<Event> events;
8123f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    TF_RETURN_IF_ERROR(WrapTensorAsEvents(debug_node_key, tensor, wall_time_us,
8133f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                          kGrpcMessageSizeLimitBytes, &events));
8143f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    for (const Event& event : events) {
8153f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      TF_RETURN_IF_ERROR(
8163f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai          SendEventProtoThroughGrpcStream(event, grpc_stream_url));
8173f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    }
8183c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    if (IsWriteGateOpen(grpc_stream_url, debug_node_key.debug_node_name)) {
8193c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      EventReply event_reply;
8203c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      TF_RETURN_IF_ERROR(ReceiveEventReplyProtoThroughGrpcStream(
8213c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai          &event_reply, grpc_stream_url));
8223c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      // TODO(cais): Support new tensor value carried in the EventReply for
8233c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      // overriding the value of the tensor being published.
8243c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    }
8253c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    return Status::OK();
8263c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  }
8273c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai}
8283c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai
8293c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai// static
8303c482c66b5a1f74875969e96834ff7564e829668Shanqing CaiStatus DebugGrpcIO::ReceiveEventReplyProtoThroughGrpcStream(
8313c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    EventReply* event_reply, const string& grpc_stream_url) {
8323c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  std::shared_ptr<DebugGrpcChannel> debug_grpc_channel;
8333c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  {
8343c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    mutex_lock l(streams_mu);
8353c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>*
8363c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai        stream_channels = GetStreamChannels();
8373c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    debug_grpc_channel = (*stream_channels)[grpc_stream_url];
8383c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  }
8393c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  if (debug_grpc_channel->ReadEventReply(event_reply)) {
8403f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    return Status::OK();
8413c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  } else {
8423c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    return errors::Cancelled(strings::StrCat(
8433c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai        "Reading EventReply from stream URL ", grpc_stream_url, " failed."));
844ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  }
8454a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai}
8464a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai
8474a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai// static
8484a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing CaiStatus DebugGrpcIO::SendEventProtoThroughGrpcStream(
849aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    const Event& event_proto, const string& grpc_stream_url) {
850aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  const string addr_with_path =
851258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai      grpc_stream_url.find(DebugIO::kGrpcURLScheme) == 0
852258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai          ? grpc_stream_url.substr(strlen(DebugIO::kGrpcURLScheme))
853258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai          : grpc_stream_url;
854aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  const string server_stream_addr =
855aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      addr_with_path.substr(0, addr_with_path.find('/'));
856ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  std::shared_ptr<DebugGrpcChannel> debug_grpc_channel;
857ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  {
858ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    mutex_lock l(streams_mu);
859ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>*
860ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai        stream_channels = GetStreamChannels();
861ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    if (stream_channels->find(grpc_stream_url) == stream_channels->end()) {
862ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai      debug_grpc_channel.reset(new DebugGrpcChannel(server_stream_addr));
8638b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai      TF_RETURN_IF_ERROR(
8648b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai          debug_grpc_channel->Connect(channel_connection_timeout_micros));
865ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai      (*stream_channels)[grpc_stream_url] = debug_grpc_channel;
866ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    } else {
867ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai      debug_grpc_channel = (*stream_channels)[grpc_stream_url];
868ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    }
869ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  }
870ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
8714a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  bool write_ok = debug_grpc_channel->WriteEvent(event_proto);
872ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  if (!write_ok) {
873ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    return errors::Cancelled(strings::StrCat("Write event to stream URL ",
8748b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai                                             grpc_stream_url, " failed."));
875ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  }
876ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
877ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  return Status::OK();
878ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai}
879ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
8803c482c66b5a1f74875969e96834ff7564e829668Shanqing Caibool DebugGrpcIO::IsReadGateOpen(const string& grpc_debug_url,
8813c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai                                 const string& watch_key) {
8823c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  const DebugNodeName2State* enabled_node_to_state =
8833c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      GetEnabledDebugOpStatesAtUrl(grpc_debug_url);
8843c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  return enabled_node_to_state->find(watch_key) != enabled_node_to_state->end();
8853c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai}
8863c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai
8873c482c66b5a1f74875969e96834ff7564e829668Shanqing Caibool DebugGrpcIO::IsWriteGateOpen(const string& grpc_debug_url,
8883c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai                                  const string& watch_key) {
8893c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  const DebugNodeName2State* enabled_node_to_state =
8903c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      GetEnabledDebugOpStatesAtUrl(grpc_debug_url);
8913c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  auto it = enabled_node_to_state->find(watch_key);
8923c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  if (it == enabled_node_to_state->end()) {
893ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    return false;
894ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  } else {
8953c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    return it->second == EventReply::DebugOpStateChange::READ_WRITE;
896ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  }
897ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai}
898ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
899ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai// static
900aabc7972b94af5a678550427534d4fba7fda327cShanqing CaiStatus DebugGrpcIO::CloseGrpcStream(const string& grpc_stream_url) {
901ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  mutex_lock l(streams_mu);
902ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
903ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>*
904ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai      stream_channels = GetStreamChannels();
905ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  if (stream_channels->find(grpc_stream_url) != stream_channels->end()) {
906ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    // Stream of the specified address exists. Close it and remove it from
907ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    // record.
908ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    Status s;
909ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    s = (*stream_channels)[grpc_stream_url]->ReceiveServerRepliesAndClose();
910ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    (*stream_channels).erase(grpc_stream_url);
911ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    return s;
912ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  } else {
913ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    // Stream of the specified address does not exist. No action.
914ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    return Status::OK();
915ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  }
916ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai}
917ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
918ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai// static
9193c482c66b5a1f74875969e96834ff7564e829668Shanqing Caistd::unordered_map<string, DebugGrpcIO::DebugNodeName2State>*
9203c482c66b5a1f74875969e96834ff7564e829668Shanqing CaiDebugGrpcIO::GetEnabledDebugOpStates() {
9213c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  static std::unordered_map<string, DebugNodeName2State>*
9223c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      enabled_debug_op_states =
9233c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai          new std::unordered_map<string, DebugNodeName2State>();
9243c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  return enabled_debug_op_states;
925ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai}
926ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
927ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai// static
9283c482c66b5a1f74875969e96834ff7564e829668Shanqing CaiDebugGrpcIO::DebugNodeName2State* DebugGrpcIO::GetEnabledDebugOpStatesAtUrl(
9293c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    const string& grpc_debug_url) {
930c1f69be22e151e2d051f41fccf436767eee4a26aShanqing Cai  static mutex* debug_ops_state_mu = new mutex();
9313c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  std::unordered_map<string, DebugNodeName2State>* states =
9323c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      GetEnabledDebugOpStates();
933c1f69be22e151e2d051f41fccf436767eee4a26aShanqing Cai
934c1f69be22e151e2d051f41fccf436767eee4a26aShanqing Cai  mutex_lock l(*debug_ops_state_mu);
9353c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  if (states->find(grpc_debug_url) == states->end()) {
9363c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    DebugNodeName2State url_enabled_debug_op_states;
9373c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    (*states)[grpc_debug_url] = url_enabled_debug_op_states;
938ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  }
9393c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  return &(*states)[grpc_debug_url];
940ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai}
941ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
942ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai// static
9433c482c66b5a1f74875969e96834ff7564e829668Shanqing Caivoid DebugGrpcIO::SetDebugNodeKeyGrpcState(
9443c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    const string& grpc_debug_url, const string& watch_key,
9453c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    const EventReply::DebugOpStateChange::State new_state) {
9463c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  DebugNodeName2State* states = GetEnabledDebugOpStatesAtUrl(grpc_debug_url);
9473c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  if (new_state == EventReply::DebugOpStateChange::DISABLED) {
9483c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    if (states->find(watch_key) == states->end()) {
9493c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      LOG(ERROR) << "Attempt to disable a watch key that is not currently "
9503c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai                 << "enabled at " << grpc_debug_url << ": " << watch_key;
951ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    } else {
9523c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      states->erase(watch_key);
953ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    }
9543c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  } else if (new_state != EventReply::DebugOpStateChange::STATE_UNSPECIFIED) {
9553c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    (*states)[watch_key] = new_state;
956ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  }
957ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai}
958ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
959ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai// static
9603c482c66b5a1f74875969e96834ff7564e829668Shanqing Caivoid DebugGrpcIO::ClearEnabledWatchKeys() {
9613c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  GetEnabledDebugOpStates()->clear();
962ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai}
963ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
96441803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#endif  // #ifndef PLATFORM_WINDOWS
965ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
9669ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai}  // namespace tensorflow
967