/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
159ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
169ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include "tensorflow/core/debug/debug_io_utils.h"
179ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
183c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai#include <stddef.h>
193c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai#include <string.h>
203c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai#include <cmath>
213c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai#include <limits>
223c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai#include <utility>
239ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include <vector>
249ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
2541803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS
26ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai#include "grpc++/create_channel.h"
2741803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#else
281cb96893a64f59b7265f9def9968f7bed1e57662Andrew Harp// winsock2.h is used in grpc, so Ws2_32.lib is needed
29c902ec6bf03aa1612c81ae65beb4cc3eef190ff4A. Unique TensorFlower#pragma comment(lib, "Ws2_32.lib")
3041803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#endif  // #ifndef PLATFORM_WINDOWS
311cb96893a64f59b7265f9def9968f7bed1e57662Andrew Harp
325ce3523bcc844217b47e7f862c1bed894cbaa34eA. Unique TensorFlower#include "tensorflow/core/debug/debug_callback_registry.h"
331e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower#include "tensorflow/core/debug/debugger_event_metadata.pb.h"
34e85d3df92deb9d717befdf173966a2913ac2aea0Geoffrey Irving#include "tensorflow/core/framework/graph.pb.h"
359ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include "tensorflow/core/framework/summary.pb.h"
36c902ec6bf03aa1612c81ae65beb4cc3eef190ff4A. Unique TensorFlower#include "tensorflow/core/framework/tensor_shape.pb.h"
373f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai#include "tensorflow/core/lib/core/bits.h"
3812ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai#include "tensorflow/core/lib/hash/hash.h"
399ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include "tensorflow/core/lib/io/path.h"
409ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include "tensorflow/core/lib/strings/str_util.h"
419142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai#include "tensorflow/core/lib/strings/stringprintf.h"
421e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower#include "tensorflow/core/platform/protobuf.h"
439ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai#include "tensorflow/core/util/event.pb.h"
449ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
// Expands to a `return` statement that yields an Unimplemented error whose
// message starts with kGrpcURLScheme. Because the expansion is a `return`, the
// macro can only be used inside a function that returns a Status and in which
// kGrpcURLScheme is visible.
// NOTE(review): presumably used by the Windows stubs of the gRPC debug
// functions, which are not visible in this part of the file -- confirm.
#define GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR \
  return errors::Unimplemented(              \
      kGrpcURLScheme, " debug URL scheme is not implemented on Windows yet.")
48d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai
499ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cainamespace tensorflow {
509ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
519ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cainamespace {
529ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
533f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Creates an Event proto representing a chunk of a Tensor. This method only
543f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// populates the field of the Event proto that represent the envelope
553f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// informaion (e.g., timestmap, device_name, num_chunks, chunk_index, dtype,
563f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// shape). It does not set the value.tensor field, which should be set by the
573f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// caller separately.
583f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing CaiEvent PrepareChunkEventProto(const DebugNodeKey& debug_node_key,
593f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                             const uint64 wall_time_us, const size_t num_chunks,
603f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                             const size_t chunk_index,
613f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                             const DataType& tensor_dtype,
623f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                             const TensorShapeProto& tensor_shape) {
639ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  Event event;
649ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  event.set_wall_time(static_cast<double>(wall_time_us));
653f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  Summary::Value* value = event.mutable_summary()->add_value();
669ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
679ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  // Create the debug node_name in the Summary proto.
689ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  // For example, if tensor_name = "foo/node_a:0", and the debug_op is
699ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  // "DebugIdentity", the debug node_name in the Summary proto will be
709ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  // "foo/node_a:0:DebugIdentity".
713f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  value->set_node_name(debug_node_key.debug_node_name);
721e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower
733f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  // Tag by the node name. This allows TensorBoard to quickly fetch data
743f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  // per op.
753f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  value->set_tag(debug_node_key.node_name);
761e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower
771e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  // Store data within debugger metadata to be stored for each event.
781e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  third_party::tensorflow::core::debug::DebuggerEventMetadata metadata;
791e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  metadata.set_device(debug_node_key.device_name);
801e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  metadata.set_output_slot(debug_node_key.output_slot);
813f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  metadata.set_num_chunks(num_chunks);
823f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  metadata.set_chunk_index(chunk_index);
831e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower
841e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  // Encode the data in JSON.
851e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  string json_output;
861e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  tensorflow::protobuf::util::JsonPrintOptions json_options;
871e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  json_options.always_print_primitive_fields = true;
881e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  auto status = tensorflow::protobuf::util::MessageToJsonString(
891e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower      metadata, &json_output, json_options);
901e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  if (status.ok()) {
911e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower    // Store summary metadata. Set the plugin to use this data as "debugger".
921e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower    SummaryMetadata::PluginData* plugin_data =
934c60c96257bfd54a036d15af979e90fc0b4e400dA. Unique TensorFlower        value->mutable_metadata()->mutable_plugin_data();
943f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    plugin_data->set_plugin_name(DebugIO::kDebuggerPluginName);
951e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower    plugin_data->set_content(json_output);
961e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  } else {
971e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower    LOG(WARNING) << "Failed to convert DebuggerEventMetadata proto to JSON. "
981e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower                 << "The debug_node_name is " << debug_node_key.debug_node_name
991e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower                 << ".";
1001e8d1fe6c3c1221f16c05dd1883f893f291c94d2A. Unique TensorFlower  }
1019ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
1023f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  value->mutable_tensor()->set_dtype(tensor_dtype);
1033f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  *value->mutable_tensor()->mutable_tensor_shape() = tensor_shape;
1043f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
1053f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  return event;
1063f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai}
1073f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
// Returns an upper-bound estimate of the number of bytes `str` occupies when
// it is stored as a bytes field in a protobuf.
//
// When PLATFORM_GOOGLE is defined, the result is the string length plus
// DebugGrpcIO::kGrpcMaxVarintLengthSize, i.e., a conservative allowance for
// the Varint-encoded length prefix (usually too large, but never too small
// under the gRPC message size limit). This works around the lack of a
// portable length function. On all other platforms the result is just the
// string length.
//
// The return type is a plain size_t: a top-level `const` on a by-value return
// has no effect and only triggers -Wignored-qualifiers.
size_t StringValMaxBytesInProto(const string& str) {
  size_t max_bytes = str.size();
#if defined(PLATFORM_GOOGLE)
  max_bytes += DebugGrpcIO::kGrpcMaxVarintLengthSize;
#endif
  return max_bytes;
}
1203f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
1213f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Breaks a string Tensor (represented as a TensorProto) as a vector of Event
1223f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// protos.
1233f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing CaiStatus WrapStringTensorAsEvents(const DebugNodeKey& debug_node_key,
1243f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                const uint64 wall_time_us,
1253f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                const size_t chunk_size_limit,
1263f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                TensorProto* tensor_proto,
1273f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                std::vector<Event>* events) {
1283f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  const protobuf::RepeatedPtrField<string>& strs = tensor_proto->string_val();
1293f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  const size_t num_strs = strs.size();
1303f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  const size_t chunk_size_ub = chunk_size_limit > 0
1313f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                   ? chunk_size_limit
1323f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                   : std::numeric_limits<size_t>::max();
1333f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
1343f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  // E.g., if cutoffs is {j, k, l}, the chunks will have index ranges:
1353f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  //   [0:a), [a:b), [c:<end>].
1363f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  std::vector<size_t> cutoffs;
1373f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  size_t chunk_size = 0;
1383f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  for (size_t i = 0; i < num_strs; ++i) {
1393f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    // Take into account the extra bytes in proto buffer.
1403f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    if (StringValMaxBytesInProto(strs[i]) > chunk_size_ub) {
1413f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      return errors::FailedPrecondition(
1423f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai          "string value at index ", i, " from debug node ",
1433f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai          debug_node_key.debug_node_name,
1443f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai          " does not fit gRPC message size limit (", chunk_size_ub, ")");
1453f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    }
1463f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    if (chunk_size + StringValMaxBytesInProto(strs[i]) > chunk_size_ub) {
1473f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      cutoffs.push_back(i);
1483f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      chunk_size = 0;
1493f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    }
1503f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    chunk_size += StringValMaxBytesInProto(strs[i]);
1513f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  }
1523f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  cutoffs.push_back(num_strs);
1533f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  const size_t num_chunks = cutoffs.size();
1543f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
1553f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  for (size_t i = 0; i < num_chunks; ++i) {
1563f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    Event event = PrepareChunkEventProto(debug_node_key, wall_time_us,
1573f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                         num_chunks, i, tensor_proto->dtype(),
1583f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                         tensor_proto->tensor_shape());
1593f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    Summary::Value* value = event.mutable_summary()->mutable_value(0);
1603f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
1613f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    if (cutoffs.size() == 1) {
1623f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      value->mutable_tensor()->mutable_string_val()->Swap(
1633f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai          tensor_proto->mutable_string_val());
1643f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    } else {
1653f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      const size_t begin = (i == 0) ? 0 : cutoffs[i - 1];
1663f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      const size_t end = cutoffs[i];
1673f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      for (size_t j = begin; j < end; ++j) {
1683f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai        value->mutable_tensor()->add_string_val(strs[j]);
1693f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      }
1703f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    }
1713f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
1723f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    events->push_back(std::move(event));
1733f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  }
1743f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
1753f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  return Status::OK();
1763f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai}
1773f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
1783f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Encapsulates the tensor value inside a vector of Event protos. Large tensors
1793f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// are broken up to multiple protos to fit the chunk_size_limit. In each Event
1803f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// proto the field summary.tensor carries the content of the tensor.
1813f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// If chunk_size_limit <= 0, the tensor will not be broken into chunks, i.e., a
1823f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// length-1 vector will be returned, regardless of the size of the tensor.
1833f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing CaiStatus WrapTensorAsEvents(const DebugNodeKey& debug_node_key,
1843f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                          const Tensor& tensor, const uint64 wall_time_us,
1853f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                          const size_t chunk_size_limit,
1863f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                          std::vector<Event>* events) {
1873f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  TensorProto tensor_proto;
1889ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  if (tensor.dtype() == DT_STRING) {
1893f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    // Treat DT_STRING specially, so that tensor_util.MakeNdarray in Python can
1903f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    // convert the TensorProto to string-type numpy array. MakeNdarray does not
1913f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    // work with strings encoded by AsProtoTensorContent() in tensor_content.
1923f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    tensor.AsProtoField(&tensor_proto);
1933f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
1943f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    TF_RETURN_IF_ERROR(WrapStringTensorAsEvents(
1953f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai        debug_node_key, wall_time_us, chunk_size_limit, &tensor_proto, events));
1969ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  } else {
1973f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    tensor.AsProtoTensorContent(&tensor_proto);
1983f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
1993f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    const size_t total_length = tensor_proto.tensor_content().size();
2003f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    const size_t chunk_size_ub =
2013f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai        chunk_size_limit > 0 ? chunk_size_limit : total_length;
2023f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    const size_t num_chunks =
2033f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai        (total_length == 0)
2043f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai            ? 1
2053f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai            : (total_length + chunk_size_ub - 1) / chunk_size_ub;
2063f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    for (size_t i = 0; i < num_chunks; ++i) {
2073f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      const size_t pos = i * chunk_size_ub;
2083f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      const size_t len =
2093f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai          (i == num_chunks - 1) ? (total_length - pos) : chunk_size_ub;
2103f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      Event event = PrepareChunkEventProto(debug_node_key, wall_time_us,
2113f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                           num_chunks, i, tensor_proto.dtype(),
2123f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                           tensor_proto.tensor_shape());
2133f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      event.mutable_summary()
2143f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai          ->mutable_value(0)
2153f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai          ->mutable_tensor()
2163f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai          ->set_tensor_content(tensor_proto.tensor_content().substr(pos, len));
2173f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      events->push_back(std::move(event));
2183f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    }
2199ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  }
2209ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
2213f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  return Status::OK();
2229ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai}
2239ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
2243f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Appends an underscore and a timestamp to a file path. If the path already
2259142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai// exists on the file system, append a hyphen and a 1-up index. Consecutive
2269142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai// values of the index will be tried until the first unused one is found.
2279142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai// TOCTOU race condition is not of concern here due to the fact that tfdbg
2289142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai// sets parallel_iterations attribute of all while_loops to 1 to prevent
2299142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai// the same node from between executed multiple times concurrently.
2309142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Caistring AppendTimestampToFilePath(const string& in, const uint64 timestamp) {
2319142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai  string out = strings::StrCat(in, "_", timestamp);
2329142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai
2339142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai  uint64 i = 1;
2349142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai  while (Env::Default()->FileExists(out).ok()) {
2359142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai    out = strings::StrCat(in, "_", timestamp, "-", i);
2369142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai    ++i;
2379142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai  }
2389142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai  return out;
2399142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai}
2409142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai
24141803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS
2423f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Publishes encoded GraphDef through a gRPC debugger stream, in chunks,
2433f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// conforming to the gRPC message size limit.
24412ac2f34fadc8802121382c64588d9f9c2f58390Shanqing CaiStatus PublishEncodedGraphDefInChunks(const string& encoded_graph_def,
24512ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai                                      const string& device_name,
24612ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai                                      const int64 wall_time,
24712ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai                                      const string& debug_url) {
24812ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai  const uint64 hash = ::tensorflow::Hash64(encoded_graph_def);
24912ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai  const size_t total_length = encoded_graph_def.size();
2503f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  const size_t num_chunks =
2513f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      static_cast<size_t>(std::ceil(static_cast<float>(total_length) /
2523f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                    DebugGrpcIO::kGrpcMessageSizeLimitBytes));
25312ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai  for (size_t i = 0; i < num_chunks; ++i) {
2543f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    const size_t pos = i * DebugGrpcIO::kGrpcMessageSizeLimitBytes;
2553f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    const size_t len = (i == num_chunks - 1)
2563f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                           ? (total_length - pos)
2573f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                           : DebugGrpcIO::kGrpcMessageSizeLimitBytes;
25812ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai    Event event;
25912ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai    event.set_wall_time(static_cast<double>(wall_time));
26012ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai    // Prefix the chunk with
26112ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai    //   <hash64>,<device_name>,<wall_time>|<index>|<num_chunks>|.
2623f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    // TODO(cais): Use DebuggerEventMetadata to store device_name, num_chunks
2633f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    // and chunk_index, instead.
26412ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai    event.set_graph_def(strings::StrCat(hash, ",", device_name, ",", wall_time,
26512ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai                                        "|", i, "|", num_chunks, "|",
26612ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai                                        encoded_graph_def.substr(pos, len)));
267ecae82d1343df293fa36e67949e5404111817110Shanqing Cai    const Status s = DebugGrpcIO::SendEventProtoThroughGrpcStream(
268ecae82d1343df293fa36e67949e5404111817110Shanqing Cai        event, debug_url, num_chunks - 1 == i);
269ecae82d1343df293fa36e67949e5404111817110Shanqing Cai    if (!s.ok()) {
27012ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai      return errors::FailedPrecondition(
27112ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai          "Failed to send chunk ", i, " of ", num_chunks,
272ecae82d1343df293fa36e67949e5404111817110Shanqing Cai          " of encoded GraphDef of size ", encoded_graph_def.size(), " bytes, ",
273ecae82d1343df293fa36e67949e5404111817110Shanqing Cai          "due to: ", s.error_message());
27412ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai    }
27512ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai  }
27612ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai  return Status::OK();
27712ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai}
27841803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#endif  // #ifndef PLATFORM_WINDOWS
27912ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai
2809ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai}  // namespace
2819ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
// Plugin name written into the SummaryMetadata plugin data of debugger Events
// (see PrepareChunkEventProto).
const char* const DebugIO::kDebuggerPluginName = "debugger";

// NOTE(review): the uses of the three tag constants below are not visible in
// this part of the file; presumably they tag or prefix the names of
// core-metadata, graph and hash events respectively -- confirm against the
// callers.
const char* const DebugIO::kCoreMetadataTag = "core_metadata_";

const char* const DebugIO::kGraphTag = "graph_";

const char* const DebugIO::kHashTag = "hash";
289525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Cai
290ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing CaiStatus ReadEventFromFile(const string& dump_file_path, Event* event) {
291ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  Env* env(Env::Default());
292ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
293ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  string content;
294ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  uint64 file_size = 0;
295ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
296ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  Status s = env->GetFileSize(dump_file_path, &file_size);
297ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  if (!s.ok()) {
298ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    return s;
299ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  }
300ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
301ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  content.resize(file_size);
302ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
303ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  std::unique_ptr<RandomAccessFile> file;
304ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  s = env->NewRandomAccessFile(dump_file_path, &file);
305ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  if (!s.ok()) {
306ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    return s;
307ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  }
308ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
309ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  StringPiece result;
310ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  s = file->Read(0, file_size, &result, &(content)[0]);
311ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  if (!s.ok()) {
312ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    return s;
313ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  }
314ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
315ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  event->ParseFromString(content);
316ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  return Status::OK();
317ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai}
318ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
// Scheme prefixes of the supported kinds of debug URLs: file system dumps,
// gRPC streams, and in-memory callbacks.
// NOTE(review): the code that dispatches on these prefixes is not visible in
// this part of the file -- confirm the exact semantics against it.
const char* const DebugIO::kFileURLScheme = "file://";
const char* const DebugIO::kGrpcURLScheme = "grpc://";
const char* const DebugIO::kMemoryURLScheme = "memcbk://";
3229ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
3233f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai// Publishes debug metadata to a set of debug URLs.
324aabc7972b94af5a678550427534d4fba7fda327cShanqing CaiStatus DebugIO::PublishDebugMetadata(
325258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai    const int64 global_step, const int64 session_run_index,
326258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai    const int64 executor_step_index, const std::vector<string>& input_names,
327aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    const std::vector<string>& output_names,
328aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    const std::vector<string>& target_nodes,
329aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    const std::unordered_set<string>& debug_urls) {
330aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  std::ostringstream oss;
331aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai
332aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  // Construct a JSON string to carry the metadata.
333aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  oss << "{";
334aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  oss << "\"global_step\":" << global_step << ",";
335258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai  oss << "\"session_run_index\":" << session_run_index << ",";
336258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai  oss << "\"executor_step_index\":" << executor_step_index << ",";
337aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  oss << "\"input_names\":[";
338aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  for (size_t i = 0; i < input_names.size(); ++i) {
339aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    oss << "\"" << input_names[i] << "\"";
340aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    if (i < input_names.size() - 1) {
341aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      oss << ",";
342aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    }
343aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  }
344aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  oss << "],";
345aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  oss << "\"output_names\":[";
346aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  for (size_t i = 0; i < output_names.size(); ++i) {
347aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    oss << "\"" << output_names[i] << "\"";
348aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    if (i < output_names.size() - 1) {
349aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      oss << ",";
350aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    }
351aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  }
352aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  oss << "],";
353aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  oss << "\"target_nodes\":[";
354aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  for (size_t i = 0; i < target_nodes.size(); ++i) {
355aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    oss << "\"" << target_nodes[i] << "\"";
356aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    if (i < target_nodes.size() - 1) {
357aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      oss << ",";
358aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    }
359aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  }
360aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  oss << "]";
361aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  oss << "}";
362aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai
363aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  const string json_metadata = oss.str();
364aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  Event event;
365aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  event.set_wall_time(static_cast<double>(Env::Default()->NowMicros()));
366aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  LogMessage* log_message = event.mutable_log_message();
367aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  log_message->set_message(json_metadata);
368aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai
369aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  Status status;
370aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  for (const string& url : debug_urls) {
371aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    if (str_util::Lowercase(url).find(kGrpcURLScheme) == 0) {
37241803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS
373aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      Event grpc_event;
374aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai
375aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      // Determine the path (if any) in the grpc:// URL, and add it as a field
376aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      // of the JSON string.
377aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      const string address = url.substr(strlen(DebugIO::kFileURLScheme));
378aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      const string path = address.find("/") == string::npos
379aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai                              ? ""
380aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai                              : address.substr(address.find("/"));
381aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      grpc_event.set_wall_time(event.wall_time());
382aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      LogMessage* log_message_grpc = grpc_event.mutable_log_message();
383aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      log_message_grpc->set_message(
384aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai          strings::StrCat(json_metadata.substr(0, json_metadata.size() - 1),
385aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai                          ",\"grpc_path\":\"", path, "\"}"));
386aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai
387aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      status.Update(
388ecae82d1343df293fa36e67949e5404111817110Shanqing Cai          DebugGrpcIO::SendEventProtoThroughGrpcStream(grpc_event, url, true));
389d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#else
39041803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai      GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR;
391d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#endif
392aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    } else if (str_util::Lowercase(url).find(kFileURLScheme) == 0) {
393aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      const string dump_root_dir = url.substr(strlen(kFileURLScheme));
3949142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai      const string core_metadata_path = AppendTimestampToFilePath(
3959142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai          io::JoinPath(
3969142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai              dump_root_dir,
3975ce3523bcc844217b47e7f862c1bed894cbaa34eA. Unique TensorFlower              strings::StrCat(DebugNodeKey::kMetadataFilePrefix,
398cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai                              DebugIO::kCoreMetadataTag, "sessionrun",
399258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                              strings::Printf("%.14lld", session_run_index))),
4009142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai          Env::Default()->NowMicros());
4019142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai      status.Update(DebugFileIO::DumpEventProtoToFile(
4029142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai          event, io::Dirname(core_metadata_path).ToString(),
4039142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai          io::Basename(core_metadata_path).ToString()));
404aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    }
405aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  }
406aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai
407aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  return status;
408aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai}
409aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai
410258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing CaiStatus DebugIO::PublishDebugTensor(const DebugNodeKey& debug_node_key,
411258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                   const Tensor& tensor,
4129ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai                                   const uint64 wall_time_us,
413ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai                                   const gtl::ArraySlice<string>& debug_urls,
414ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai                                   const bool gated_grpc) {
4158b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai  int32 num_failed_urls = 0;
416ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  std::vector<Status> fail_statuses;
4179ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  for (const string& url : debug_urls) {
4189ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    if (str_util::Lowercase(url).find(kFileURLScheme) == 0) {
4199ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai      const string dump_root_dir = url.substr(strlen(kFileURLScheme));
4209ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
421258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai      Status s = DebugFileIO::DumpTensorToDir(
422258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai          debug_node_key, tensor, wall_time_us, dump_root_dir, nullptr);
4239ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai      if (!s.ok()) {
4249ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai        num_failed_urls++;
425ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai        fail_statuses.push_back(s);
4269ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai      }
4279ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    } else if (str_util::Lowercase(url).find(kGrpcURLScheme) == 0) {
42841803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS
429ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai      Status s = DebugGrpcIO::SendTensorThroughGrpcStream(
430258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai          debug_node_key, tensor, wall_time_us, url, gated_grpc);
431ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
432ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai      if (!s.ok()) {
433ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai        num_failed_urls++;
434ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai        fail_statuses.push_back(s);
435ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai      }
436d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#else
43741803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai      GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR;
438d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#endif
4395ce3523bcc844217b47e7f862c1bed894cbaa34eA. Unique TensorFlower    } else if (str_util::Lowercase(url).find(kMemoryURLScheme) == 0) {
4405ce3523bcc844217b47e7f862c1bed894cbaa34eA. Unique TensorFlower      const string dump_root_dir = url.substr(strlen(kMemoryURLScheme));
4415ce3523bcc844217b47e7f862c1bed894cbaa34eA. Unique TensorFlower      auto* callback_registry = DebugCallbackRegistry::singleton();
4425ce3523bcc844217b47e7f862c1bed894cbaa34eA. Unique TensorFlower      auto* callback = callback_registry->GetCallback(dump_root_dir);
4435ce3523bcc844217b47e7f862c1bed894cbaa34eA. Unique TensorFlower      CHECK(callback) << "No callback registered for: " << dump_root_dir;
4445ce3523bcc844217b47e7f862c1bed894cbaa34eA. Unique TensorFlower      (*callback)(debug_node_key, tensor);
4459ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    } else {
4469ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai      return Status(error::UNAVAILABLE,
4479ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai                    strings::StrCat("Invalid debug target URL: ", url));
4489ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    }
4499ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  }
4509ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
4519ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  if (num_failed_urls == 0) {
4529ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    return Status::OK();
4539ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  } else {
454ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    string error_message = strings::StrCat(
455ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai        "Publishing to ", num_failed_urls, " of ", debug_urls.size(),
456ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai        " debug target URLs failed, due to the following errors:");
457ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    for (Status& status : fail_statuses) {
458ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai      error_message =
459ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai          strings::StrCat(error_message, " ", status.error_message(), ";");
460ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    }
461ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
462ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    return Status(error::INTERNAL, error_message);
4639ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  }
4649ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai}
4659ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
466258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing CaiStatus DebugIO::PublishDebugTensor(const DebugNodeKey& debug_node_key,
467258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                   const Tensor& tensor,
468ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai                                   const uint64 wall_time_us,
469ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai                                   const gtl::ArraySlice<string>& debug_urls) {
470258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai  return PublishDebugTensor(debug_node_key, tensor, wall_time_us, debug_urls,
471258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                            false);
472ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai}
473ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
47412ac2f34fadc8802121382c64588d9f9c2f58390Shanqing CaiStatus DebugIO::PublishGraph(const Graph& graph, const string& device_name,
4754a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai                             const std::unordered_set<string>& debug_urls) {
4764a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  GraphDef graph_def;
4774a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  graph.ToGraphDef(&graph_def);
4784a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai
4794a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  string buf;
4804a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  graph_def.SerializeToString(&buf);
4814a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai
4824a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  const int64 now_micros = Env::Default()->NowMicros();
4834a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  Event event;
4844a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  event.set_wall_time(static_cast<double>(now_micros));
4854a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  event.set_graph_def(buf);
4864a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai
4874a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  Status status = Status::OK();
4884a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  for (const string& debug_url : debug_urls) {
4894a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai    if (debug_url.find(kFileURLScheme) == 0) {
490cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai      const string dump_root_dir =
491cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai          io::JoinPath(debug_url.substr(strlen(kFileURLScheme)),
492cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai                       DebugNodeKey::DeviceNameToDevicePath(device_name));
493525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Cai      const uint64 graph_hash = ::tensorflow::Hash64(buf);
494525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Cai      const string file_name =
4955ce3523bcc844217b47e7f862c1bed894cbaa34eA. Unique TensorFlower          strings::StrCat(DebugNodeKey::kMetadataFilePrefix, DebugIO::kGraphTag,
496525b0f05839779e40d0ca9cc2967a3886b6a0f4dShanqing Cai                          DebugIO::kHashTag, graph_hash, "_", now_micros);
4974a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai
4984a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai      status.Update(
4994a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai          DebugFileIO::DumpEventProtoToFile(event, dump_root_dir, file_name));
5004a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai    } else if (debug_url.find(kGrpcURLScheme) == 0) {
50141803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS
50212ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai      status.Update(PublishEncodedGraphDefInChunks(buf, device_name, now_micros,
50312ac2f34fadc8802121382c64588d9f9c2f58390Shanqing Cai                                                   debug_url));
504d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#else
50541803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai      GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR;
506d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#endif
5074a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai    }
5084a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  }
5094a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai
5104a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  return status;
5114a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai}
5124a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai
513617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Caibool DebugIO::IsCopyNodeGateOpen(
514617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai    const std::vector<DebugWatchAndURLSpec>& specs) {
51541803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS
516617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai  for (const DebugWatchAndURLSpec& spec : specs) {
517617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai    if (!spec.gated_grpc || spec.url.compare(0, strlen(DebugIO::kGrpcURLScheme),
518617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai                                             DebugIO::kGrpcURLScheme)) {
519617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai      return true;
520617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai    } else {
5213c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      if (DebugGrpcIO::IsReadGateOpen(spec.url, spec.watch_key)) {
522617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai        return true;
523617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai      }
524617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai    }
525617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai  }
526617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai  return false;
527617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai#else
528617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai  return true;
529617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai#endif
530617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai}
531617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai
532ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Caibool DebugIO::IsDebugNodeGateOpen(const string& watch_key,
533ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai                                  const std::vector<string>& debug_urls) {
53441803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS
535ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  for (const string& debug_url : debug_urls) {
536617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai    if (debug_url.compare(0, strlen(DebugIO::kGrpcURLScheme),
537617ec9b47e70b4d9bd673642a559ac5652332df1Shanqing Cai                          DebugIO::kGrpcURLScheme)) {
538ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai      return true;
539ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    } else {
5403c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      if (DebugGrpcIO::IsReadGateOpen(debug_url, watch_key)) {
541ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai        return true;
542ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai      }
543ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    }
544ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  }
545ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  return false;
546ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai#else
547ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  return true;
548ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai#endif
549ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai}
550ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
551ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Caibool DebugIO::IsDebugURLGateOpen(const string& watch_key,
552ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai                                 const string& debug_url) {
55341803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS
554ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  if (debug_url.find(kGrpcURLScheme) != 0) {
555ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    return true;
556ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  } else {
5573c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    return DebugGrpcIO::IsReadGateOpen(debug_url, watch_key);
558ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  }
559ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai#else
560ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  return true;
561ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai#endif
562ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai}
563ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
564ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing CaiStatus DebugIO::CloseDebugURL(const string& debug_url) {
565ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  if (debug_url.find(DebugIO::kGrpcURLScheme) == 0) {
56641803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS
567aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai    return DebugGrpcIO::CloseGrpcStream(debug_url);
568d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#else
56941803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai    GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR;
570d82e42ca3a0b06d768a439d84ff74741de7e092dShanqing Cai#endif
571ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  } else {
572ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    // No-op for non-gRPC URLs.
573ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    return Status::OK();
574ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  }
575ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai}
576ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
577258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing CaiStatus DebugFileIO::DumpTensorToDir(const DebugNodeKey& debug_node_key,
578258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                    const Tensor& tensor,
579258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                    const uint64 wall_time_us,
580258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                    const string& dump_root_dir,
581258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                    string* dump_file_path) {
582258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai  const string file_path =
583258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai      GetDumpFilePath(dump_root_dir, debug_node_key, wall_time_us);
5849ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
5859ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  if (dump_file_path != nullptr) {
5869ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    *dump_file_path = file_path;
5879ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  }
5889ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
589258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai  return DumpTensorToEventFile(debug_node_key, tensor, wall_time_us, file_path);
5909ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai}
5919ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
5929ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Caistring DebugFileIO::GetDumpFilePath(const string& dump_root_dir,
593258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                    const DebugNodeKey& debug_node_key,
5949ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai                                    const uint64 wall_time_us) {
5959142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai  return AppendTimestampToFilePath(
596cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844Shanqing Cai      io::JoinPath(dump_root_dir, debug_node_key.device_path,
597258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                   strings::StrCat(debug_node_key.node_name, "_",
598258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                   debug_node_key.output_slot, "_",
599258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                   debug_node_key.debug_op)),
6009142d6164b1be58b2e2adf6ba7be6e46a942ff5aShanqing Cai      wall_time_us);
6019ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai}
6029ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
6034a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing CaiStatus DebugFileIO::DumpEventProtoToFile(const Event& event_proto,
6044a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai                                         const string& dir_name,
6054a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai                                         const string& file_name) {
6069ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  Env* env(Env::Default());
6079ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
6084a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  Status s = RecursiveCreateDir(env, dir_name);
6099ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  if (!s.ok()) {
6109ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    return Status(error::FAILED_PRECONDITION,
6114a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai                  strings::StrCat("Failed to create directory  ", dir_name,
6129ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai                                  ", due to: ", s.error_message()));
6139ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  }
6149ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
6154a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  const string file_path = io::JoinPath(dir_name, file_name);
6169ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
6179ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  string event_str;
6184a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  event_proto.SerializeToString(&event_str);
6199ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
6209ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  std::unique_ptr<WritableFile> f = nullptr;
6219ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  TF_CHECK_OK(env->NewWritableFile(file_path, &f));
622bc225bfaa534acc25047fe844f19edc333b7a76aPeter Hawkins  f->Append(event_str).IgnoreError();
6239ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  TF_CHECK_OK(f->Close());
6249ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
6259ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  return Status::OK();
6269ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai}
6279ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
628258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing CaiStatus DebugFileIO::DumpTensorToEventFile(const DebugNodeKey& debug_node_key,
629258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                          const Tensor& tensor,
630258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                          const uint64 wall_time_us,
631258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai                                          const string& file_path) {
6323f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  std::vector<Event> events;
6333f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  TF_RETURN_IF_ERROR(
6343f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      WrapTensorAsEvents(debug_node_key, tensor, wall_time_us, 0, &events));
6353f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai  return DumpEventProtoToFile(events[0], io::Dirname(file_path).ToString(),
6363f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                              io::Basename(file_path).ToString());
6374a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai}
6384a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai
6399ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing CaiStatus DebugFileIO::RecursiveCreateDir(Env* env, const string& dir) {
640879e0accd1c833771c8058d3eb5f2d4f06f895d4Jonathan Hseu  if (env->FileExists(dir).ok() && env->IsDirectory(dir).ok()) {
6419ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    // The path already exists as a directory. Return OK right away.
6429ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    return Status::OK();
6439ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  }
6449ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
6459ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  string parent_dir = io::Dirname(dir).ToString();
646879e0accd1c833771c8058d3eb5f2d4f06f895d4Jonathan Hseu  if (!env->FileExists(parent_dir).ok()) {
6479ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    // The parent path does not exist yet, create it first.
6489ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    Status s = RecursiveCreateDir(env, parent_dir);  // Recursive call
6499ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    if (!s.ok()) {
6509ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai      return Status(
6519ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai          error::FAILED_PRECONDITION,
6529ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai          strings::StrCat("Failed to create directory  ", parent_dir));
6539ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    }
654879e0accd1c833771c8058d3eb5f2d4f06f895d4Jonathan Hseu  } else if (env->FileExists(parent_dir).ok() &&
6559ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai             !env->IsDirectory(parent_dir).ok()) {
6569ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    // The path exists, but it is a file.
6579ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    return Status(error::FAILED_PRECONDITION,
6589ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai                  strings::StrCat("Failed to create directory  ", parent_dir,
6599ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai                                  " because the path exists as a file "));
6609ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  }
6619ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
662bc225bfaa534acc25047fe844f19edc333b7a76aPeter Hawkins  env->CreateDir(dir).IgnoreError();
6639ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  // Guard against potential race in creating directories by doing a check
6649ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  // after the CreateDir call.
665879e0accd1c833771c8058d3eb5f2d4f06f895d4Jonathan Hseu  if (env->FileExists(dir).ok() && env->IsDirectory(dir).ok()) {
6669ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    return Status::OK();
6679ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  } else {
6689ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai    return Status(error::ABORTED,
6699ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai                  strings::StrCat("Failed to create directory  ", parent_dir));
6709ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai  }
6719ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai}
6729ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai
67341803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#ifndef PLATFORM_WINDOWS
674ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing CaiDebugGrpcChannel::DebugGrpcChannel(const string& server_stream_addr)
6758b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai    : server_stream_addr_(server_stream_addr),
6768b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai      url_(strings::StrCat(DebugIO::kGrpcURLScheme, server_stream_addr)) {}
6778b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai
6788b219918214f779b0f4c7785ae93feffa6e492c3Shanqing CaiStatus DebugGrpcChannel::Connect(const int64 timeout_micros) {
6798b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai  ::grpc::ChannelArguments args;
6808b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai  args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
6818b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai  // Avoid problems where default reconnect backoff is too long (e.g., 20 s).
6828b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
68359ae0c0f9ac654bd668fb633feef3dbe26bae8eeShanqing Cai  channel_ = ::grpc::CreateCustomChannel(
68459ae0c0f9ac654bd668fb633feef3dbe26bae8eeShanqing Cai      server_stream_addr_, ::grpc::InsecureChannelCredentials(), args);
6858b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai  if (!channel_->WaitForConnected(
6868b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai          gpr_time_add(gpr_now(GPR_CLOCK_REALTIME),
6878b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai                       gpr_time_from_micros(timeout_micros, GPR_TIMESPAN)))) {
6888b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai    return errors::FailedPrecondition(
6898b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai        "Failed to connect to gRPC channel at ", server_stream_addr_,
6908b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai        " within a timeout of ", timeout_micros / 1e6, " s.");
6918b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai  }
6928b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai  stub_ = EventListener::NewStub(channel_);
6938b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai  reader_writer_ = stub_->SendEvents(&ctx_);
694258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai
6958b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai  return Status::OK();
696ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai}
697ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
698ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Caibool DebugGrpcChannel::WriteEvent(const Event& event) {
699ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  mutex_lock l(mu_);
700ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  return reader_writer_->Write(event);
701ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai}
702ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
7033c482c66b5a1f74875969e96834ff7564e829668Shanqing Caibool DebugGrpcChannel::ReadEventReply(EventReply* event_reply) {
704639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai  mutex_lock l(mu_);
7053c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  return reader_writer_->Read(event_reply);
7063c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai}
7073c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai
708ecae82d1343df293fa36e67949e5404111817110Shanqing Caivoid DebugGrpcChannel::ReceiveAndProcessEventReplies(const size_t max_replies) {
709ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  EventReply event_reply;
710ecae82d1343df293fa36e67949e5404111817110Shanqing Cai  size_t num_replies = 0;
711ecae82d1343df293fa36e67949e5404111817110Shanqing Cai  while ((max_replies == 0 || ++num_replies <= max_replies) &&
712ecae82d1343df293fa36e67949e5404111817110Shanqing Cai         ReadEventReply(&event_reply)) {
713ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    for (const EventReply::DebugOpStateChange& debug_op_state_change :
714ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai         event_reply.debug_op_state_changes()) {
715ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai      string watch_key = strings::StrCat(debug_op_state_change.node_name(), ":",
716ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai                                         debug_op_state_change.output_slot(),
717ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai                                         ":", debug_op_state_change.debug_op());
7183c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      DebugGrpcIO::SetDebugNodeKeyGrpcState(url_, watch_key,
7193c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai                                            debug_op_state_change.state());
720ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    }
721ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  }
722ecae82d1343df293fa36e67949e5404111817110Shanqing Cai}
723ecae82d1343df293fa36e67949e5404111817110Shanqing Cai
724ecae82d1343df293fa36e67949e5404111817110Shanqing CaiStatus DebugGrpcChannel::ReceiveServerRepliesAndClose() {
725ecae82d1343df293fa36e67949e5404111817110Shanqing Cai  reader_writer_->WritesDone();
726ecae82d1343df293fa36e67949e5404111817110Shanqing Cai  // Read all EventReply messages (if any) from the server.
727ecae82d1343df293fa36e67949e5404111817110Shanqing Cai  ReceiveAndProcessEventReplies(0);
728ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
729ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  if (reader_writer_->Finish().ok()) {
730ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    return Status::OK();
731ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  } else {
732ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    return Status(error::FAILED_PRECONDITION,
733ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai                  "Failed to close debug GRPC stream.");
734ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  }
735ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai}
736ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
737f93c48dc061d23495a4425fcad17d55159cb02b1A. Unique TensorFlowermutex DebugGrpcIO::streams_mu(LINKER_INITIALIZED);
738ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
7398b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Caiint64 DebugGrpcIO::channel_connection_timeout_micros = 900 * 1000 * 1000;
7408b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai// TODO(cais): Make this configurable?
7418b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai
7423f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Caiconst size_t DebugGrpcIO::kGrpcMessageSizeLimitBytes = 4000 * 1024;
7433f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
7443f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Caiconst size_t DebugGrpcIO::kGrpcMaxVarintLengthSize = 6;
7453f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai
746639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Caistd::unordered_map<string, std::unique_ptr<DebugGrpcChannel>>*
747ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing CaiDebugGrpcIO::GetStreamChannels() {
748639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai  static std::unordered_map<string, std::unique_ptr<DebugGrpcChannel>>*
749ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai      stream_channels =
750639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai          new std::unordered_map<string, std::unique_ptr<DebugGrpcChannel>>();
751ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  return stream_channels;
752ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai}
753ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
754ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing CaiStatus DebugGrpcIO::SendTensorThroughGrpcStream(
755258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai    const DebugNodeKey& debug_node_key, const Tensor& tensor,
756258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai    const uint64 wall_time_us, const string& grpc_stream_url,
757258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai    const bool gated) {
7583c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  if (gated &&
7593c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      !IsReadGateOpen(grpc_stream_url, debug_node_key.debug_node_name)) {
760ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    return Status::OK();
761ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  } else {
7623f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    std::vector<Event> events;
7633f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    TF_RETURN_IF_ERROR(WrapTensorAsEvents(debug_node_key, tensor, wall_time_us,
7643f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai                                          kGrpcMessageSizeLimitBytes, &events));
7653f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    for (const Event& event : events) {
7663f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai      TF_RETURN_IF_ERROR(
7673f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai          SendEventProtoThroughGrpcStream(event, grpc_stream_url));
7683f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    }
7693c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    if (IsWriteGateOpen(grpc_stream_url, debug_node_key.debug_node_name)) {
770639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai      DebugGrpcChannel* debug_grpc_channel = nullptr;
771639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai      TF_RETURN_IF_ERROR(
772639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai          GetOrCreateDebugGrpcChannel(grpc_stream_url, &debug_grpc_channel));
773639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai      debug_grpc_channel->ReceiveAndProcessEventReplies(1);
7743c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      // TODO(cais): Support new tensor value carried in the EventReply for
7753c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      // overriding the value of the tensor being published.
7763c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    }
7773c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    return Status::OK();
7783c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  }
7793c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai}
7803c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai
7813c482c66b5a1f74875969e96834ff7564e829668Shanqing CaiStatus DebugGrpcIO::ReceiveEventReplyProtoThroughGrpcStream(
7823c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    EventReply* event_reply, const string& grpc_stream_url) {
783639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai  DebugGrpcChannel* debug_grpc_channel = nullptr;
784639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai  TF_RETURN_IF_ERROR(
785639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai      GetOrCreateDebugGrpcChannel(grpc_stream_url, &debug_grpc_channel));
7863c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  if (debug_grpc_channel->ReadEventReply(event_reply)) {
7873f099e7d7f1c8b8d3f4121357fc7c40391f9eafeShanqing Cai    return Status::OK();
7883c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  } else {
7893c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    return errors::Cancelled(strings::StrCat(
7903c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai        "Reading EventReply from stream URL ", grpc_stream_url, " failed."));
791ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  }
7924a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai}
7934a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai
794639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing CaiStatus DebugGrpcIO::GetOrCreateDebugGrpcChannel(
795639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai    const string& grpc_stream_url, DebugGrpcChannel** debug_grpc_channel) {
796aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  const string addr_with_path =
797258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai      grpc_stream_url.find(DebugIO::kGrpcURLScheme) == 0
798258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai          ? grpc_stream_url.substr(strlen(DebugIO::kGrpcURLScheme))
799258b5073203573fd9a33fd9f0b133289fb22a2d0Shanqing Cai          : grpc_stream_url;
800aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai  const string server_stream_addr =
801aabc7972b94af5a678550427534d4fba7fda327cShanqing Cai      addr_with_path.substr(0, addr_with_path.find('/'));
802ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  {
803ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    mutex_lock l(streams_mu);
804639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai    std::unordered_map<string, std::unique_ptr<DebugGrpcChannel>>*
805ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai        stream_channels = GetStreamChannels();
806ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    if (stream_channels->find(grpc_stream_url) == stream_channels->end()) {
807639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai      std::unique_ptr<DebugGrpcChannel> channel(
808639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai          new DebugGrpcChannel(server_stream_addr));
809639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai      TF_RETURN_IF_ERROR(channel->Connect(channel_connection_timeout_micros));
810639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai      stream_channels->insert(
811639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai          std::make_pair(grpc_stream_url, std::move(channel)));
812ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    }
813639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai    *debug_grpc_channel = (*stream_channels)[grpc_stream_url].get();
814ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  }
815639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai  return Status::OK();
816639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai}
817639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai
818639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing CaiStatus DebugGrpcIO::SendEventProtoThroughGrpcStream(
819639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai    const Event& event_proto, const string& grpc_stream_url,
820639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai    const bool receive_reply) {
821639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai  DebugGrpcChannel* debug_grpc_channel;
822639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai  TF_RETURN_IF_ERROR(
823639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai      GetOrCreateDebugGrpcChannel(grpc_stream_url, &debug_grpc_channel));
824ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
8254a3e9868f84e38e3da235c0d6090e7831f34a143Shanqing Cai  bool write_ok = debug_grpc_channel->WriteEvent(event_proto);
826ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  if (!write_ok) {
827ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    return errors::Cancelled(strings::StrCat("Write event to stream URL ",
8288b219918214f779b0f4c7785ae93feffa6e492c3Shanqing Cai                                             grpc_stream_url, " failed."));
829ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  }
830ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
831ecae82d1343df293fa36e67949e5404111817110Shanqing Cai  if (receive_reply) {
832ecae82d1343df293fa36e67949e5404111817110Shanqing Cai    debug_grpc_channel->ReceiveAndProcessEventReplies(1);
833ecae82d1343df293fa36e67949e5404111817110Shanqing Cai  }
834ecae82d1343df293fa36e67949e5404111817110Shanqing Cai
835ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  return Status::OK();
836ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai}
837ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
8383c482c66b5a1f74875969e96834ff7564e829668Shanqing Caibool DebugGrpcIO::IsReadGateOpen(const string& grpc_debug_url,
8393c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai                                 const string& watch_key) {
8403c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  const DebugNodeName2State* enabled_node_to_state =
8413c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      GetEnabledDebugOpStatesAtUrl(grpc_debug_url);
8423c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  return enabled_node_to_state->find(watch_key) != enabled_node_to_state->end();
8433c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai}
8443c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai
8453c482c66b5a1f74875969e96834ff7564e829668Shanqing Caibool DebugGrpcIO::IsWriteGateOpen(const string& grpc_debug_url,
8463c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai                                  const string& watch_key) {
8473c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  const DebugNodeName2State* enabled_node_to_state =
8483c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      GetEnabledDebugOpStatesAtUrl(grpc_debug_url);
8493c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  auto it = enabled_node_to_state->find(watch_key);
8503c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  if (it == enabled_node_to_state->end()) {
851ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    return false;
852ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  } else {
8533c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    return it->second == EventReply::DebugOpStateChange::READ_WRITE;
854ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  }
855ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai}
856ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
857aabc7972b94af5a678550427534d4fba7fda327cShanqing CaiStatus DebugGrpcIO::CloseGrpcStream(const string& grpc_stream_url) {
858ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  mutex_lock l(streams_mu);
859ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
860639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai  std::unordered_map<string, std::unique_ptr<DebugGrpcChannel>>*
861ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai      stream_channels = GetStreamChannels();
862ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  if (stream_channels->find(grpc_stream_url) != stream_channels->end()) {
863ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    // Stream of the specified address exists. Close it and remove it from
864ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    // record.
865639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai    Status s =
866639661f1a7ddb8c82898d0b4247bd1892f03c7aeShanqing Cai        (*stream_channels)[grpc_stream_url]->ReceiveServerRepliesAndClose();
867ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    (*stream_channels).erase(grpc_stream_url);
868ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    return s;
869ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  } else {
870ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    // Stream of the specified address does not exist. No action.
871ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai    return Status::OK();
872ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai  }
873ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai}
874ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
8753c482c66b5a1f74875969e96834ff7564e829668Shanqing Caistd::unordered_map<string, DebugGrpcIO::DebugNodeName2State>*
8763c482c66b5a1f74875969e96834ff7564e829668Shanqing CaiDebugGrpcIO::GetEnabledDebugOpStates() {
8773c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  static std::unordered_map<string, DebugNodeName2State>*
8783c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      enabled_debug_op_states =
8793c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai          new std::unordered_map<string, DebugNodeName2State>();
8803c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  return enabled_debug_op_states;
881ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai}
882ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
8833c482c66b5a1f74875969e96834ff7564e829668Shanqing CaiDebugGrpcIO::DebugNodeName2State* DebugGrpcIO::GetEnabledDebugOpStatesAtUrl(
8843c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    const string& grpc_debug_url) {
885c1f69be22e151e2d051f41fccf436767eee4a26aShanqing Cai  static mutex* debug_ops_state_mu = new mutex();
8863c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  std::unordered_map<string, DebugNodeName2State>* states =
8873c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      GetEnabledDebugOpStates();
888c1f69be22e151e2d051f41fccf436767eee4a26aShanqing Cai
889c1f69be22e151e2d051f41fccf436767eee4a26aShanqing Cai  mutex_lock l(*debug_ops_state_mu);
8903c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  if (states->find(grpc_debug_url) == states->end()) {
8913c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    DebugNodeName2State url_enabled_debug_op_states;
8923c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    (*states)[grpc_debug_url] = url_enabled_debug_op_states;
893ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  }
8943c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  return &(*states)[grpc_debug_url];
895ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai}
896ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
8973c482c66b5a1f74875969e96834ff7564e829668Shanqing Caivoid DebugGrpcIO::SetDebugNodeKeyGrpcState(
8983c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    const string& grpc_debug_url, const string& watch_key,
8993c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    const EventReply::DebugOpStateChange::State new_state) {
9003c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  DebugNodeName2State* states = GetEnabledDebugOpStatesAtUrl(grpc_debug_url);
9013c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  if (new_state == EventReply::DebugOpStateChange::DISABLED) {
9023c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    if (states->find(watch_key) == states->end()) {
9033c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      LOG(ERROR) << "Attempt to disable a watch key that is not currently "
9043c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai                 << "enabled at " << grpc_debug_url << ": " << watch_key;
905ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    } else {
9063c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai      states->erase(watch_key);
907ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai    }
9083c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  } else if (new_state != EventReply::DebugOpStateChange::STATE_UNSPECIFIED) {
9093c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai    (*states)[watch_key] = new_state;
910ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai  }
911ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai}
912ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
9133c482c66b5a1f74875969e96834ff7564e829668Shanqing Caivoid DebugGrpcIO::ClearEnabledWatchKeys() {
9143c482c66b5a1f74875969e96834ff7564e829668Shanqing Cai  GetEnabledDebugOpStates()->clear();
915ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai}
916ecb5266e4791639781e4789a91cca8d3e00c4da7Shanqing Cai
91741803db36d4f4a3239bd81e5d460eb0e6e2eea88Shanqing Cai#endif  // #ifndef PLATFORM_WINDOWS
918ef2a926ec05dfd337d84279aafa58b22f0f36123Shanqing Cai
9199ccbae51231fcf6cfc8c4ef727790c21f7fed85cShanqing Cai}  // namespace tensorflow
920