example_proto_fast_parsing.cc revision 7705791619f5e851687e9a63b4315087e189f8be
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/util/example_proto_fast_parsing.h"
16db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
17db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include <vector>
18db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
19db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/example/example.pb.h"
20db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/example/feature.pb_text.h"
21db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/framework/numeric_op.h"
22db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/framework/op_kernel.h"
23db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/framework/register_types.h"
24db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/lib/core/blocking_counter.h"
25db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/lib/core/casts.h"
26db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/lib/core/errors.h"
27db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/lib/core/threadpool.h"
287705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower#include "tensorflow/core/lib/gtl/inlined_vector.h"
29db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/platform/logging.h"
30db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/platform/protobuf.h"
31db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/util/presized_cuckoo_map.h"
32db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/util/sparse/sparse_tensor.h"
33db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
34db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowernamespace tensorflow {
35db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowernamespace example {
36db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
37db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowernamespace {
387705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower
397705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlowertemplate <typename T>
407705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlowerusing SmallVector = gtl::InlinedVector<T, 4>;
417705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower
// Calls `a->EnableAliasing(true)` when the pointee type offers such a member.
//
// The trailing return type is only well-formed if that call expression
// compiles, so for types without the member this overload drops out of
// overload resolution instead of producing a hard error.
template <typename A>
auto EnableAliasing(A* a) -> decltype(a->EnableAliasing(true), void()) {
  (*a).EnableAliasing(true);
}
46db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
// Catch-all overload: accepts any argument and deliberately does nothing.
// It gives calls to EnableAliasing something to bind to when the argument's
// type has no EnableAliasing(bool) member.
template <typename A>
void EnableAliasing(A&& /*a*/) {
}
49db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
50db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFloweruint8 PeekTag(protobuf::io::CodedInputStream* stream) {
51db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  DCHECK(stream != nullptr);
52db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  const void* ptr;
53db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  int size;
54db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  if (!stream->GetDirectBufferPointer(&ptr, &size)) return 0;
55db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  return *static_cast<const uint8*>(ptr);
56db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower}
57db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
58db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerconstexpr uint8 kVarintTag(uint tag) { return (tag << 3) | 0; }
59db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerconstexpr uint8 kDelimitedTag(uint tag) { return (tag << 3) | 2; }
60db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerconstexpr uint8 kFixed32Tag(uint tag) { return (tag << 3) | 5; }
61db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
62db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowernamespace parsed {
63db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
64db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower// ParseDataType has to be called first, then appropriate ParseZzzzList.
65db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerclass Feature {
66db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower public:
67db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  Feature() {}
68db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  Feature(StringPiece serialized) : serialized_(serialized) {}
69db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
70db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  Status ParseDataType(DataType* dtype) {
71db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    DCHECK(dtype != nullptr);
72db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    if (serialized_.empty()) {
73db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      *dtype = DT_INVALID;
74db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      return Status::OK();
75db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    }
76db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    uint8 oneof_tag = static_cast<uint8>(*serialized_.data());
77db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    serialized_.remove_prefix(1);
78db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    switch (oneof_tag) {
79db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      case kDelimitedTag(1):
80db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        *dtype = DT_STRING;
81db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        break;
82db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      case kDelimitedTag(2):
83db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        *dtype = DT_FLOAT;
84db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        break;
85db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      case kDelimitedTag(3):
86db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        *dtype = DT_INT64;
87db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        break;
88db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      default:
89db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        return errors::InvalidArgument("Unsuported datatype.");
90db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    }
91db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    return Status::OK();
92db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
93db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
947705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  bool ParseBytesList(SmallVector<string>* bytes_list) {
95db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    DCHECK(bytes_list != nullptr);
96db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    protobuf::io::CodedInputStream stream(
97db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        reinterpret_cast<const uint8*>(serialized_.data()), serialized_.size());
98db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
99db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    EnableAliasing(&stream);
100db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
101db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    uint32 length;
102db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    if (!stream.ReadVarint32(&length)) return false;
103db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    auto limit = stream.PushLimit(length);
104db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
105db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    while (!stream.ExpectAtEnd()) {
106db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      if (!stream.ExpectTag(kDelimitedTag(1))) return false;
107db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      // parse string
108db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      uint32 bytes_length;
109db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      if (!stream.ReadVarint32(&bytes_length)) return false;
110db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      string bytes;
111db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      if (!stream.ReadString(&bytes, bytes_length)) return false;
112db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      bytes_list->push_back(std::move(bytes));
113db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    }
114db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    stream.PopLimit(limit);
115db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    return true;
116db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
117db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
1187705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  bool ParseFloatList(SmallVector<float>* float_list) {
119db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    DCHECK(float_list != nullptr);
120db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    protobuf::io::CodedInputStream stream(
121db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        reinterpret_cast<const uint8*>(serialized_.data()), serialized_.size());
122db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    EnableAliasing(&stream);
123db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    uint32 length;
124db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    if (!stream.ReadVarint32(&length)) return false;
125db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    auto limit = stream.PushLimit(length);
126db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
127db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    if (!stream.ExpectAtEnd()) {
128db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      uint8 peek_tag = PeekTag(&stream);
129db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      if (peek_tag != kDelimitedTag(1) && peek_tag != kFixed32Tag(1)) {
130db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        return false;
131db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      }
132db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
133db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      if (peek_tag == kDelimitedTag(1)) {                       // packed
134db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        if (!stream.ExpectTag(kDelimitedTag(1))) return false;  // packed tag
135db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        uint32 packed_length;
136db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        if (!stream.ReadVarint32(&packed_length)) return false;
137db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        auto packed_limit = stream.PushLimit(packed_length);
138db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
139db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        while (!stream.ExpectAtEnd()) {
140db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          uint32 buffer32;
141db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          if (!stream.ReadLittleEndian32(&buffer32)) return false;
142db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          float_list->push_back(bit_cast<float>(buffer32));
143db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        }
144db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
145db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        stream.PopLimit(packed_limit);
146db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      } else {  // non-packed
147db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        while (!stream.ExpectAtEnd()) {
148db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          if (!stream.ExpectTag(kFixed32Tag(1))) return false;
149db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          uint32 buffer32;
150db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          if (!stream.ReadLittleEndian32(&buffer32)) return false;
151db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          float_list->push_back(bit_cast<float>(buffer32));
152db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        }
153db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      }
154db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    }
155db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
156db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    stream.PopLimit(limit);
157db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    return true;
158db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
159db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
1607705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  bool ParseInt64List(SmallVector<int64>* int64_list) {
161db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    DCHECK(int64_list != nullptr);
162db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    protobuf::io::CodedInputStream stream(
163db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        reinterpret_cast<const uint8*>(serialized_.data()), serialized_.size());
164db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    EnableAliasing(&stream);
165db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    uint32 length;
166db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    if (!stream.ReadVarint32(&length)) return false;
167db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    auto limit = stream.PushLimit(length);
168db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
169db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    if (!stream.ExpectAtEnd()) {
170db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      uint8 peek_tag = PeekTag(&stream);
171db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      if (peek_tag != kDelimitedTag(1) && peek_tag != kVarintTag(1)) {
172db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        return false;
173db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      }
174db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      if (peek_tag == kDelimitedTag(1)) {                       // packed
175db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        if (!stream.ExpectTag(kDelimitedTag(1))) return false;  // packed tag
176db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        uint32 packed_length;
177db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        if (!stream.ReadVarint32(&packed_length)) return false;
178db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        auto packed_limit = stream.PushLimit(packed_length);
179db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
180db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        while (!stream.ExpectAtEnd()) {
181967376bdf3ae9007f8b4c996a4a260a911dfc409A. Unique TensorFlower          protobuf_uint64 n;  // There is no API for int64
182db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          if (!stream.ReadVarint64(&n)) return false;
183db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          int64_list->push_back(n);
184db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        }
185db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
186db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        stream.PopLimit(packed_limit);
187db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      } else {  // non-packed
188db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        while (!stream.ExpectAtEnd()) {
189db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          if (!stream.ExpectTag(kVarintTag(1))) return false;
190967376bdf3ae9007f8b4c996a4a260a911dfc409A. Unique TensorFlower          protobuf_uint64 n;  // There is no API for int64
191db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          if (!stream.ReadVarint64(&n)) return false;
192db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          int64_list->push_back(n);
193db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        }
194db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      }
195db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    }
196db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    stream.PopLimit(limit);
197db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    return true;
198db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
199db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
200db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  StringPiece GetSerialized() const { return serialized_; }
201db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
202db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower private:
203db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  // TODO(lew): Pair of uint8* would be more natural.
204db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  StringPiece serialized_;
205db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower};
206db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
// One entry of the features map: the feature name paired with a Feature view
// over its still-serialized value.
using FeatureMapEntry = std::pair<StringPiece, Feature>;
// A parsed example: its map entries stored as a flat sequence.
using Example = std::vector<FeatureMapEntry>;
209db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
210db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower}  // namespace parsed
211db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
212db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerbool ParseString(protobuf::io::CodedInputStream* stream, StringPiece* result) {
213db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  DCHECK(stream != nullptr);
214db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  DCHECK(result != nullptr);
215db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  uint32 length;
216db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  if (!stream->ReadVarint32(&length)) return false;
217db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  if (length == 0) {
218db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    *result = StringPiece(nullptr, 0);
219db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    return true;
220db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
221db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  const void* stream_alias;
222db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  int stream_size;
223db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  if (!stream->GetDirectBufferPointer(&stream_alias, &stream_size)) {
224db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    return false;
225db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
226db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  if (static_cast<uint32>(stream_size) < length) return false;
227db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  *result = StringPiece(static_cast<const char*>(stream_alias), length);
228db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  stream->Skip(length);
229db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  return true;
230db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower}
231db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
232db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerbool ParseFeatureMapEntry(protobuf::io::CodedInputStream* stream,
233db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                          parsed::FeatureMapEntry* feature_map_entry) {
234db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  DCHECK(stream != nullptr);
235db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  DCHECK(feature_map_entry != nullptr);
236db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  uint32 length;
237db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  if (!stream->ReadVarint32(&length)) return false;
238db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  auto limit = stream->PushLimit(length);
239db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  if (!stream->ExpectTag(kDelimitedTag(1))) return false;
240db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  if (!ParseString(stream, &feature_map_entry->first)) return false;
241db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  if (!stream->ExpectTag(kDelimitedTag(2))) return false;
242db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  StringPiece feature_string_piece;
243db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  if (!ParseString(stream, &feature_string_piece)) return false;
244db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  feature_map_entry->second = parsed::Feature(feature_string_piece);
245db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  if (!stream->ExpectAtEnd()) return false;
246db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  stream->PopLimit(limit);
247db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  return true;
248db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower}
249db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
250db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerbool ParseFeatures(protobuf::io::CodedInputStream* stream,
251db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                   parsed::Example* example) {
252db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  DCHECK(stream != nullptr);
253db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  DCHECK(example != nullptr);
254db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  uint32 length;
255db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  if (!stream->ReadVarint32(&length)) return false;
256db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  auto limit = stream->PushLimit(length);
257db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  while (!stream->ExpectAtEnd()) {
258db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    parsed::FeatureMapEntry feature_map_entry;
259db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    if (!stream->ExpectTag(kDelimitedTag(1))) return false;
260db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    if (!ParseFeatureMapEntry(stream, &feature_map_entry)) return false;
261db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    example->push_back(std::move(feature_map_entry));
262db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
263db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  stream->PopLimit(limit);
264db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  return true;
265db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower}
266db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
267db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerbool ParseExample(protobuf::io::CodedInputStream* stream,
268db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                  parsed::Example* example) {
269db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  DCHECK(stream != nullptr);
270db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  DCHECK(example != nullptr);
271db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  if (stream->ExpectTag(kDelimitedTag(1))) {
272db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    if (!ParseFeatures(stream, example)) return false;
273db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
274db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  if (!stream->ExpectAtEnd()) return false;
275db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  return true;
276db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower}
277db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
278db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerbool ParseExample(StringPiece serialized, parsed::Example* example) {
279db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  DCHECK(example != nullptr);
280db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  protobuf::io::CodedInputStream stream(
281db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      reinterpret_cast<const uint8*>(serialized.data()), serialized.size());
282db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  EnableAliasing(&stream);
283db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  return ParseExample(&stream, example);
284db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower}
285db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
286db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower}  // namespace
287db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
288db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerbool TestFastParse(const string& serialized, Example* example) {
289db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  DCHECK(example != nullptr);
290db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  parsed::Example parsed_example;
291db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  if (!ParseExample(serialized, &parsed_example)) return false;
292db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  auto& features = *example->mutable_features();
293db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  for (parsed::FeatureMapEntry& entry : parsed_example) {
294db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    auto& value = (*features.mutable_feature())[entry.first.ToString()];
295db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    DataType dtype;
296db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    if (!entry.second.ParseDataType(&dtype).ok()) return false;
297db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    switch (dtype) {
298db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      case DT_INVALID:
299db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        break;
300db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      case DT_STRING: {
3017705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower        SmallVector<string> list;
302db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        if (!entry.second.ParseBytesList(&list)) return false;
303db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        auto* result_list = value.mutable_bytes_list();
304db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        for (auto& bytes : list) {
305db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          result_list->add_value(std::move(bytes));
306db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        }
307db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        break;
308db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      }
309db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      case DT_FLOAT: {
3107705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower        SmallVector<float> list;
311db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        if (!entry.second.ParseFloatList(&list)) return false;
312db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        auto* result_list = value.mutable_float_list();
313db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        for (float f : list) {
314db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          result_list->add_value(f);
315db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        }
316db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        break;
317db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      }
318db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      case DT_INT64: {
3197705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower        SmallVector<int64> list;
320db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        if (!entry.second.ParseInt64List(&list)) return false;
321db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        auto* result_list = value.mutable_int64_list();
322db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        for (int64 i : list) {
323db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          result_list->add_value(i);
324db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        }
325db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        break;
326db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      }
327db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      default:
328db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        CHECK(false) << "Should not happen.";
329db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    }
330db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
331db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  return true;
332db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower}
333db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
334db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower// -----------------------------------------------------------------------------
335db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
336db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowernamespace {
337db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
338db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerusing Config = FastParseExampleConfig;
339db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
340db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowervoid ParallelFor(const std::function<void(size_t)>& f, size_t n,
341db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                 thread::ThreadPool* thread_pool) {
342db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  if (n == 0) return;
3437705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  if (thread_pool == nullptr) {
3447705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower    for (size_t i = 0; i < n; ++i) {
345db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      f(i);
3467705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower    }
3477705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  } else {
3487705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower    BlockingCounter counter(n - 1);
3497705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower    for (size_t i = 1; i < n; ++i) {
3507705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower      thread_pool->Schedule([i, &f, &counter] {
3517705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower        f(i);
3527705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower        counter.DecrementCount();
3537705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower      });
3547705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower    }
3557705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower    f(0);
3567705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower    counter.Wait();
357db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
358db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower}
359db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
// Kind of feature a config entry describes. Stored together with the entry's
// index as the value type of the config lookup table.
enum class Type {
  Sparse,
  Dense,
};
361db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
362db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerstruct SparseBuffer {
363db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  // Features are in one of the 3 vectors below depending on config's dtype.
364db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  // Other 2 vectors remain empty.
3657705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  SmallVector<string> bytes_list;
3667705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  SmallVector<float> float_list;
3677705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  SmallVector<int64> int64_list;
368db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
369db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  // Features of example i are elements with indices
370db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  // from example_end_indices[i-1] to example_end_indices[i]-1 on the
371db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  // appropriate xxxxx_list
372db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  std::vector<size_t> example_end_indices;
373db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower};
374db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
375db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerstruct SeededHasher {
376db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  uint64 operator()(StringPiece s) const {
377db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    return Hash64(s.data(), s.size(), seed);
378db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
379db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  uint64 seed{0xDECAFCAFFE};
380db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower};
381db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
382db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerStatus FastParseSerializedExample(
383db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    const string& serialized_example, const string& example_name,
384db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    const size_t example_index, const Config& config,
385db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    const PresizedCuckooMap<std::pair<size_t, Type>>& config_index,
386db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    SeededHasher hasher, std::vector<Tensor>* output_dense,
387db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    std::vector<SparseBuffer>* output_sparse) {
388db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  DCHECK(output_dense != nullptr);
389db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  DCHECK(output_sparse != nullptr);
390db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  parsed::Example parsed_example;
391db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  if (!ParseExample(serialized_example, &parsed_example)) {
392db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    return errors::InvalidArgument("Could not parse example input, value: '",
393db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                                   serialized_example, "'");
394db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
395db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  constexpr size_t kMax = std::numeric_limits<size_t>::max();
396db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  std::vector<size_t> sparse_features_found(config.sparse.size(), kMax);
397db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  std::vector<size_t> dense_features_found(config.dense.size(), kMax);
398db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
399db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  // Handle features present in the example.
400db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  for (parsed::FeatureMapEntry& name_and_feature : parsed_example) {
401db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    parsed::Feature& feature = name_and_feature.second;
402db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    std::pair<size_t, Type> d_and_type;
403db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    uint64 h = hasher(name_and_feature.first);
404db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    if (!config_index.Find(h, &d_and_type)) continue;
405db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    size_t d = d_and_type.first;
406db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
407db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    auto parse_error = [&](StringPiece feature_name) {
408db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      return errors::InvalidArgument("Name: ", example_name, ", Key: ",
409db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                                     feature_name, ", Index: ", example_index,
410db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                                     ". Can't parse serialized Example.");
411db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    };
412db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
413db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    if (d_and_type.second == Type::Dense) {
414db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      DataType example_dtype;
415db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      TF_RETURN_IF_ERROR(feature.ParseDataType(&example_dtype));
416db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      if (example_dtype == DT_INVALID) continue;
417db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
418db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      dense_features_found[d] = example_index;
419db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      if (example_dtype != config.dense[d].dtype) {
420db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        return errors::InvalidArgument(
421db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            "Name: ", example_name, ", Feature: ", config.dense[d].feature_name,
422db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            ".  Data types don't match. ", "Data type: ",
423db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            DataTypeString(example_dtype), "Expected type: ",
424db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            DataTypeString(config.dense[d].dtype));
425db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      }
426db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      const string& feature_name = config.dense[d].feature_name;
427db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      const TensorShape& shape = config.dense[d].shape;
428db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      Tensor& out = (*output_dense)[d];
429db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
430db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      const std::size_t num_elements = shape.num_elements();
431db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      const std::size_t offset = example_index * num_elements;
432db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
433db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      auto shape_error = [&](size_t size, StringPiece type_str) {
434db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        return errors::InvalidArgument(
435db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            "Name: ", example_name, ", Key: ", feature_name, ", Index: ",
436db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            example_index, ".  Number of ", type_str,
437db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            " values != expected.  "
438db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            "Values size: ",
439db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            size, " but output shape: ", shape.DebugString());
440db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      };
441db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
442db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      switch (config.dense[d].dtype) {
443db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        case DT_INT64: {
4447705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower          SmallVector<int64> list;
445db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          if (!feature.ParseInt64List(&list)) return parse_error(feature_name);
446db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          if (list.size() != num_elements) {
447db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            return shape_error(list.size(), "int64");
448db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          }
449db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          auto out_p = out.flat<int64>().data() + offset;
450db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          std::copy_n(list.begin(), list.size(), out_p);
451db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          break;
452db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        }
453db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        case DT_FLOAT: {
4547705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower          SmallVector<float> list;
455db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          if (!feature.ParseFloatList(&list)) return parse_error(feature_name);
456db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          if (list.size() != num_elements) {
457db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            return shape_error(list.size(), "float");
458db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          }
459db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          auto out_p = out.flat<float>().data() + offset;
460db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          std::copy_n(list.begin(), list.size(), out_p);
461db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          break;
462db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        }
463db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        case DT_STRING: {
4647705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower          SmallVector<string> list;
465db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          if (!feature.ParseBytesList(&list)) return parse_error(feature_name);
466db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          if (list.size() != num_elements) {
467db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            return shape_error(list.size(), "bytes");
468db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          }
469db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          auto out_p = out.flat<string>().data() + offset;
470db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          for (size_t i = 0; i < list.size(); ++i) {
471db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            out_p[i] = std::move(list[i]);
472db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          }
473db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          break;
474db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        }
475db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        default:
476db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          CHECK(false) << "Should not happen.";
477db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      }
478db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    } else {
479db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      // Handle sparse features.
480db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      sparse_features_found[d] = example_index;
481db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      const string& feature_name = config.sparse[d].feature_name;
482db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      SparseBuffer& out = (*output_sparse)[d];
483db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      DataType example_dtype;
484db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      TF_RETURN_IF_ERROR(feature.ParseDataType(&example_dtype));
485db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      if (example_dtype != DT_INVALID &&
486db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          example_dtype != config.sparse[d].dtype) {
487db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        return errors::InvalidArgument(
488db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            "Name: ", example_name, ", Feature: ",
489db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            config.sparse[d].feature_name, ".  Data types don't match. ",
490db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            "Expected type: ", DataTypeString(config.sparse[d].dtype));
491db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      }
492db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
493db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      switch (config.sparse[d].dtype) {
494db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        case DT_INT64: {
495db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          if (example_dtype != DT_INVALID) {
496db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            if (!feature.ParseInt64List(&out.int64_list)) {
497db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower              return parse_error(feature_name);
498db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            }
499db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          }
500db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          out.example_end_indices.push_back(out.int64_list.size());
501db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          break;
502db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        }
503db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        case DT_FLOAT: {
504db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          if (example_dtype != DT_INVALID) {
505db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            if (!feature.ParseFloatList(&out.float_list)) {
506db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower              return parse_error(feature_name);
507db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            }
508db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          }
509db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          out.example_end_indices.push_back(out.float_list.size());
510db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          break;
511db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        }
512db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        case DT_STRING: {
513db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          if (example_dtype != DT_INVALID) {
514db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            if (!feature.ParseBytesList(&out.bytes_list)) {
515db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower              return parse_error(feature_name);
516db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower            }
517db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          }
518db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          out.example_end_indices.push_back(out.bytes_list.size());
519db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          break;
520db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        }
521db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        default:
522db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          CHECK(false) << "Should not happen.";
523db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      }
524db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    }
525db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
526db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
527db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  // Handle missing dense features.
528db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  for (size_t d = 0; d < config.dense.size(); ++d) {
529db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    if (dense_features_found[d] == example_index) continue;
530db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    if (config.dense[d].default_value.NumElements() == 0) {
531db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      return errors::InvalidArgument("Name: ", example_name, ", Feature: ",
532db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                                     config.dense[d].feature_name,
533db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                                     " is required but could not be found.");
534db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    }
535db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
536db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    const Tensor& in = config.dense[d].default_value;
537db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    Tensor& out = (*output_dense)[d];
538db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    const std::size_t num_elements = in.shape().num_elements();
539db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    const std::size_t offset = example_index * num_elements;
540db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
541db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    switch (config.dense[d].dtype) {
542db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      case DT_INT64: {
543db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        std::copy_n(in.flat<int64>().data(), num_elements,
544db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                    out.flat<int64>().data() + offset);
545db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        break;
546db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      }
547db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      case DT_FLOAT: {
548db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        std::copy_n(in.flat<float>().data(), num_elements,
549db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                    out.flat<float>().data() + offset);
550db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        break;
551db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      }
552db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      case DT_STRING: {
553db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        std::copy_n(in.flat<string>().data(), num_elements,
554db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                    out.flat<string>().data() + offset);
555db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        break;
556db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      }
557db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      default:
558db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        CHECK(false) << "Should not happen.";
559db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    }
560db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
561db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
562db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  // Handle missing sparse features.
563db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  for (size_t d = 0; d < config.sparse.size(); ++d) {
564db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    if (sparse_features_found[d] == example_index) continue;
565db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    SparseBuffer& out = (*output_sparse)[d];
566db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    size_t prev_example_end_index =
567db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        out.example_end_indices.empty() ? 0 : out.example_end_indices.back();
568db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    out.example_end_indices.push_back(prev_example_end_index);
569db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
570db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
571db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  return Status::OK();
572db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower}
573db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
574db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerStatus CheckConfigDataType(DataType dtype) {
575db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  switch (dtype) {
576db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    case DT_INT64:
577db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    case DT_FLOAT:
578db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    case DT_STRING:
579db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      return Status::OK();
580db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    default:
581db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      return errors::InvalidArgument("Invalid config dtype: ",
582db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                                     DataTypeString(dtype));
583db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
584db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower}
585db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
586db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower}  // namespace
587db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
588db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerStatus FastParseExample(const Config& config,
589db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                        gtl::ArraySlice<string> serialized,
590db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                        gtl::ArraySlice<string> example_names,
591db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                        thread::ThreadPool* thread_pool, Result* result) {
592db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  DCHECK(result != nullptr);
593db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  // Check config so we can safely CHECK(false) in switches on config.*.dtype
594db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  for (auto& c : config.sparse) {
595db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    TF_RETURN_IF_ERROR(CheckConfigDataType(c.dtype));
596db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
597db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  for (auto& c : config.dense) {
598db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    TF_RETURN_IF_ERROR(CheckConfigDataType(c.dtype));
599db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
600db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
601db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  size_t config_size = config.dense.size() + config.sparse.size();
602db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  SeededHasher hasher;
603db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  // Build config index.
604db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  PresizedCuckooMap<std::pair<size_t, Type>> config_index(config_size);
605db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  bool ok = true;
606db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  for (size_t i = 0; i < 1000; ++i) {
607db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    for (size_t d = 0; d < config.dense.size(); ++d) {
608db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      ok &= config_index.InsertUnique(hasher(config.dense[d].feature_name),
609db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                                      {d, Type::Dense});
610db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    }
611db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    for (size_t d = 0; d < config.sparse.size(); ++d) {
612db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      ok &= config_index.InsertUnique(hasher(config.sparse[d].feature_name),
613db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                                      {d, Type::Sparse});
614db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    }
615db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    if (ok) break;
616db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    LOG(WARNING) << "Collision found. This should happen only if you have "
617db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                    "around 2^32 entries in your config.";
618db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    hasher.seed++;
619db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    config_index.Clear(config_size);
620db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
621db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  if (!ok) {
622db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    return errors::Internal(
623db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        "Could not avoid collision. This should not happen.");
624db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
625db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
626db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  // Allocate dense output (sparse have to be buffered).
627db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  for (size_t d = 0; d < config.dense.size(); ++d) {
628db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    TensorShape out_shape;
629db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    out_shape.AddDim(serialized.size());
630db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    for (const int64 dim : config.dense[d].shape.dim_sizes()) {
631db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      out_shape.AddDim(dim);
632db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    }
633db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    result->dense_values.emplace_back(config.dense[d].dtype, out_shape);
634db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
635db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
636db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  // This parameter affects performance in a big and data-dependent way.
6377705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  const size_t kMiniBatchSizeBytes = 50000;
638db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
6397705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  // Calculate number of minibatches.
6407705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  // In main regime make each minibatch around kMiniBatchSizeBytes bytes.
6417705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  // Apply 'special logic' below for small and big regimes.
6427705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  const size_t num_minibatches = [&] {
6437705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower    size_t result = 0;
644db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    size_t minibatch_bytes = 0;
645db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    for (size_t i = 0; i < serialized.size(); i++) {
646db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      if (minibatch_bytes == 0) {  // start minibatch
6477705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower        result++;
648db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      }
649db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      minibatch_bytes += serialized[i].size() + 1;
650db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      if (minibatch_bytes > kMiniBatchSizeBytes) {
651db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        minibatch_bytes = 0;
652db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      }
653db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    }
6547705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower    // 'special logic'
6557705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower    const size_t min_minibatches = std::min<size_t>(8, serialized.size());
6567705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower    const size_t max_minibatches = 64;
6577705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower    return std::max<size_t>(min_minibatches,
6587705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower                            std::min<size_t>(max_minibatches, result));
659db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }();
660db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
6617705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  auto first_example_of_minibatch = [&](size_t minibatch) -> size_t {
6627705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower    return (serialized.size() * minibatch) / num_minibatches;
6637705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  };
6647705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower
6657705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  // TODO(lew): A big performance low-hanging fruit here is to improve
6667705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  //   num_minibatches calculation to take into account actual amount of work
6677705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  //   needed, as the size in bytes is not perfect. Linear combination of
6687705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  //   size in bytes and average number of features per example is promising.
6697705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  //   Even better: measure time instead of estimating, but this is too costly
6707705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  //   in small batches.
6717705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower  //   Maybe accept outside parameter #num_minibatches?
672db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
673db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  // Do minibatches in parallel.
674db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  std::vector<std::vector<SparseBuffer>> sparse_buffers(num_minibatches);
675db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  std::vector<Status> status_of_minibatch(num_minibatches);
676db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  auto ProcessMiniBatch = [&](size_t minibatch) {
677db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    sparse_buffers[minibatch].resize(config.sparse.size());
6787705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower    size_t start = first_example_of_minibatch(minibatch);
6797705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower    size_t end = first_example_of_minibatch(minibatch + 1);
680db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    for (size_t e = start; e < end; ++e) {
681db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      status_of_minibatch[minibatch] = FastParseSerializedExample(
682db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          serialized[e],
683db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          (example_names.size() > 0 ? example_names[e] : "<unknown>"), e,
684db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          config, config_index, hasher, &result->dense_values,
685db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          &sparse_buffers[minibatch]);
686db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      if (!status_of_minibatch[minibatch].ok()) break;
687db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    }
688db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  };
689db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
690db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  ParallelFor(ProcessMiniBatch, num_minibatches, thread_pool);
691db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
692db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  for (Status& status : status_of_minibatch) {
693db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    TF_RETURN_IF_ERROR(status);
694db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
695db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
696db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  // Merge SparseBuffers from all minibatches for every config.sparse.
697db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  auto MergeMinibatches = [&](size_t d) {
698db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    // Loop over minibatches
699db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    size_t total_num_features = 0;
700db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    size_t max_num_features = 0;
701db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    for (auto& sparse_values_tmp : sparse_buffers) {
702db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      std::vector<size_t>& end_indices =
703db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          sparse_values_tmp[d].example_end_indices;
704db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      total_num_features += end_indices.back();
705db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      max_num_features = std::max(max_num_features, end_indices[0]);
706db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      for (size_t i = 1; i < end_indices.size(); ++i) {
707db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        size_t example_size = end_indices[i] - end_indices[i - 1];
708db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        max_num_features = std::max(max_num_features, example_size);
709db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      }
710db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    }
711db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
712db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    TensorShape indices_shape;
713db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    indices_shape.AddDim(total_num_features);
714db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    indices_shape.AddDim(2);
715db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    result->sparse_indices.emplace_back(DT_INT64, indices_shape);
716db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    Tensor* indices = &result->sparse_indices.back();
717db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
718db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    TensorShape values_shape;
719db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    values_shape.AddDim(total_num_features);
720db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    result->sparse_values.emplace_back(config.sparse[d].dtype, values_shape);
721db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    Tensor* values = &result->sparse_values.back();
722db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
723db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    result->sparse_shapes.emplace_back(DT_INT64, TensorShape({2}));
724db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    auto shapes_shape_t = result->sparse_shapes.back().vec<int64>();
725db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    shapes_shape_t(0) = serialized.size();
726db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    shapes_shape_t(1) = max_num_features;
727db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
728db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    size_t offset = 0;
729db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    for (size_t i = 0; i < sparse_buffers.size(); ++i) {
730db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      const SparseBuffer& buffer = sparse_buffers[i][d];
731db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
732db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      // Update indices.
733db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      int64* ix_p = &indices->matrix<int64>()(offset, 0);
734db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      size_t delta = 0;
7357705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower      size_t example_index = first_example_of_minibatch(i);
736db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      for (size_t example_end_index : buffer.example_end_indices) {
737db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        size_t feature_index = 0;
738db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        for (; delta < example_end_index; ++delta) {
739db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          // Column 0: example index
740db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          *ix_p = example_index;
741db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          // Column 1: the feature index buffer example
742db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          *(ix_p + 1) = feature_index;
743db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          ix_p += 2;
744db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          ++feature_index;
745db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        }
746db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        ++example_index;
747db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      }
748db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
749db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      // Copy values over.
750db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      switch (config.sparse[d].dtype) {
751db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        case DT_INT64: {
752db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          std::copy(buffer.int64_list.begin(), buffer.int64_list.end(),
753db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                    values->flat<int64>().data() + offset);
754db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          break;
755db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        }
756db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        case DT_FLOAT: {
757db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          std::copy(buffer.float_list.begin(), buffer.float_list.end(),
758db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                    values->flat<float>().data() + offset);
759db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          break;
760db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        }
761db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        case DT_STRING: {
762db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          std::move(buffer.bytes_list.begin(), buffer.bytes_list.end(),
763db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower                    values->flat<string>().data() + offset);
764db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          break;
765db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        }
766db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower        default:
767db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower          CHECK(false) << "Should not happen.";
768db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      }
769db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
770db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower      offset += delta;
771db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    }
772db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  };
773db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
774db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  for (size_t d = 0; d < config.sparse.size(); ++d) {
775db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower    MergeMinibatches(d);
776db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  }
777db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
778db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower  return Status::OK();
779db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower}
780db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower
781db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower}  // namespace example
782db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower}  // namespace tensorflow
783