example_proto_fast_parsing.cc revision 7705791619f5e851687e9a63b4315087e189f8be
1db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 3db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerLicensed under the Apache License, Version 2.0 (the "License"); 4db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFloweryou may not use this file except in compliance with the License. 5db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerYou may obtain a copy of the License at 6db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 7db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower http://www.apache.org/licenses/LICENSE-2.0 8db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 9db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerUnless required by applicable law or agreed to in writing, software 10db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerdistributed under the License is distributed on an "AS IS" BASIS, 11db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerSee the License for the specific language governing permissions and 13db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerlimitations under the License. 14db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower==============================================================================*/ 15db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/util/example_proto_fast_parsing.h" 16db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 17db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include <vector> 18db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 19db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/example/example.pb.h" 20db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/example/feature.pb_text.h" 21db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/framework/numeric_op.h" 22db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/framework/op_kernel.h" 23db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/framework/register_types.h" 24db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/lib/core/blocking_counter.h" 25db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/lib/core/casts.h" 26db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/lib/core/errors.h" 27db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/lib/core/threadpool.h" 287705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower#include "tensorflow/core/lib/gtl/inlined_vector.h" 29db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/platform/logging.h" 30db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/platform/protobuf.h" 31db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/util/presized_cuckoo_map.h" 32db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower#include "tensorflow/core/util/sparse/sparse_tensor.h" 33db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 34db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowernamespace tensorflow { 35db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowernamespace example { 36db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 37db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowernamespace { 387705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower 397705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlowertemplate <typename T> 407705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlowerusing SmallVector = gtl::InlinedVector<T, 4>; 417705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower 42db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowertemplate <typename A> 43db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerauto EnableAliasing(A* a) -> decltype(a->EnableAliasing(true), void()) { 44db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower a->EnableAliasing(true); 45db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower} 46db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 47db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowertemplate <typename A> 48db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowervoid EnableAliasing(A&& a) {} 49db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 50db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFloweruint8 PeekTag(protobuf::io::CodedInputStream* stream) { 51db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DCHECK(stream != nullptr); 52db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower const void* ptr; 53db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower int size; 54db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream->GetDirectBufferPointer(&ptr, &size)) return 0; 55db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return *static_cast<const uint8*>(ptr); 56db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower} 57db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 58db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerconstexpr uint8 kVarintTag(uint tag) { return (tag << 3) | 0; } 59db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerconstexpr uint8 kDelimitedTag(uint tag) { return (tag << 3) | 2; } 60db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerconstexpr uint8 kFixed32Tag(uint tag) { return (tag << 3) | 5; } 61db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 62db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowernamespace parsed { 63db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 64db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower// ParseDataType has to be called first, then appropriate ParseZzzzList. 65db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerclass Feature { 66db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower public: 67db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower Feature() {} 68db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower Feature(StringPiece serialized) : serialized_(serialized) {} 69db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 70db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower Status ParseDataType(DataType* dtype) { 71db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DCHECK(dtype != nullptr); 72db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (serialized_.empty()) { 73db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower *dtype = DT_INVALID; 74db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return Status::OK(); 75db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 76db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower uint8 oneof_tag = static_cast<uint8>(*serialized_.data()); 77db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower serialized_.remove_prefix(1); 78db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower switch (oneof_tag) { 79db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case kDelimitedTag(1): 80db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower *dtype = DT_STRING; 81db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower break; 82db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case kDelimitedTag(2): 83db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower *dtype = DT_FLOAT; 84db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower break; 85db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case kDelimitedTag(3): 86db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower *dtype = DT_INT64; 87db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower break; 88db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower default: 89db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return errors::InvalidArgument("Unsuported datatype."); 90db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 91db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return Status::OK(); 92db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 93db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 947705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower bool ParseBytesList(SmallVector<string>* bytes_list) { 95db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DCHECK(bytes_list != nullptr); 96db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower protobuf::io::CodedInputStream stream( 97db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower reinterpret_cast<const uint8*>(serialized_.data()), serialized_.size()); 98db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 99db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower EnableAliasing(&stream); 100db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 101db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower uint32 length; 102db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream.ReadVarint32(&length)) return false; 103db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower auto limit = stream.PushLimit(length); 104db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 105db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower while (!stream.ExpectAtEnd()) { 106db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream.ExpectTag(kDelimitedTag(1))) return false; 107db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // parse string 108db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower uint32 bytes_length; 109db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream.ReadVarint32(&bytes_length)) return false; 110db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower string bytes; 111db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream.ReadString(&bytes, bytes_length)) return false; 112db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower bytes_list->push_back(std::move(bytes)); 113db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 114db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower stream.PopLimit(limit); 115db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return true; 116db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 117db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 1187705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower bool ParseFloatList(SmallVector<float>* float_list) { 119db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DCHECK(float_list != nullptr); 120db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower protobuf::io::CodedInputStream stream( 121db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower reinterpret_cast<const uint8*>(serialized_.data()), serialized_.size()); 122db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower EnableAliasing(&stream); 123db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower uint32 length; 124db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream.ReadVarint32(&length)) return false; 125db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower auto limit = stream.PushLimit(length); 126db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 127db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream.ExpectAtEnd()) { 128db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower uint8 peek_tag = PeekTag(&stream); 129db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (peek_tag != kDelimitedTag(1) && peek_tag != kFixed32Tag(1)) { 130db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return false; 131db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 132db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 133db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (peek_tag == kDelimitedTag(1)) { // packed 134db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream.ExpectTag(kDelimitedTag(1))) return false; // packed tag 135db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower uint32 packed_length; 136db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream.ReadVarint32(&packed_length)) return false; 137db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower auto packed_limit = stream.PushLimit(packed_length); 138db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 139db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower while (!stream.ExpectAtEnd()) { 140db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower uint32 buffer32; 141db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream.ReadLittleEndian32(&buffer32)) return false; 142db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower float_list->push_back(bit_cast<float>(buffer32)); 143db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 144db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 145db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower stream.PopLimit(packed_limit); 146db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } else { // non-packed 147db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower while (!stream.ExpectAtEnd()) { 148db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream.ExpectTag(kFixed32Tag(1))) return false; 149db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower uint32 buffer32; 150db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream.ReadLittleEndian32(&buffer32)) return false; 151db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower float_list->push_back(bit_cast<float>(buffer32)); 152db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 153db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 154db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 155db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 156db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower stream.PopLimit(limit); 157db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return true; 158db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 159db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 1607705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower bool ParseInt64List(SmallVector<int64>* int64_list) { 161db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DCHECK(int64_list != nullptr); 162db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower protobuf::io::CodedInputStream stream( 163db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower reinterpret_cast<const uint8*>(serialized_.data()), serialized_.size()); 164db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower EnableAliasing(&stream); 165db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower uint32 length; 166db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream.ReadVarint32(&length)) return false; 167db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower auto limit = stream.PushLimit(length); 168db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 169db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream.ExpectAtEnd()) { 170db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower uint8 peek_tag = PeekTag(&stream); 171db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (peek_tag != kDelimitedTag(1) && peek_tag != kVarintTag(1)) { 172db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return false; 173db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 174db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (peek_tag == kDelimitedTag(1)) { // packed 175db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream.ExpectTag(kDelimitedTag(1))) return false; // packed tag 176db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower uint32 packed_length; 177db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream.ReadVarint32(&packed_length)) return false; 178db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower auto packed_limit = stream.PushLimit(packed_length); 179db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 180db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower while (!stream.ExpectAtEnd()) { 181967376bdf3ae9007f8b4c996a4a260a911dfc409A. Unique TensorFlower protobuf_uint64 n; // There is no API for int64 182db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream.ReadVarint64(&n)) return false; 183db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower int64_list->push_back(n); 184db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 185db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 186db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower stream.PopLimit(packed_limit); 187db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } else { // non-packed 188db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower while (!stream.ExpectAtEnd()) { 189db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream.ExpectTag(kVarintTag(1))) return false; 190967376bdf3ae9007f8b4c996a4a260a911dfc409A. Unique TensorFlower protobuf_uint64 n; // There is no API for int64 191db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream.ReadVarint64(&n)) return false; 192db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower int64_list->push_back(n); 193db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 194db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 195db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 196db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower stream.PopLimit(limit); 197db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return true; 198db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 199db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 200db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower StringPiece GetSerialized() const { return serialized_; } 201db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 202db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower private: 203db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // TODO(lew): Pair of uint8* would be more natural. 204db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower StringPiece serialized_; 205db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower}; 206db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 207db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerusing FeatureMapEntry = std::pair<StringPiece, Feature>; 208db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerusing Example = std::vector<FeatureMapEntry>; 209db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 210db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower} // namespace parsed 211db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 212db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerbool ParseString(protobuf::io::CodedInputStream* stream, StringPiece* result) { 213db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DCHECK(stream != nullptr); 214db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DCHECK(result != nullptr); 215db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower uint32 length; 216db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream->ReadVarint32(&length)) return false; 217db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (length == 0) { 218db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower *result = StringPiece(nullptr, 0); 219db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return true; 220db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 221db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower const void* stream_alias; 222db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower int stream_size; 223db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream->GetDirectBufferPointer(&stream_alias, &stream_size)) { 224db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return false; 225db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 226db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (static_cast<uint32>(stream_size) < length) return false; 227db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower *result = StringPiece(static_cast<const char*>(stream_alias), length); 228db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower stream->Skip(length); 229db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return true; 230db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower} 231db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 232db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerbool ParseFeatureMapEntry(protobuf::io::CodedInputStream* stream, 233db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower parsed::FeatureMapEntry* feature_map_entry) { 234db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DCHECK(stream != nullptr); 235db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DCHECK(feature_map_entry != nullptr); 236db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower uint32 length; 237db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream->ReadVarint32(&length)) return false; 238db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower auto limit = stream->PushLimit(length); 239db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream->ExpectTag(kDelimitedTag(1))) return false; 240db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!ParseString(stream, &feature_map_entry->first)) return false; 241db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream->ExpectTag(kDelimitedTag(2))) return false; 242db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower StringPiece feature_string_piece; 243db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!ParseString(stream, &feature_string_piece)) return false; 244db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower feature_map_entry->second = parsed::Feature(feature_string_piece); 245db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream->ExpectAtEnd()) return false; 246db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower stream->PopLimit(limit); 247db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return true; 248db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower} 249db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 250db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerbool ParseFeatures(protobuf::io::CodedInputStream* stream, 251db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower parsed::Example* example) { 252db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DCHECK(stream != nullptr); 253db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DCHECK(example != nullptr); 254db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower uint32 length; 255db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream->ReadVarint32(&length)) return false; 256db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower auto limit = stream->PushLimit(length); 257db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower while (!stream->ExpectAtEnd()) { 258db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower parsed::FeatureMapEntry feature_map_entry; 259db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream->ExpectTag(kDelimitedTag(1))) return false; 260db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!ParseFeatureMapEntry(stream, &feature_map_entry)) return false; 261db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower example->push_back(std::move(feature_map_entry)); 262db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 263db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower stream->PopLimit(limit); 264db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return true; 265db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower} 266db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 267db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerbool ParseExample(protobuf::io::CodedInputStream* stream, 268db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower parsed::Example* example) { 269db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DCHECK(stream != nullptr); 270db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DCHECK(example != nullptr); 271db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (stream->ExpectTag(kDelimitedTag(1))) { 272db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!ParseFeatures(stream, example)) return false; 273db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 274db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!stream->ExpectAtEnd()) return false; 275db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return true; 276db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower} 277db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 278db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerbool ParseExample(StringPiece serialized, parsed::Example* example) { 279db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DCHECK(example != nullptr); 280db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower protobuf::io::CodedInputStream stream( 281db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower reinterpret_cast<const uint8*>(serialized.data()), serialized.size()); 282db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower EnableAliasing(&stream); 283db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return ParseExample(&stream, example); 284db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower} 285db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 286db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower} // namespace 287db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 288db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerbool TestFastParse(const string& serialized, Example* example) { 289db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DCHECK(example != nullptr); 290db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower parsed::Example parsed_example; 291db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!ParseExample(serialized, &parsed_example)) return false; 292db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower auto& features = *example->mutable_features(); 293db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (parsed::FeatureMapEntry& entry : parsed_example) { 294db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower auto& value = (*features.mutable_feature())[entry.first.ToString()]; 295db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DataType dtype; 296db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!entry.second.ParseDataType(&dtype).ok()) return false; 297db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower switch (dtype) { 298db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case DT_INVALID: 299db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower break; 300db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case DT_STRING: { 3017705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower SmallVector<string> list; 302db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!entry.second.ParseBytesList(&list)) return false; 303db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower auto* result_list = value.mutable_bytes_list(); 304db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (auto& bytes : list) { 305db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower result_list->add_value(std::move(bytes)); 306db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 307db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower break; 308db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 309db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case DT_FLOAT: { 3107705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower SmallVector<float> list; 311db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!entry.second.ParseFloatList(&list)) return false; 312db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower auto* result_list = value.mutable_float_list(); 313db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (float f : list) { 314db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower result_list->add_value(f); 315db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 316db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower break; 317db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 318db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case DT_INT64: { 3197705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower SmallVector<int64> list; 320db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!entry.second.ParseInt64List(&list)) return false; 321db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower auto* result_list = value.mutable_int64_list(); 322db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (int64 i : list) { 323db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower result_list->add_value(i); 324db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 325db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower break; 326db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 327db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower default: 328db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower CHECK(false) << "Should not happen."; 329db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 330db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 331db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return true; 332db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower} 333db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 334db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower// ----------------------------------------------------------------------------- 335db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 336db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowernamespace { 337db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 338db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerusing Config = FastParseExampleConfig; 339db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 340db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowervoid ParallelFor(const std::function<void(size_t)>& f, size_t n, 341db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower thread::ThreadPool* thread_pool) { 342db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (n == 0) return; 3437705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower if (thread_pool == nullptr) { 3447705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower for (size_t i = 0; i < n; ++i) { 345db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower f(i); 3467705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower } 3477705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower } else { 3487705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower BlockingCounter counter(n - 1); 3497705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower for (size_t i = 1; i < n; ++i) { 3507705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower thread_pool->Schedule([i, &f, &counter] { 3517705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower f(i); 3527705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower counter.DecrementCount(); 3537705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower }); 3547705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower } 3557705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower f(0); 3567705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower counter.Wait(); 357db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 358db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower} 359db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 360db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerenum class Type { Sparse, Dense }; 361db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 362db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerstruct SparseBuffer { 363db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // Features are in one of the 3 vectors below depending on config's dtype. 364db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // Other 2 vectors remain empty. 3657705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower SmallVector<string> bytes_list; 3667705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower SmallVector<float> float_list; 3677705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower SmallVector<int64> int64_list; 368db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 369db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // Features of example i are elements with indices 370db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // from example_end_indices[i-1] to example_end_indices[i]-1 on the 371db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // appropriate xxxxx_list 372db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower std::vector<size_t> example_end_indices; 373db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower}; 374db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 375db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerstruct SeededHasher { 376db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower uint64 operator()(StringPiece s) const { 377db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return Hash64(s.data(), s.size(), seed); 378db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 379db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower uint64 seed{0xDECAFCAFFE}; 380db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower}; 381db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 382db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerStatus FastParseSerializedExample( 383db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower const string& serialized_example, const string& example_name, 384db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower const size_t example_index, const Config& config, 385db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower const PresizedCuckooMap<std::pair<size_t, Type>>& config_index, 386db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower SeededHasher hasher, std::vector<Tensor>* output_dense, 387db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower std::vector<SparseBuffer>* output_sparse) { 388db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DCHECK(output_dense != nullptr); 389db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DCHECK(output_sparse != nullptr); 390db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower parsed::Example parsed_example; 391db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!ParseExample(serialized_example, &parsed_example)) { 392db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return errors::InvalidArgument("Could not parse example input, value: '", 393db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower serialized_example, "'"); 394db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 395db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower constexpr size_t kMax = std::numeric_limits<size_t>::max(); 396db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower std::vector<size_t> sparse_features_found(config.sparse.size(), kMax); 397db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower std::vector<size_t> dense_features_found(config.dense.size(), kMax); 398db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 399db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // Handle features present in the example. 400db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (parsed::FeatureMapEntry& name_and_feature : parsed_example) { 401db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower parsed::Feature& feature = name_and_feature.second; 402db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower std::pair<size_t, Type> d_and_type; 403db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower uint64 h = hasher(name_and_feature.first); 404db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!config_index.Find(h, &d_and_type)) continue; 405db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower size_t d = d_and_type.first; 406db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 407db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower auto parse_error = [&](StringPiece feature_name) { 408db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return errors::InvalidArgument("Name: ", example_name, ", Key: ", 409db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower feature_name, ", Index: ", example_index, 410db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower ". Can't parse serialized Example."); 411db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower }; 412db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 413db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (d_and_type.second == Type::Dense) { 414db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DataType example_dtype; 415db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower TF_RETURN_IF_ERROR(feature.ParseDataType(&example_dtype)); 416db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (example_dtype == DT_INVALID) continue; 417db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 418db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower dense_features_found[d] = example_index; 419db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (example_dtype != config.dense[d].dtype) { 420db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return errors::InvalidArgument( 421db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower "Name: ", example_name, ", Feature: ", config.dense[d].feature_name, 422db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower ". Data types don't match. ", "Data type: ", 423db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DataTypeString(example_dtype), "Expected type: ", 424db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DataTypeString(config.dense[d].dtype)); 425db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 426db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower const string& feature_name = config.dense[d].feature_name; 427db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower const TensorShape& shape = config.dense[d].shape; 428db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower Tensor& out = (*output_dense)[d]; 429db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 430db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower const std::size_t num_elements = shape.num_elements(); 431db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower const std::size_t offset = example_index * num_elements; 432db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 433db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower auto shape_error = [&](size_t size, StringPiece type_str) { 434db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return errors::InvalidArgument( 435db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower "Name: ", example_name, ", Key: ", feature_name, ", Index: ", 436db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower example_index, ". Number of ", type_str, 437db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower " values != expected. " 438db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower "Values size: ", 439db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower size, " but output shape: ", shape.DebugString()); 440db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower }; 441db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 442db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower switch (config.dense[d].dtype) { 443db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case DT_INT64: { 4447705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower SmallVector<int64> list; 445db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!feature.ParseInt64List(&list)) return parse_error(feature_name); 446db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (list.size() != num_elements) { 447db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return shape_error(list.size(), "int64"); 448db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 449db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower auto out_p = out.flat<int64>().data() + offset; 450db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower std::copy_n(list.begin(), list.size(), out_p); 451db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower break; 452db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 453db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case DT_FLOAT: { 4547705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower SmallVector<float> list; 455db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!feature.ParseFloatList(&list)) return parse_error(feature_name); 456db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (list.size() != num_elements) { 457db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return shape_error(list.size(), "float"); 458db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 459db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower auto out_p = out.flat<float>().data() + offset; 460db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower std::copy_n(list.begin(), list.size(), out_p); 461db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower break; 462db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 463db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case DT_STRING: { 4647705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower SmallVector<string> list; 465db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!feature.ParseBytesList(&list)) return parse_error(feature_name); 466db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (list.size() != num_elements) { 467db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return shape_error(list.size(), "bytes"); 468db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 469db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower auto out_p = out.flat<string>().data() + offset; 470db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (size_t i = 0; i < list.size(); ++i) { 471db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower out_p[i] = std::move(list[i]); 472db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 473db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower break; 474db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 475db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower default: 476db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower CHECK(false) << "Should not happen."; 477db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 478db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } else { 479db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // Handle sparse features. 480db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower sparse_features_found[d] = example_index; 481db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower const string& feature_name = config.sparse[d].feature_name; 482db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower SparseBuffer& out = (*output_sparse)[d]; 483db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DataType example_dtype; 484db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower TF_RETURN_IF_ERROR(feature.ParseDataType(&example_dtype)); 485db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (example_dtype != DT_INVALID && 486db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower example_dtype != config.sparse[d].dtype) { 487db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return errors::InvalidArgument( 488db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower "Name: ", example_name, ", Feature: ", 489db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower config.sparse[d].feature_name, ". Data types don't match. ", 490db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower "Expected type: ", DataTypeString(config.sparse[d].dtype)); 491db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 492db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 493db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower switch (config.sparse[d].dtype) { 494db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case DT_INT64: { 495db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (example_dtype != DT_INVALID) { 496db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!feature.ParseInt64List(&out.int64_list)) { 497db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return parse_error(feature_name); 498db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 499db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 500db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower out.example_end_indices.push_back(out.int64_list.size()); 501db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower break; 502db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 503db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case DT_FLOAT: { 504db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (example_dtype != DT_INVALID) { 505db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!feature.ParseFloatList(&out.float_list)) { 506db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return parse_error(feature_name); 507db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 508db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 509db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower out.example_end_indices.push_back(out.float_list.size()); 510db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower break; 511db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 512db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case DT_STRING: { 513db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (example_dtype != DT_INVALID) { 514db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!feature.ParseBytesList(&out.bytes_list)) { 515db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return parse_error(feature_name); 516db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 517db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 518db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower out.example_end_indices.push_back(out.bytes_list.size()); 519db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower break; 520db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 521db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower default: 522db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower CHECK(false) << "Should not happen."; 523db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 524db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 525db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 526db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 527db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // Handle missing dense features. 528db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (size_t d = 0; d < config.dense.size(); ++d) { 529db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (dense_features_found[d] == example_index) continue; 530db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (config.dense[d].default_value.NumElements() == 0) { 531db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return errors::InvalidArgument("Name: ", example_name, ", Feature: ", 532db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower config.dense[d].feature_name, 533db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower " is required but could not be found."); 534db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 535db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 536db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower const Tensor& in = config.dense[d].default_value; 537db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower Tensor& out = (*output_dense)[d]; 538db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower const std::size_t num_elements = in.shape().num_elements(); 539db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower const std::size_t offset = example_index * num_elements; 540db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 541db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower switch (config.dense[d].dtype) { 542db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case DT_INT64: { 543db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower std::copy_n(in.flat<int64>().data(), num_elements, 544db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower out.flat<int64>().data() + offset); 545db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower break; 546db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 547db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case DT_FLOAT: { 548db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower std::copy_n(in.flat<float>().data(), num_elements, 549db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower out.flat<float>().data() + offset); 550db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower break; 551db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 552db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case DT_STRING: { 553db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower std::copy_n(in.flat<string>().data(), num_elements, 554db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower out.flat<string>().data() + offset); 555db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower break; 556db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 557db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower default: 558db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower CHECK(false) << "Should not happen."; 559db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 560db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 561db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 562db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // Handle missing sparse features. 563db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (size_t d = 0; d < config.sparse.size(); ++d) { 564db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (sparse_features_found[d] == example_index) continue; 565db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower SparseBuffer& out = (*output_sparse)[d]; 566db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower size_t prev_example_end_index = 567db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower out.example_end_indices.empty() ? 0 : out.example_end_indices.back(); 568db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower out.example_end_indices.push_back(prev_example_end_index); 569db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 570db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 571db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return Status::OK(); 572db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower} 573db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 574db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerStatus CheckConfigDataType(DataType dtype) { 575db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower switch (dtype) { 576db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case DT_INT64: 577db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case DT_FLOAT: 578db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case DT_STRING: 579db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return Status::OK(); 580db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower default: 581db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return errors::InvalidArgument("Invalid config dtype: ", 582db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DataTypeString(dtype)); 583db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 584db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower} 585db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 586db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower} // namespace 587db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 588db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlowerStatus FastParseExample(const Config& config, 589db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower gtl::ArraySlice<string> serialized, 590db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower gtl::ArraySlice<string> example_names, 591db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower thread::ThreadPool* thread_pool, Result* result) { 592db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower DCHECK(result != nullptr); 593db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // Check config so we can safely CHECK(false) in switches on config.*.dtype 594db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (auto& c : config.sparse) { 595db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower TF_RETURN_IF_ERROR(CheckConfigDataType(c.dtype)); 596db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 597db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (auto& c : config.dense) { 598db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower TF_RETURN_IF_ERROR(CheckConfigDataType(c.dtype)); 599db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 600db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 601db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower size_t config_size = config.dense.size() + config.sparse.size(); 602db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower SeededHasher hasher; 603db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // Build config index. 604db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower PresizedCuckooMap<std::pair<size_t, Type>> config_index(config_size); 605db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower bool ok = true; 606db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (size_t i = 0; i < 1000; ++i) { 607db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (size_t d = 0; d < config.dense.size(); ++d) { 608db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower ok &= config_index.InsertUnique(hasher(config.dense[d].feature_name), 609db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower {d, Type::Dense}); 610db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 611db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (size_t d = 0; d < config.sparse.size(); ++d) { 612db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower ok &= config_index.InsertUnique(hasher(config.sparse[d].feature_name), 613db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower {d, Type::Sparse}); 614db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 615db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (ok) break; 616db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower LOG(WARNING) << "Collision found. This should happen only if you have " 617db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower "around 2^32 entries in your config."; 618db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower hasher.seed++; 619db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower config_index.Clear(config_size); 620db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 621db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!ok) { 622db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return errors::Internal( 623db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower "Could not avoid collision. This should not happen."); 624db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 625db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 626db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // Allocate dense output (sparse have to be buffered). 627db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (size_t d = 0; d < config.dense.size(); ++d) { 628db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower TensorShape out_shape; 629db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower out_shape.AddDim(serialized.size()); 630db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (const int64 dim : config.dense[d].shape.dim_sizes()) { 631db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower out_shape.AddDim(dim); 632db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 633db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower result->dense_values.emplace_back(config.dense[d].dtype, out_shape); 634db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 635db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 636db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // This parameter affects performance in a big and data-dependent way. 6377705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower const size_t kMiniBatchSizeBytes = 50000; 638db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 6397705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower // Calculate number of minibatches. 6407705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower // In main regime make each minibatch around kMiniBatchSizeBytes bytes. 6417705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower // Apply 'special logic' below for small and big regimes. 6427705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower const size_t num_minibatches = [&] { 6437705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower size_t result = 0; 644db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower size_t minibatch_bytes = 0; 645db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (size_t i = 0; i < serialized.size(); i++) { 646db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (minibatch_bytes == 0) { // start minibatch 6477705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower result++; 648db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 649db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower minibatch_bytes += serialized[i].size() + 1; 650db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (minibatch_bytes > kMiniBatchSizeBytes) { 651db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower minibatch_bytes = 0; 652db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 653db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 6547705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower // 'special logic' 6557705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower const size_t min_minibatches = std::min<size_t>(8, serialized.size()); 6567705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower const size_t max_minibatches = 64; 6577705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower return std::max<size_t>(min_minibatches, 6587705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower std::min<size_t>(max_minibatches, result)); 659db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower }(); 660db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 6617705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower auto first_example_of_minibatch = [&](size_t minibatch) -> size_t { 6627705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower return (serialized.size() * minibatch) / num_minibatches; 6637705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower }; 6647705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower 6657705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower // TODO(lew): A big performance low-hanging fruit here is to improve 6667705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower // num_minibatches calculation to take into account actual amount of work 6677705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower // needed, as the size in bytes is not perfect. Linear combination of 6687705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower // size in bytes and average number of features per example is promising. 6697705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower // Even better: measure time instead of estimating, but this is too costly 6707705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower // in small batches. 6717705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower // Maybe accept outside parameter #num_minibatches? 672db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 673db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // Do minibatches in parallel. 674db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower std::vector<std::vector<SparseBuffer>> sparse_buffers(num_minibatches); 675db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower std::vector<Status> status_of_minibatch(num_minibatches); 676db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower auto ProcessMiniBatch = [&](size_t minibatch) { 677db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower sparse_buffers[minibatch].resize(config.sparse.size()); 6787705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower size_t start = first_example_of_minibatch(minibatch); 6797705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower size_t end = first_example_of_minibatch(minibatch + 1); 680db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (size_t e = start; e < end; ++e) { 681db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower status_of_minibatch[minibatch] = FastParseSerializedExample( 682db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower serialized[e], 683db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower (example_names.size() > 0 ? example_names[e] : "<unknown>"), e, 684db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower config, config_index, hasher, &result->dense_values, 685db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower &sparse_buffers[minibatch]); 686db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower if (!status_of_minibatch[minibatch].ok()) break; 687db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 688db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower }; 689db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 690db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower ParallelFor(ProcessMiniBatch, num_minibatches, thread_pool); 691db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 692db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (Status& status : status_of_minibatch) { 693db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower TF_RETURN_IF_ERROR(status); 694db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 695db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 696db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // Merge SparseBuffers from all minibatches for every config.sparse. 697db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower auto MergeMinibatches = [&](size_t d) { 698db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // Loop over minibatches 699db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower size_t total_num_features = 0; 700db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower size_t max_num_features = 0; 701db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (auto& sparse_values_tmp : sparse_buffers) { 702db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower std::vector<size_t>& end_indices = 703db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower sparse_values_tmp[d].example_end_indices; 704db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower total_num_features += end_indices.back(); 705db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower max_num_features = std::max(max_num_features, end_indices[0]); 706db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (size_t i = 1; i < end_indices.size(); ++i) { 707db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower size_t example_size = end_indices[i] - end_indices[i - 1]; 708db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower max_num_features = std::max(max_num_features, example_size); 709db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 710db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 711db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 712db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower TensorShape indices_shape; 713db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower indices_shape.AddDim(total_num_features); 714db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower indices_shape.AddDim(2); 715db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower result->sparse_indices.emplace_back(DT_INT64, indices_shape); 716db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower Tensor* indices = &result->sparse_indices.back(); 717db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 718db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower TensorShape values_shape; 719db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower values_shape.AddDim(total_num_features); 720db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower result->sparse_values.emplace_back(config.sparse[d].dtype, values_shape); 721db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower Tensor* values = &result->sparse_values.back(); 722db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 723db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower result->sparse_shapes.emplace_back(DT_INT64, TensorShape({2})); 724db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower auto shapes_shape_t = result->sparse_shapes.back().vec<int64>(); 725db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower shapes_shape_t(0) = serialized.size(); 726db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower shapes_shape_t(1) = max_num_features; 727db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 728db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower size_t offset = 0; 729db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (size_t i = 0; i < sparse_buffers.size(); ++i) { 730db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower const SparseBuffer& buffer = sparse_buffers[i][d]; 731db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 732db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // Update indices. 733db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower int64* ix_p = &indices->matrix<int64>()(offset, 0); 734db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower size_t delta = 0; 7357705791619f5e851687e9a63b4315087e189f8beA. Unique TensorFlower size_t example_index = first_example_of_minibatch(i); 736db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (size_t example_end_index : buffer.example_end_indices) { 737db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower size_t feature_index = 0; 738db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (; delta < example_end_index; ++delta) { 739db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // Column 0: example index 740db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower *ix_p = example_index; 741db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // Column 1: the feature index buffer example 742db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower *(ix_p + 1) = feature_index; 743db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower ix_p += 2; 744db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower ++feature_index; 745db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 746db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower ++example_index; 747db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 748db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 749db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower // Copy values over. 750db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower switch (config.sparse[d].dtype) { 751db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case DT_INT64: { 752db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower std::copy(buffer.int64_list.begin(), buffer.int64_list.end(), 753db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower values->flat<int64>().data() + offset); 754db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower break; 755db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 756db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case DT_FLOAT: { 757db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower std::copy(buffer.float_list.begin(), buffer.float_list.end(), 758db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower values->flat<float>().data() + offset); 759db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower break; 760db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 761db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower case DT_STRING: { 762db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower std::move(buffer.bytes_list.begin(), buffer.bytes_list.end(), 763db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower values->flat<string>().data() + offset); 764db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower break; 765db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 766db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower default: 767db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower CHECK(false) << "Should not happen."; 768db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 769db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 770db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower offset += delta; 771db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 772db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower }; 773db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 774db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower for (size_t d = 0; d < config.sparse.size(); ++d) { 775db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower MergeMinibatches(d); 776db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower } 777db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 778db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower return Status::OK(); 779db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower} 780db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower 781db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower} // namespace example 782db7bdab6e586e02051556d9f36a7887500378cf9A. Unique TensorFlower} // namespace tensorflow 783