1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_
17 #define TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_
18 
19 #include <string>
20 #include <unordered_map>
21 #include <vector>
22 
23 #include "tensorflow/core/example/example.pb.h"
24 #include "tensorflow/core/framework/allocator.h"
25 #include "tensorflow/core/framework/graph.pb.h"
26 #include "tensorflow/core/framework/op_kernel.h"
27 #include "tensorflow/core/framework/partial_tensor_shape.h"
28 #include "tensorflow/core/framework/tensor.h"
29 #include "tensorflow/core/framework/types.h"
30 #include "tensorflow/core/lib/gtl/array_slice.h"
31 #include "tensorflow/core/platform/types.h"
32 #include "tensorflow/core/util/sparse/sparse_tensor.h"
33 
34 namespace tensorflow {
35 namespace example {
36 
37 // FastParseExampleConfig defines how to parse features in Example.
38 // Each sub-config is responsible for one feature identified with feature_name.
39 // FastParseExampleConfig can't have two sub-configs with the same feature_name.
40 // dtype identifies the type of output vector and the kind of Feature expected
41 // in Example.
42 struct FastParseExampleConfig {
43   struct Dense {
DenseFastParseExampleConfig::Dense44     Dense(StringPiece feature_name, DataType dtype, PartialTensorShape shape,
45           Tensor default_value, bool variable_length,
46           std::size_t elements_per_stride)
47         : feature_name(feature_name),  // TODO(mrry): Switch to preallocated
48                                        // tstring when this is available.
49           dtype(dtype),
50           shape(std::move(shape)),
51           default_value(std::move(default_value)),
52           variable_length(variable_length),
53           elements_per_stride(elements_per_stride) {}
54     Dense() = default;
55 
56     tstring feature_name;
57     DataType dtype;
58     // These 2 fields correspond exactly to dense_shapes and dense_defaults in
59     // ParseExample op.
60     // Documentation is available in: tensorflow/core/ops/parsing_ops.cc
61     PartialTensorShape shape;
62     Tensor default_value;
63     bool variable_length;
64     std::size_t elements_per_stride;
65   };
66 
67   struct Sparse {
SparseFastParseExampleConfig::Sparse68     Sparse(StringPiece feature_name, DataType dtype)
69         : feature_name(feature_name),  // TODO(mrry): Switch to preallocated
70                                        // tstring when this is available.
71           dtype(dtype) {}
72     Sparse() = default;
73 
74     tstring feature_name;
75     DataType dtype;
76   };
77 
78   struct Ragged {
RaggedFastParseExampleConfig::Ragged79     Ragged(StringPiece feature_name, DataType dtype, DataType splits_dtype)
80         : feature_name(feature_name),  // TODO(mrry): Switch to preallocated
81                                        // tstring when this is available.
82           dtype(dtype),
83           splits_dtype(splits_dtype) {}
84     Ragged() = default;
85 
86     tstring feature_name;
87     DataType dtype;
88     DataType splits_dtype;
89   };
90 
91   std::vector<Dense> dense;
92   std::vector<Sparse> sparse;
93   std::vector<Ragged> ragged;
94 
95   // If `true`, `Result::feature_stats` will contain one
96   // `PerExampleFeatureStats` for each serialized example in the input.
97   bool collect_feature_stats = false;
98 };
99 
// Per-example feature statistics reported for each example passed to
// `FastParse[Single]Example()`.
//
// TODO(b/111553342): The collected numbers have two known limitations:
// * A feature name that occurs more than once is counted once per occurrence.
// * Only features requested in the `FastParseExampleConfig` contribute to the
//   feature values count.
// Both could be lifted with additional work at runtime.
struct PerExampleFeatureStats {
  // How many feature names the example contains.
  std::size_t features_count{0};

  // Total number of values summed over every feature that is parsed.
  std::size_t feature_values_count{0};
};
115 
// This is exactly the output of TF's ParseExample Op.
// Documentation is available in: tensorflow/core/ops/parsing_ops.cc
//
// The parsing functions declared in this header take a `Result*` and convert
// the serialized input into it according to the supplied config.
// NOTE(review): presumably each vector holds one entry per configured feature
// of the matching kind (sparse / dense / ragged), in config order -- confirm
// against the implementation.
struct Result {
  // Outputs for features configured as sparse.
  std::vector<Tensor> sparse_indices;
  std::vector<Tensor> sparse_values;
  std::vector<Tensor> sparse_shapes;
  // Outputs for features configured as dense.
  std::vector<Tensor> dense_values;
  // Outputs for features configured as ragged.
  std::vector<Tensor> ragged_values;
  std::vector<Tensor> ragged_splits;
  std::vector<Tensor> ragged_outer_splits;  // For SequenceExamples

  // This vector will be populated with one element per example if
  // `FastParseExampleConfig::collect_feature_stats` is set to `true`.
  std::vector<PerExampleFeatureStats> feature_stats;
};
131 
// Parses a batch of serialized Example protos and converts them into result
// according to given config.
// Given example names have to either be empty or the same size as serialized.
// example_names are used only for error messages.
//
// Args:
//   config: describes which features to parse and how.
//   serialized: the batch of serialized Example protos.
//   example_names: either empty or one name per entry of `serialized`.
//   thread_pool: NOTE(review): presumably used to parallelize parsing across
//     the batch; whether nullptr is accepted is not visible here -- confirm.
//   result: receives the parsed output.
// Returns a Status; NOTE(review): presumably non-OK when parsing fails.
Status FastParseExample(const FastParseExampleConfig& config,
                        gtl::ArraySlice<tstring> serialized,
                        gtl::ArraySlice<tstring> example_names,
                        thread::ThreadPool* thread_pool, Result* result);
140 
141 // TODO(mrry): Move the hash table construction into the config object.
142 typedef FastParseExampleConfig FastParseSingleExampleConfig;
143 
144 Status FastParseSingleExample(const FastParseSingleExampleConfig& config,
145                               StringPiece serialized, Result* result);
146 
// Parses a batch of serialized SequenceExample protos and converts them into
// result according to given config.
// Given example names have to either be empty or the same size as serialized.
// example_names are used only for error messages.
//
// `context_config` / `context_result` and `feature_list_config` /
// `feature_list_result` are separate config/output pairs for the two parts of
// a SequenceExample. `dense_feature_lengths` is an additional output vector.
//
// `is_batch` defaults to true.
// NOTE(review): the previous comment here read "(If batch=true, then this
// parses a single SequenceExample.)". There is no parameter named `batch`, and
// the statement looks inverted: presumably `is_batch=false` is the
// single-SequenceExample mode -- confirm against the implementation.
Status FastParseSequenceExample(
    const example::FastParseExampleConfig& context_config,
    const example::FastParseExampleConfig& feature_list_config,
    gtl::ArraySlice<tstring> serialized, gtl::ArraySlice<tstring> example_names,
    thread::ThreadPool* thread_pool, example::Result* context_result,
    example::Result* feature_list_result,
    std::vector<Tensor>* dense_feature_lengths, bool is_batch = true);
159 
// This function parses serialized Example and populates given example.
// It uses the same specialized parser as FastParseExample which is efficient.
// But then constructs Example which is relatively slow.
// It is exported here as a convenient API to test parser part separately.
//
// Args:
//   serialized: the serialized Example to parse.
//   example: output proto that is populated from `serialized`.
// NOTE(review): the bool return presumably signals whether parsing succeeded
// -- confirm against the implementation.
bool TestFastParse(const string& serialized, Example* example);
165 
166 }  // namespace example
167 }  // namespace tensorflow
168 
169 #endif  // TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_
170