1# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15 16"""Parsing Ops.""" 17from __future__ import absolute_import 18from __future__ import division 19from __future__ import print_function 20 21from tensorflow.python.framework import ops 22from tensorflow.python.framework import sparse_tensor 23from tensorflow.python.ops import array_ops 24from tensorflow.python.ops import control_flow_ops 25from tensorflow.python.ops import gen_parsing_ops 26from tensorflow.python.ops import math_ops 27from tensorflow.python.ops import parsing_config 28# go/tf-wildcard-import 29# pylint: disable=wildcard-import,undefined-variable 30from tensorflow.python.ops.gen_parsing_ops import * 31# pylint: enable=wildcard-import,undefined-variable 32from tensorflow.python.util import deprecation 33from tensorflow.python.util import dispatch 34from tensorflow.python.util.tf_export import tf_export 35 36 37ops.NotDifferentiable("DecodeRaw") 38ops.NotDifferentiable("DecodePaddedRaw") 39ops.NotDifferentiable("ParseTensor") 40ops.NotDifferentiable("SerializeTensor") 41ops.NotDifferentiable("StringToNumber") 42 43 44VarLenFeature = parsing_config.VarLenFeature 45RaggedFeature = parsing_config.RaggedFeature 46SparseFeature = parsing_config.SparseFeature 47FixedLenFeature = parsing_config.FixedLenFeature 48FixedLenSequenceFeature = 
parsing_config.FixedLenSequenceFeature 49# pylint: disable=protected-access 50_ParseOpParams = parsing_config._ParseOpParams 51_construct_tensors_for_composite_features = ( 52 parsing_config._construct_tensors_for_composite_features) 53# pylint: enable=protected-access 54 55 56# TODO(b/122887740) Switch files that use this private symbol to use new name. 57_construct_sparse_tensors_for_sparse_features = \ 58 _construct_tensors_for_composite_features 59 60 61def _prepend_none_dimension(features): 62 """Returns a copy of features with adjusted FixedLenSequenceFeature shapes.""" 63 if features: 64 modified_features = dict(features) # Create a copy to modify 65 for key, feature in features.items(): 66 if isinstance(feature, FixedLenSequenceFeature): 67 if not feature.allow_missing: 68 raise ValueError("Unsupported: FixedLenSequenceFeature requires " 69 "allow_missing to be True.") 70 modified_features[key] = FixedLenSequenceFeature( 71 [None] + list(feature.shape), 72 feature.dtype, 73 feature.allow_missing, 74 feature.default_value) 75 return modified_features 76 else: 77 return features 78 79 80@tf_export("io.parse_example", v1=[]) 81@dispatch.add_dispatch_support 82def parse_example_v2(serialized, features, example_names=None, name=None): 83 # pylint: disable=line-too-long 84 """Parses `Example` protos into a `dict` of tensors. 85 86 Parses a number of serialized [`Example`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto) 87 protos given in `serialized`. We refer to `serialized` as a batch with 88 `batch_size` many entries of individual `Example` protos. 89 90 `example_names` may contain descriptive names for the corresponding serialized 91 protos. These may be useful for debugging purposes, but they have no effect on 92 the output. If not `None`, `example_names` must be the same length as 93 `serialized`. 94 95 This op parses serialized examples into a dictionary mapping keys to `Tensor` 96 `SparseTensor`, and `RaggedTensor` objects. 
`features` is a dict from keys to 97 `VarLenFeature`, `SparseFeature`, `RaggedFeature`, and `FixedLenFeature` 98 objects. Each `VarLenFeature` and `SparseFeature` is mapped to a 99 `SparseTensor`; each `FixedLenFeature` is mapped to a `Tensor`; and each 100 `RaggedFeature` is mapped to a `RaggedTensor`. 101 102 Each `VarLenFeature` maps to a `SparseTensor` of the specified type 103 representing a ragged matrix. Its indices are `[batch, index]` where `batch` 104 identifies the example in `serialized`, and `index` is the value's index in 105 the list of values associated with that feature and example. 106 107 Each `SparseFeature` maps to a `SparseTensor` of the specified type 108 representing a Tensor of `dense_shape` `[batch_size] + SparseFeature.size`. 109 Its `values` come from the feature in the examples with key `value_key`. 110 A `values[i]` comes from a position `k` in the feature of an example at batch 111 entry `batch`. This positional information is recorded in `indices[i]` as 112 `[batch, index_0, index_1, ...]` where `index_j` is the `k-th` value of 113 the feature in the example at with key `SparseFeature.index_key[j]`. 114 In other words, we split the indices (except the first index indicating the 115 batch entry) of a `SparseTensor` by dimension into different features of the 116 `Example`. Due to its complexity a `VarLenFeature` should be preferred over a 117 `SparseFeature` whenever possible. 118 119 Each `FixedLenFeature` `df` maps to a `Tensor` of the specified type (or 120 `tf.float32` if not specified) and shape `(serialized.size(),) + df.shape`. 121 122 `FixedLenFeature` entries with a `default_value` are optional. With no default 123 value, we will fail if that `Feature` is missing from any example in 124 `serialized`. 125 126 Each `FixedLenSequenceFeature` `df` maps to a `Tensor` of the specified type 127 (or `tf.float32` if not specified) and shape 128 `(serialized.size(), None) + df.shape`. 
129 All examples in `serialized` will be padded with `default_value` along the 130 second dimension. 131 132 Each `RaggedFeature` maps to a `RaggedTensor` of the specified type. It 133 is formed by stacking the `RaggedTensor` for each example, where the 134 `RaggedTensor` for each individual example is constructed using the tensors 135 specified by `RaggedTensor.values_key` and `RaggedTensor.partition`. See 136 the `tf.io.RaggedFeature` documentation for details and examples. 137 138 Examples: 139 140 For example, if one expects a `tf.float32` `VarLenFeature` `ft` and three 141 serialized `Example`s are provided: 142 143 ``` 144 serialized = [ 145 features 146 { feature { key: "ft" value { float_list { value: [1.0, 2.0] } } } }, 147 features 148 { feature []}, 149 features 150 { feature { key: "ft" value { float_list { value: [3.0] } } } 151 ] 152 ``` 153 154 then the output will look like: 155 156 ```python 157 {"ft": SparseTensor(indices=[[0, 0], [0, 1], [2, 0]], 158 values=[1.0, 2.0, 3.0], 159 dense_shape=(3, 2)) } 160 ``` 161 162 If instead a `FixedLenSequenceFeature` with `default_value = -1.0` and 163 `shape=[]` is used then the output will look like: 164 165 ```python 166 {"ft": [[1.0, 2.0], [3.0, -1.0]]} 167 ``` 168 169 Given two `Example` input protos in `serialized`: 170 171 ``` 172 [ 173 features { 174 feature { key: "kw" value { bytes_list { value: [ "knit", "big" ] } } } 175 feature { key: "gps" value { float_list { value: [] } } } 176 }, 177 features { 178 feature { key: "kw" value { bytes_list { value: [ "emmy" ] } } } 179 feature { key: "dank" value { int64_list { value: [ 42 ] } } } 180 feature { key: "gps" value { } } 181 } 182 ] 183 ``` 184 185 And arguments 186 187 ``` 188 example_names: ["input0", "input1"], 189 features: { 190 "kw": VarLenFeature(tf.string), 191 "dank": VarLenFeature(tf.int64), 192 "gps": VarLenFeature(tf.float32), 193 } 194 ``` 195 196 Then the output is a dictionary: 197 198 ```python 199 { 200 "kw": SparseTensor( 201 
indices=[[0, 0], [0, 1], [1, 0]], 202 values=["knit", "big", "emmy"] 203 dense_shape=[2, 2]), 204 "dank": SparseTensor( 205 indices=[[1, 0]], 206 values=[42], 207 dense_shape=[2, 1]), 208 "gps": SparseTensor( 209 indices=[], 210 values=[], 211 dense_shape=[2, 0]), 212 } 213 ``` 214 215 For dense results in two serialized `Example`s: 216 217 ``` 218 [ 219 features { 220 feature { key: "age" value { int64_list { value: [ 0 ] } } } 221 feature { key: "gender" value { bytes_list { value: [ "f" ] } } } 222 }, 223 features { 224 feature { key: "age" value { int64_list { value: [] } } } 225 feature { key: "gender" value { bytes_list { value: [ "f" ] } } } 226 } 227 ] 228 ``` 229 230 We can use arguments: 231 232 ``` 233 example_names: ["input0", "input1"], 234 features: { 235 "age": FixedLenFeature([], dtype=tf.int64, default_value=-1), 236 "gender": FixedLenFeature([], dtype=tf.string), 237 } 238 ``` 239 240 And the expected output is: 241 242 ```python 243 { 244 "age": [[0], [-1]], 245 "gender": [["f"], ["f"]], 246 } 247 ``` 248 249 An alternative to `VarLenFeature` to obtain a `SparseTensor` is 250 `SparseFeature`. 
For example, given two `Example` input protos in 251 `serialized`: 252 253 ``` 254 [ 255 features { 256 feature { key: "val" value { float_list { value: [ 0.5, -1.0 ] } } } 257 feature { key: "ix" value { int64_list { value: [ 3, 20 ] } } } 258 }, 259 features { 260 feature { key: "val" value { float_list { value: [ 0.0 ] } } } 261 feature { key: "ix" value { int64_list { value: [ 42 ] } } } 262 } 263 ] 264 ``` 265 266 And arguments 267 268 ``` 269 example_names: ["input0", "input1"], 270 features: { 271 "sparse": SparseFeature( 272 index_key="ix", value_key="val", dtype=tf.float32, size=100), 273 } 274 ``` 275 276 Then the output is a dictionary: 277 278 ```python 279 { 280 "sparse": SparseTensor( 281 indices=[[0, 3], [0, 20], [1, 42]], 282 values=[0.5, -1.0, 0.0] 283 dense_shape=[2, 100]), 284 } 285 ``` 286 287 See the `tf.io.RaggedFeature` documentation for examples showing how 288 `RaggedFeature` can be used to obtain `RaggedTensor`s. 289 290 Args: 291 serialized: A vector (1-D Tensor) of strings, a batch of binary 292 serialized `Example` protos. 293 features: A `dict` mapping feature keys to `FixedLenFeature`, 294 `VarLenFeature`, `SparseFeature`, and `RaggedFeature` values. 295 example_names: A vector (1-D Tensor) of strings (optional), the names of 296 the serialized protos in the batch. 297 name: A name for this operation (optional). 298 299 Returns: 300 A `dict` mapping feature keys to `Tensor`, `SparseTensor`, and 301 `RaggedTensor` values. 302 303 Raises: 304 ValueError: if any feature is invalid. 305 """ 306 if not features: 307 raise ValueError("Missing: features was %s." 
% features) 308 features = _prepend_none_dimension(features) 309 params = _ParseOpParams.from_features(features, [ 310 VarLenFeature, SparseFeature, FixedLenFeature, FixedLenSequenceFeature, 311 RaggedFeature 312 ]) 313 314 outputs = _parse_example_raw(serialized, example_names, params, name=name) 315 return _construct_tensors_for_composite_features(features, outputs) 316 317 318@tf_export(v1=["io.parse_example", "parse_example"]) 319@dispatch.add_dispatch_support 320def parse_example(serialized, features, name=None, example_names=None): 321 return parse_example_v2(serialized, features, example_names, name) 322 323 324parse_example.__doc__ = parse_example_v2.__doc__ 325 326 327def _parse_example_raw(serialized, names, params, name): 328 """Parses `Example` protos. 329 330 Args: 331 serialized: A vector (1-D Tensor) of strings, a batch of binary 332 serialized `Example` protos. 333 names: A vector (1-D Tensor) of strings (optional), the names of 334 the serialized protos. 335 params: A `ParseOpParams` containing the parameters for the parse op. 336 name: A name for this operation (optional). 337 338 Returns: 339 A `dict` mapping keys to `Tensor`s and `SparseTensor`s and `RaggedTensor`s. 
340 341 """ 342 if params.num_features == 0: 343 raise ValueError("Must provide at least one feature key") 344 with ops.name_scope(name, "ParseExample", [serialized, names]): 345 names = [] if names is None else names 346 serialized = ops.convert_to_tensor(serialized, name="serialized") 347 if params.ragged_keys and serialized.shape.ndims is None: 348 raise ValueError("serialized must have statically-known rank to " 349 "parse ragged features.") 350 outputs = gen_parsing_ops.parse_example_v2( 351 serialized=serialized, 352 names=names, 353 sparse_keys=params.sparse_keys, 354 dense_keys=params.dense_keys, 355 ragged_keys=params.ragged_keys, 356 dense_defaults=params.dense_defaults_vec, 357 num_sparse=len(params.sparse_keys), 358 sparse_types=params.sparse_types, 359 ragged_value_types=params.ragged_value_types, 360 ragged_split_types=params.ragged_split_types, 361 dense_shapes=params.dense_shapes_as_proto, 362 name=name) 363 (sparse_indices, sparse_values, sparse_shapes, dense_values, 364 ragged_values, ragged_row_splits) = outputs 365 # pylint: disable=protected-access 366 ragged_tensors = parsing_config._build_ragged_tensors( 367 serialized.shape, ragged_values, ragged_row_splits) 368 369 sparse_tensors = [ 370 sparse_tensor.SparseTensor(ix, val, shape) for (ix, val, shape) 371 in zip(sparse_indices, sparse_values, sparse_shapes)] 372 373 return dict( 374 zip(params.sparse_keys + params.dense_keys + params.ragged_keys, 375 sparse_tensors + dense_values + ragged_tensors)) 376 377 378@tf_export(v1=["io.parse_single_example", "parse_single_example"]) 379@dispatch.add_dispatch_support 380def parse_single_example(serialized, features, name=None, example_names=None): 381 """Parses a single `Example` proto. 382 383 Similar to `parse_example`, except: 384 385 For dense tensors, the returned `Tensor` is identical to the output of 386 `parse_example`, except there is no batch dimension, the output shape is the 387 same as the shape given in `dense_shape`. 
388 389 For `SparseTensor`s, the first (batch) column of the indices matrix is removed 390 (the indices matrix is a column vector), the values vector is unchanged, and 391 the first (`batch_size`) entry of the shape vector is removed (it is now a 392 single element vector). 393 394 One might see performance advantages by batching `Example` protos with 395 `parse_example` instead of using this function directly. 396 397 Args: 398 serialized: A scalar string Tensor, a single serialized Example. 399 features: A `dict` mapping feature keys to `FixedLenFeature` or 400 `VarLenFeature` values. 401 name: A name for this operation (optional). 402 example_names: (Optional) A scalar string Tensor, the associated name. 403 404 Returns: 405 A `dict` mapping feature keys to `Tensor` and `SparseTensor` values. 406 407 Raises: 408 ValueError: if any feature is invalid. 409 """ 410 return parse_single_example_v2(serialized, features, example_names, name) 411 412 413@tf_export("io.parse_single_example", v1=[]) 414@dispatch.add_dispatch_support 415def parse_single_example_v2( 416 serialized, features, example_names=None, name=None 417 ): 418 """Parses a single `Example` proto. 419 420 Similar to `parse_example`, except: 421 422 For dense tensors, the returned `Tensor` is identical to the output of 423 `parse_example`, except there is no batch dimension, the output shape is the 424 same as the shape given in `dense_shape`. 425 426 For `SparseTensor`s, the first (batch) column of the indices matrix is removed 427 (the indices matrix is a column vector), the values vector is unchanged, and 428 the first (`batch_size`) entry of the shape vector is removed (it is now a 429 single element vector). 430 431 One might see performance advantages by batching `Example` protos with 432 `parse_example` instead of using this function directly. 433 434 Args: 435 serialized: A scalar string Tensor, a single serialized Example. 
436 features: A `dict` mapping feature keys to `FixedLenFeature` or 437 `VarLenFeature` values. 438 example_names: (Optional) A scalar string Tensor, the associated name. 439 name: A name for this operation (optional). 440 441 Returns: 442 A `dict` mapping feature keys to `Tensor` and `SparseTensor` values. 443 444 Raises: 445 ValueError: if any feature is invalid. 446 """ 447 if not features: 448 raise ValueError("Missing features.") 449 with ops.name_scope(name, "ParseSingleExample", [serialized, example_names]): 450 serialized = ops.convert_to_tensor(serialized, name="serialized") 451 serialized = _assert_scalar(serialized, "serialized") 452 return parse_example_v2(serialized, features, example_names, name) 453 454 455@tf_export("io.parse_sequence_example") 456@dispatch.add_dispatch_support 457def parse_sequence_example(serialized, 458 context_features=None, 459 sequence_features=None, 460 example_names=None, 461 name=None): 462 # pylint: disable=line-too-long 463 """Parses a batch of `SequenceExample` protos. 464 465 Parses a vector of serialized 466 [`SequenceExample`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto) 467 protos given in `serialized`. 468 469 This op parses serialized sequence examples into a tuple of dictionaries, 470 each mapping keys to `Tensor` and `SparseTensor` objects. 471 The first dictionary contains mappings for keys appearing in 472 `context_features`, and the second dictionary contains mappings for keys 473 appearing in `sequence_features`. 474 475 At least one of `context_features` and `sequence_features` must be provided 476 and non-empty. 477 478 The `context_features` keys are associated with a `SequenceExample` as a 479 whole, independent of time / frame. In contrast, the `sequence_features` keys 480 provide a way to access variable-length data within the `FeatureList` section 481 of the `SequenceExample` proto. 
While the shapes of `context_features` values 482 are fixed with respect to frame, the frame dimension (the first dimension) 483 of `sequence_features` values may vary between `SequenceExample` protos, 484 and even between `feature_list` keys within the same `SequenceExample`. 485 486 `context_features` contains `VarLenFeature`, `RaggedFeature`, and 487 `FixedLenFeature` objects. Each `VarLenFeature` is mapped to a 488 `SparseTensor`; each `RaggedFeature` is mapped to a `RaggedTensor`; and each 489 `FixedLenFeature` is mapped to a `Tensor`, of the specified type, shape, and 490 default value. 491 492 `sequence_features` contains `VarLenFeature`, `RaggedFeature`, and 493 `FixedLenSequenceFeature` objects. Each `VarLenFeature` is mapped to a 494 `SparseTensor`; each `RaggedFeature` is mapped to a `RaggedTensor; and 495 each `FixedLenSequenceFeature` is mapped to a `Tensor`, each of the specified 496 type. The shape will be `(B,T,) + df.dense_shape` for 497 `FixedLenSequenceFeature` `df`, where `B` is the batch size, and `T` is the 498 length of the associated `FeatureList` in the `SequenceExample`. For instance, 499 `FixedLenSequenceFeature([])` yields a scalar 2-D `Tensor` of static shape 500 `[None, None]` and dynamic shape `[B, T]`, while 501 `FixedLenSequenceFeature([k])` (for `int k >= 1`) yields a 3-D matrix `Tensor` 502 of static shape `[None, None, k]` and dynamic shape `[B, T, k]`. 503 504 Like the input, the resulting output tensors have a batch dimension. This 505 means that the original per-example shapes of `VarLenFeature`s and 506 `FixedLenSequenceFeature`s can be lost. To handle that situation, this op also 507 provides dicts of shape tensors as part of the output. There is one dict for 508 the context features, and one for the feature_list features. Context features 509 of type `FixedLenFeature`s will not be present, since their shapes are already 510 known by the caller. 
In situations where the input 'FixedLenFeature`s are of 511 different lengths across examples, the shorter examples will be padded with 512 default datatype values: 0 for numeric types, and the empty string for string 513 types. 514 515 Each `SparseTensor` corresponding to `sequence_features` represents a ragged 516 vector. Its indices are `[time, index]`, where `time` is the `FeatureList` 517 entry and `index` is the value's index in the list of values associated with 518 that time. 519 520 `FixedLenFeature` entries with a `default_value` and `FixedLenSequenceFeature` 521 entries with `allow_missing=True` are optional; otherwise, we will fail if 522 that `Feature` or `FeatureList` is missing from any example in `serialized`. 523 524 `example_name` may contain a descriptive name for the corresponding serialized 525 proto. This may be useful for debugging purposes, but it has no effect on the 526 output. If not `None`, `example_name` must be a scalar. 527 528 Args: 529 serialized: A vector (1-D Tensor) of type string containing binary 530 serialized `SequenceExample` protos. 531 context_features: A `dict` mapping feature keys to `FixedLenFeature` or 532 `VarLenFeature` or `RaggedFeature` values. These features are associated 533 with a `SequenceExample` as a whole. 534 sequence_features: A `dict` mapping feature keys to 535 `FixedLenSequenceFeature` or `VarLenFeature` or `RaggedFeature` values. 536 These features are associated with data within the `FeatureList` section 537 of the `SequenceExample` proto. 538 example_names: A vector (1-D Tensor) of strings (optional), the name of the 539 serialized protos. 540 name: A name for this operation (optional). 541 542 Returns: 543 A tuple of three `dict`s, each mapping keys to `Tensor`s, 544 `SparseTensor`s, and `RaggedTensor`. The first dict contains the context 545 key/values, the second dict contains the feature_list key/values, and the 546 final dict contains the lengths of any dense feature_list features. 
547 548 Raises: 549 ValueError: if any feature is invalid. 550 """ 551 if not (context_features or sequence_features): 552 raise ValueError("Missing features.") 553 context_params = _ParseOpParams.from_features( 554 context_features, [VarLenFeature, FixedLenFeature, RaggedFeature]) 555 feature_list_params = _ParseOpParams.from_features( 556 sequence_features, 557 [VarLenFeature, FixedLenSequenceFeature, RaggedFeature]) 558 559 with ops.name_scope(name, "ParseSequenceExample", 560 [serialized, example_names]): 561 outputs = _parse_sequence_example_raw(serialized, example_names, 562 context_params, feature_list_params, 563 name) 564 context_output, feature_list_output, feature_list_lengths = outputs 565 566 if context_params.ragged_keys: 567 context_output = _construct_tensors_for_composite_features( 568 context_features, context_output) 569 if feature_list_params.ragged_keys: 570 feature_list_output = _construct_tensors_for_composite_features( 571 sequence_features, feature_list_output) 572 573 return context_output, feature_list_output, feature_list_lengths 574 575 576def _parse_sequence_example_raw(serialized, 577 debug_name, 578 context, 579 feature_list, 580 name=None): 581 """Parses a vector of `SequenceExample` protos. 582 583 Args: 584 serialized: A vector (1-D Tensor) of type string, containing binary 585 serialized `SequenceExample` protos. 586 debug_name: A vector (1-D Tensor) of strings (optional), the names of the 587 serialized protos. 588 context: A `ParseOpParams` containing the parameters for the parse 589 op for the context features. 590 feature_list: A `ParseOpParams` containing the parameters for the 591 parse op for the feature_list features. 592 name: A name for this operation (optional). 593 594 Returns: 595 A tuple of three `dict`s, each mapping keys to `Tensor`s, `SparseTensor`s, 596 and `RaggedTensor`s. 
The first dict contains the context key/values, the 597 second dict contains the feature_list key/values, and the final dict 598 contains the lengths of any dense feature_list features. 599 600 Raises: 601 TypeError: if feature_list.dense_defaults is not either None or a dict. 602 """ 603 if context.num_features + feature_list.num_features == 0: 604 raise ValueError("Must provide at least one feature key") 605 with ops.name_scope(name, "ParseSequenceExample", [serialized]): 606 debug_name = [] if debug_name is None else debug_name 607 608 # Internal 609 feature_list_dense_missing_assumed_empty = [] 610 for k, v in feature_list.dense_defaults.items(): 611 if v is not None: 612 raise ValueError("Value feature_list.dense_defaults[%s] must be None" % 613 k) 614 feature_list_dense_missing_assumed_empty.append(k) 615 616 has_ragged = context.ragged_keys or feature_list.ragged_keys 617 serialized = ops.convert_to_tensor(serialized, name="serialized") 618 if has_ragged and serialized.shape.ndims is None: 619 raise ValueError("serialized must have statically-known rank to " 620 "parse ragged features.") 621 feature_list_dense_missing_assumed_empty_vector = [ 622 key in feature_list_dense_missing_assumed_empty 623 for key in feature_list.dense_keys 624 ] 625 outputs = gen_parsing_ops.parse_sequence_example_v2( 626 # Inputs 627 serialized=serialized, 628 debug_name=debug_name, 629 context_sparse_keys=context.sparse_keys, 630 context_dense_keys=context.dense_keys, 631 context_ragged_keys=context.ragged_keys, 632 feature_list_sparse_keys=feature_list.sparse_keys, 633 feature_list_dense_keys=feature_list.dense_keys, 634 feature_list_ragged_keys=feature_list.ragged_keys, 635 feature_list_dense_missing_assumed_empty=( 636 feature_list_dense_missing_assumed_empty_vector), 637 context_dense_defaults=context.dense_defaults_vec, 638 # Attrs 639 Ncontext_sparse=len(context.sparse_keys), 640 Nfeature_list_sparse=len(feature_list.sparse_keys), 641 
Nfeature_list_dense=len(feature_list.dense_keys), 642 context_sparse_types=context.sparse_types, 643 context_ragged_value_types=context.ragged_value_types, 644 context_ragged_split_types=context.ragged_split_types, 645 feature_list_dense_types=feature_list.dense_types, 646 feature_list_sparse_types=feature_list.sparse_types, 647 feature_list_ragged_value_types=feature_list.ragged_value_types, 648 feature_list_ragged_split_types=feature_list.ragged_split_types, 649 context_dense_shapes=context.dense_shapes_as_proto, 650 feature_list_dense_shapes=feature_list.dense_shapes, 651 name=name) 652 (context_sparse_indices, context_sparse_values, context_sparse_shapes, 653 context_dense_values, context_ragged_values, context_ragged_row_splits, 654 feature_list_sparse_indices, feature_list_sparse_values, 655 feature_list_sparse_shapes, feature_list_dense_values, 656 feature_list_dense_lengths, feature_list_ragged_values, 657 feature_list_ragged_outer_splits, 658 feature_list_ragged_inner_splits) = outputs 659 # pylint: disable=protected-access 660 context_ragged_tensors = parsing_config._build_ragged_tensors( 661 serialized.shape, context_ragged_values, context_ragged_row_splits) 662 feature_list_ragged_tensors = parsing_config._build_ragged_tensors( 663 serialized.shape, feature_list_ragged_values, 664 feature_list_ragged_outer_splits, feature_list_ragged_inner_splits) 665 666 # pylint: disable=g-complex-comprehension 667 context_sparse_tensors = [ 668 sparse_tensor.SparseTensor(ix, val, shape) 669 for (ix, val, 670 shape) in zip(context_sparse_indices, context_sparse_values, 671 context_sparse_shapes) 672 ] 673 674 feature_list_sparse_tensors = [ 675 sparse_tensor.SparseTensor(ix, val, shape) 676 for (ix, val, shape 677 ) in zip(feature_list_sparse_indices, feature_list_sparse_values, 678 feature_list_sparse_shapes) 679 ] 680 # pylint: enable=g-complex-comprehension 681 682 context_output = dict( 683 zip( 684 context.sparse_keys + context.dense_keys + context.ragged_keys, 
685 context_sparse_tensors + context_dense_values + 686 context_ragged_tensors)) 687 feature_list_output = dict( 688 zip( 689 feature_list.sparse_keys + feature_list.dense_keys + 690 feature_list.ragged_keys, feature_list_sparse_tensors + 691 feature_list_dense_values + feature_list_ragged_tensors)) 692 feature_list_lengths = dict( 693 zip(feature_list.dense_keys, feature_list_dense_lengths)) 694 695 return (context_output, feature_list_output, feature_list_lengths) 696 697 698@tf_export("io.parse_single_sequence_example", 699 v1=["io.parse_single_sequence_example", 700 "parse_single_sequence_example"]) 701@dispatch.add_dispatch_support 702def parse_single_sequence_example( 703 serialized, context_features=None, sequence_features=None, 704 example_name=None, name=None): 705 # pylint: disable=line-too-long 706 """Parses a single `SequenceExample` proto. 707 708 Parses a single serialized [`SequenceExample`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto) 709 proto given in `serialized`. 710 711 This op parses a serialized sequence example into a tuple of dictionaries, 712 each mapping keys to `Tensor` and `SparseTensor` objects. 713 The first dictionary contains mappings for keys appearing in 714 `context_features`, and the second dictionary contains mappings for keys 715 appearing in `sequence_features`. 716 717 At least one of `context_features` and `sequence_features` must be provided 718 and non-empty. 719 720 The `context_features` keys are associated with a `SequenceExample` as a 721 whole, independent of time / frame. In contrast, the `sequence_features` keys 722 provide a way to access variable-length data within the `FeatureList` section 723 of the `SequenceExample` proto. 
While the shapes of `context_features` values 724 are fixed with respect to frame, the frame dimension (the first dimension) 725 of `sequence_features` values may vary between `SequenceExample` protos, 726 and even between `feature_list` keys within the same `SequenceExample`. 727 728 `context_features` contains `VarLenFeature`, `RaggedFeature`, and 729 `FixedLenFeature` objects. Each `VarLenFeature` is mapped to a `SparseTensor`; 730 each `RaggedFeature` is mapped to a `RaggedTensor`; and each `FixedLenFeature` 731 is mapped to a `Tensor`, of the specified type, shape, and default value. 732 733 `sequence_features` contains `VarLenFeature`, `RaggedFeature`, and 734 `FixedLenSequenceFeature` objects. Each `VarLenFeature` is mapped to a 735 `SparseTensor`; each `RaggedFeature` is mapped to a `RaggedTensor`; and each 736 `FixedLenSequenceFeature` is mapped to a `Tensor`, each of the specified type. 737 The shape will be `(T,) + df.dense_shape` for `FixedLenSequenceFeature` `df`, 738 where `T` is the length of the associated `FeatureList` in the 739 `SequenceExample`. For instance, `FixedLenSequenceFeature([])` yields a scalar 740 1-D `Tensor` of static shape `[None]` and dynamic shape `[T]`, while 741 `FixedLenSequenceFeature([k])` (for `int k >= 1`) yields a 2-D matrix `Tensor` 742 of static shape `[None, k]` and dynamic shape `[T, k]`. 743 744 Each `SparseTensor` corresponding to `sequence_features` represents a ragged 745 vector. Its indices are `[time, index]`, where `time` is the `FeatureList` 746 entry and `index` is the value's index in the list of values associated with 747 that time. 748 749 `FixedLenFeature` entries with a `default_value` and `FixedLenSequenceFeature` 750 entries with `allow_missing=True` are optional; otherwise, we will fail if 751 that `Feature` or `FeatureList` is missing from any example in `serialized`. 752 753 `example_name` may contain a descriptive name for the corresponding serialized 754 proto. 
This may be useful for debugging purposes, but it has no effect on the 755 output. If not `None`, `example_name` must be a scalar. 756 757 Note that the batch version of this function, `tf.parse_sequence_example`, 758 is written for better memory efficiency and will be faster on large 759 `SequenceExample`s. 760 761 Args: 762 serialized: A scalar (0-D Tensor) of type string, a single binary 763 serialized `SequenceExample` proto. 764 context_features: A `dict` mapping feature keys to `FixedLenFeature` or 765 `VarLenFeature` or `RaggedFeature` values. These features are associated 766 with a `SequenceExample` as a whole. 767 sequence_features: A `dict` mapping feature keys to 768 `FixedLenSequenceFeature` or `VarLenFeature` or `RaggedFeature` values. 769 These features are associated with data within the `FeatureList` section 770 of the `SequenceExample` proto. 771 example_name: A scalar (0-D Tensor) of strings (optional), the name of 772 the serialized proto. 773 name: A name for this operation (optional). 774 775 Returns: 776 A tuple of two `dict`s, each mapping keys to `Tensor`s and `SparseTensor`s 777 and `RaggedTensor`s. 778 779 * The first dict contains the context key/values. 780 * The second dict contains the feature_list key/values. 781 782 Raises: 783 ValueError: if any feature is invalid. 
784 """ 785 # pylint: enable=line-too-long 786 if not (context_features or sequence_features): 787 raise ValueError("Missing features.") 788 context_params = _ParseOpParams.from_features( 789 context_features, [VarLenFeature, FixedLenFeature, RaggedFeature]) 790 feature_list_params = _ParseOpParams.from_features( 791 sequence_features, 792 [VarLenFeature, FixedLenSequenceFeature, RaggedFeature]) 793 794 with ops.name_scope(name, "ParseSingleSequenceExample", 795 [serialized, example_name]): 796 context_output, feature_list_output = ( 797 _parse_single_sequence_example_raw(serialized, context_params, 798 feature_list_params, example_name, 799 name)) 800 801 if context_params.ragged_keys: 802 context_output = _construct_tensors_for_composite_features( 803 context_features, context_output) 804 if feature_list_params.ragged_keys: 805 feature_list_output = _construct_tensors_for_composite_features( 806 sequence_features, feature_list_output) 807 808 return context_output, feature_list_output 809 810 811def _parse_single_sequence_example_raw(serialized, 812 context, 813 feature_list, 814 debug_name, 815 name=None): 816 """Parses a single `SequenceExample` proto. 817 818 Args: 819 serialized: A scalar (0-D Tensor) of type string, a single binary serialized 820 `SequenceExample` proto. 821 context: A `ParseOpParams` containing the parameters for the parse op for 822 the context features. 823 feature_list: A `ParseOpParams` containing the parameters for the parse op 824 for the feature_list features. 825 debug_name: A scalar (0-D Tensor) of strings (optional), the name of the 826 serialized proto. 827 name: A name for this operation (optional). 828 829 Returns: 830 A tuple of two `dict`s, each mapping keys to `Tensor`s and `SparseTensor`s. 831 The first dict contains the context key/values. 832 The second dict contains the feature_list key/values. 833 834 Raises: 835 TypeError: if feature_list.dense_defaults is not either None or a dict. 
836 """ 837 with ops.name_scope(name, "ParseSingleExample", [serialized, debug_name]): 838 serialized = ops.convert_to_tensor(serialized, name="serialized") 839 serialized = _assert_scalar(serialized, "serialized") 840 return _parse_sequence_example_raw(serialized, debug_name, context, 841 feature_list, name)[:2] 842 843 844@tf_export("io.decode_raw", v1=[]) 845@dispatch.add_dispatch_support 846def decode_raw(input_bytes, 847 out_type, 848 little_endian=True, 849 fixed_length=None, 850 name=None): 851 """Convert raw byte strings into tensors. 852 853 Args: 854 input_bytes: 855 Each element of the input Tensor is converted to an array of bytes. 856 out_type: 857 `DType` of the output. Acceptable types are `half`, `float`, `double`, 858 `int32`, `uint16`, `uint8`, `int16`, `int8`, `int64`. 859 little_endian: 860 Whether the `input_bytes` data is in little-endian format. Data will be 861 converted into host byte order if necessary. 862 fixed_length: 863 If set, the first `fixed_length` bytes of each element will be converted. 864 Data will be zero-padded or truncated to the specified length. 865 866 `fixed_length` must be a multiple of the size of `out_type`. 867 `fixed_length` must be specified if the elements of `input_bytes` are of 868 variable length. 869 name: A name for the operation (optional). 870 871 Returns: 872 A `Tensor` object storing the decoded bytes. 
873 874 """ 875 if fixed_length is not None: 876 return gen_parsing_ops.decode_padded_raw( 877 input_bytes, 878 fixed_length=fixed_length, 879 out_type=out_type, 880 little_endian=little_endian, 881 name=name) 882 else: 883 return gen_parsing_ops.decode_raw( 884 input_bytes, out_type, little_endian=little_endian, name=name) 885 886 887@tf_export(v1=["decode_raw", "io.decode_raw"]) 888@dispatch.add_dispatch_support 889@deprecation.deprecated_args(None, 890 "bytes is deprecated, use input_bytes instead", 891 "bytes") 892def decode_raw_v1( 893 input_bytes=None, 894 out_type=None, 895 little_endian=True, 896 name=None, 897 bytes=None # pylint: disable=redefined-builtin 898): 899 """Convert raw byte strings into tensors. 900 901 Args: 902 input_bytes: 903 Each element of the input Tensor is converted to an array of bytes. 904 out_type: 905 `DType` of the output. Acceptable types are `half`, `float`, `double`, 906 `int32`, `uint16`, `uint8`, `int16`, `int8`, `int64`. 907 little_endian: 908 Whether the `input_bytes` data is in little-endian format. Data will be 909 converted into host byte order if necessary. 910 name: A name for the operation (optional). 911 bytes: Deprecated parameter. Use `input_bytes` instead. 912 913 Returns: 914 A `Tensor` object storing the decoded bytes. 915 """ 916 input_bytes = deprecation.deprecated_argument_lookup("input_bytes", 917 input_bytes, "bytes", 918 bytes) 919 920 # out_type is a required positional argument in the original API, and had to 921 # be changed to a keyword argument in order to facilitate the transition from 922 # the reserved named `bytes` to `input_bytes`. Ensure it's still set. 923 if out_type is None: 924 raise ValueError( 925 "decode_raw_v1() missing 1 positional argument: 'out_type'") 926 927 return gen_parsing_ops.decode_raw( 928 input_bytes, out_type, little_endian=little_endian, name=name) 929 930 931# Swap `name` and `na_value` for backward compatibility. 
@tf_export(v1=["io.decode_csv", "decode_csv"])
@dispatch.add_dispatch_support
@deprecation.deprecated_endpoints("decode_csv")
def decode_csv(records,
               record_defaults,
               field_delim=",",
               use_quote_delim=True,
               name=None,
               na_value="",
               select_cols=None):
  """Convert CSV records to tensors. Each column maps to one tensor.

  RFC 4180 format is expected for the CSV records.
  (https://tools.ietf.org/html/rfc4180)
  Note that we allow leading and trailing spaces with int or float field.

  This is the v1 endpoint. Its signature keeps `name` ahead of `na_value` and
  `select_cols`; the arguments are reordered when forwarding to
  `decode_csv_v2`, which performs the validation and the actual op call.

  Args:
    records: A `Tensor` of type `string`.
      Each string is a record/row in the csv and all records should have
      the same format.
    record_defaults: A list of `Tensor` objects with specific types.
      Acceptable types are `float32`, `float64`, `int32`, `int64`, `string`.
      One tensor per column of the input record, with either a
      scalar default value for that column or an empty vector if the column is
      required.
    field_delim: An optional `string`. Defaults to `","`.
      char delimiter to separate fields in a record.
    use_quote_delim: An optional `bool`. Defaults to `True`.
      If false, treats double quotation marks as regular
      characters inside of the string fields (ignoring RFC 4180, Section 2,
      Bullet 5).
    name: A name for the operation (optional).
    na_value: Additional string to recognize as NA/NaN.
    select_cols: Optional sorted list of column indices to select. If specified,
      only this subset of columns will be parsed and returned.

  Returns:
    A list of `Tensor` objects. Has the same type as `record_defaults`.
    Each tensor will have the same shape as records.

  Raises:
    ValueError: If any of the arguments is malformed.
  """
  # Positional order below follows decode_csv_v2's signature, where `name`
  # comes last.
  return decode_csv_v2(
      records, record_defaults,
      field_delim, use_quote_delim,
      na_value, select_cols, name
      )


@tf_export("io.decode_csv", v1=[])
@dispatch.add_dispatch_support
def decode_csv_v2(records,
                  record_defaults,
                  field_delim=",",
                  use_quote_delim=True,
                  na_value="",
                  select_cols=None,
                  name=None):
  """Convert CSV records to tensors. Each column maps to one tensor.

  RFC 4180 format is expected for the CSV records.
  (https://tools.ietf.org/html/rfc4180)
  Note that we allow leading and trailing spaces with int or float field.

  Args:
    records: A `Tensor` of type `string`.
      Each string is a record/row in the csv and all records should have
      the same format.
    record_defaults: A list of `Tensor` objects with specific types.
      Acceptable types are `float32`, `float64`, `int32`, `int64`, `string`.
      One tensor per column of the input record, with either a
      scalar default value for that column or an empty vector if the column is
      required.
    field_delim: An optional `string`. Defaults to `","`.
      char delimiter to separate fields in a record.
    use_quote_delim: An optional `bool`. Defaults to `True`.
      If false, treats double quotation marks as regular
      characters inside of the string fields (ignoring RFC 4180, Section 2,
      Bullet 5).
    na_value: Additional string to recognize as NA/NaN.
    select_cols: Optional sorted list of column indices to select. If specified,
      only this subset of columns will be parsed and returned.
    name: A name for the operation (optional).

  Returns:
    A list of `Tensor` objects. Has the same type as `record_defaults`.
    Each tensor will have the same shape as records.

  Raises:
    ValueError: If any of the arguments is malformed: `select_cols` is not
      strictly increasing, starts with a negative index, or its length differs
      from that of `record_defaults`.
  """
  if select_cols is not None:
    if any(select_cols[i] >= select_cols[i + 1]
           for i in range(len(select_cols) - 1)):
      raise ValueError("select_cols is not strictly increasing.")
    # The list is strictly increasing at this point, so only the first entry
    # can be negative. Guard the index so that an empty `select_cols` reaches
    # the length check below instead of raising an undocumented IndexError.
    if len(select_cols) > 0 and select_cols[0] < 0:  # pylint: disable=g-explicit-length-test
      raise ValueError("select_cols contains negative values.")
    if len(select_cols) != len(record_defaults):
      raise ValueError(
          "Length of select_cols and record_defaults do not match.")
  return gen_parsing_ops.decode_csv(
      records=records,
      record_defaults=record_defaults,
      field_delim=field_delim,
      use_quote_delim=use_quote_delim,
      na_value=na_value,
      name=name,
      select_cols=select_cols,
  )


def _assert_scalar(value, name):
  """Asserts that `value` is scalar, and returns `value`.

  Args:
    value: An object exposing `shape.rank` (a tensor in the visible call site).
    name: String used to build the error message and the names of the ops
      created for the runtime check.

  Returns:
    `value` itself when its static rank is 0. When the static rank is unknown,
    a result of `with_dependencies` that is gated on a runtime rank assertion
    and whose static shape has been set to `[]`.

  Raises:
    ValueError: If the static rank of `value` is known and is not 0.
  """
  value_rank = value.shape.rank
  if value_rank is None:
    # Rank is not known statically: defer the check to run time and attach it
    # as a control dependency of the returned value.
    check = control_flow_ops.Assert(
        math_ops.equal(array_ops.rank(value), 0),
        ["Input %s must be a scalar" % name],
        name="%sIsScalar" % name.capitalize())
    result = control_flow_ops.with_dependencies([check],
                                                value,
                                                name="%sDependencies" % name)
    # The assertion guarantees rank 0, so record that in the static shape.
    result.set_shape([])
    return result
  elif value_rank == 0:
    return value
  else:
    raise ValueError("Input %s must be a scalar" % name)


@tf_export("io.decode_json_example",
           v1=["decode_json_example", "io.decode_json_example"])
def decode_json_example(json_examples, name=None):
  r"""Convert JSON-encoded Example records to binary protocol buffer strings.

  Note: This is **not** a general purpose JSON parsing op.

  This op converts JSON-serialized `tf.train.Example` (maybe created with
  `json_format.MessageToJson`, following the
  [standard JSON mapping](
  https://developers.google.com/protocol-buffers/docs/proto3#json))
  to a binary-serialized `tf.train.Example` (equivalent to
  `Example.SerializeToString()`) suitable for conversion to tensors with
  `tf.io.parse_example`.

  Here is a `tf.train.Example` proto:

  >>> example = tf.train.Example(
  ...   features=tf.train.Features(
  ...       feature={
  ...           "a": tf.train.Feature(
  ...               int64_list=tf.train.Int64List(
  ...                   value=[1, 1, 3]))}))

  Here it is converted to JSON:

  >>> from google.protobuf import json_format
  >>> example_json = json_format.MessageToJson(example)
  >>> print(example_json)
  {
    "features": {
      "feature": {
        "a": {
          "int64List": {
            "value": [
              "1",
              "1",
              "3"
            ]
          }
        }
      }
    }
  }

  This op converts the above json string to a binary proto:

  >>> example_binary = tf.io.decode_json_example(example_json)
  >>> example_binary.numpy()
  b'\n\x0f\n\r\n\x01a\x12\x08\x1a\x06\x08\x01\x08\x01\x08\x03'

  The OP works on string tensors of any shape:

  >>> tf.io.decode_json_example([
  ...     [example_json, example_json],
  ...     [example_json, example_json]]).shape.as_list()
  [2, 2]

  This resulting binary-string is equivalent to `Example.SerializeToString()`,
  and can be converted to Tensors using `tf.io.parse_example` and related
  functions:

  >>> tf.io.parse_example(
  ...   serialized=[example_binary.numpy(),
  ...              example.SerializeToString()],
  ...   features = {'a': tf.io.FixedLenFeature(shape=[3], dtype=tf.int64)})
  {'a': <tf.Tensor: shape=(2, 3), dtype=int64, numpy=
   array([[1, 1, 3],
          [1, 1, 3]])>}

  Args:
    json_examples: A string tensor containing json-serialized `tf.Example`
      protos.
    name: A name for the op.

  Returns:
    A string Tensor containing the binary-serialized `tf.Example` protos.

  Raises:
    `tf.errors.InvalidArgumentError`: If the JSON could not be converted to a
    `tf.Example`
  """
  return gen_parsing_ops.decode_json_example(json_examples, name=name)