1# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15"""This API defines FeatureColumn abstraction. 16 17FeatureColumns provide a high level abstraction for ingesting and representing 18features. FeatureColumns are also the primary way of encoding features for 19canned ${tf.estimator.Estimator}s. 20 21When using FeatureColumns with `Estimators`, the type of feature column you 22should choose depends on (1) the feature type and (2) the model type. 23 241. Feature type: 25 26 * Continuous features can be represented by `numeric_column`. 27 * Categorical features can be represented by any `categorical_column_with_*` 28 column: 29 - `categorical_column_with_vocabulary_list` 30 - `categorical_column_with_vocabulary_file` 31 - `categorical_column_with_hash_bucket` 32 - `categorical_column_with_identity` 33 - `weighted_categorical_column` 34 352. Model type: 36 37 * Deep neural network models (`DNNClassifier`, `DNNRegressor`). 38 39 Continuous features can be directly fed into deep neural network models. 40 41 age_column = numeric_column("age") 42 43 To feed sparse features into DNN models, wrap the column with 44 `embedding_column` or `indicator_column`. `indicator_column` is recommended 45 for features with only a few possible values. For features with many 46 possible values, to reduce the size of your model, `embedding_column` is 47 recommended. 48 49 embedded_dept_column = embedding_column( 50 categorical_column_with_vocabulary_list( 51 "department", ["math", "philosphy", ...]), dimension=10) 52 53 * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`). 54 55 Sparse features can be fed directly into linear models. They behave like an 56 indicator column but with an efficient implementation. 57 58 dept_column = categorical_column_with_vocabulary_list("department", 59 ["math", "philosophy", "english"]) 60 61 It is recommended that continuous features be bucketized before being 62 fed into linear models. 63 64 bucketized_age_column = bucketized_column( 65 source_column=age_column, 66 boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65]) 67 68 Sparse features can be crossed (also known as conjuncted or combined) in 69 order to form non-linearities, and then fed into linear models. 70 71 cross_dept_age_column = crossed_column( 72 columns=["department", bucketized_age_column], 73 hash_bucket_size=1000) 74 75Example of building canned `Estimator`s using FeatureColumns: 76 77 ```python 78 # Define features and transformations 79 deep_feature_columns = [age_column, embedded_dept_column] 80 wide_feature_columns = [dept_column, bucketized_age_column, 81 cross_dept_age_column] 82 83 # Build deep model 84 estimator = DNNClassifier( 85 feature_columns=deep_feature_columns, 86 hidden_units=[500, 250, 50]) 87 estimator.train(...) 88 89 # Or build a wide model 90 estimator = LinearClassifier( 91 feature_columns=wide_feature_columns) 92 estimator.train(...) 93 94 # Or build a wide and deep model! 95 estimator = DNNLinearCombinedClassifier( 96 linear_feature_columns=wide_feature_columns, 97 dnn_feature_columns=deep_feature_columns, 98 dnn_hidden_units=[500, 250, 50]) 99 estimator.train(...) 100 ``` 101 102 103FeatureColumns can also be transformed into a generic input layer for 104custom models using `input_layer`. 105 106Example of building model using FeatureColumns, this can be used in a 107`model_fn` which is given to the {tf.estimator.Estimator}: 108 109 ```python 110 # Building model via layers 111 112 deep_feature_columns = [age_column, embedded_dept_column] 113 columns_to_tensor = parse_feature_columns_from_examples( 114 serialized=my_data, 115 feature_columns=deep_feature_columns) 116 first_layer = input_layer( 117 features=columns_to_tensor, 118 feature_columns=deep_feature_columns) 119 second_layer = fully_connected(first_layer, ...) 120 ``` 121 122NOTE: Functions prefixed with "_" indicate experimental or private parts of 123the API subject to change, and should not be relied upon! 124""" 125 126from __future__ import absolute_import 127from __future__ import division 128from __future__ import print_function 129 130import abc 131import collections 132import math 133 134import numpy as np 135import six 136 137 138from tensorflow.python.framework import dtypes 139from tensorflow.python.framework import ops 140from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib 141from tensorflow.python.framework import tensor_shape 142from tensorflow.python.ops import array_ops 143from tensorflow.python.ops import check_ops 144from tensorflow.python.ops import control_flow_ops 145from tensorflow.python.ops import embedding_ops 146from tensorflow.python.ops import init_ops 147from tensorflow.python.ops import lookup_ops 148from tensorflow.python.ops import math_ops 149from tensorflow.python.ops import nn_ops 150from tensorflow.python.ops import parsing_ops 151from tensorflow.python.ops import sparse_ops 152from tensorflow.python.ops import string_ops 153from tensorflow.python.ops import template 154from tensorflow.python.ops import variable_scope 155from tensorflow.python.ops import variables 156from tensorflow.python.platform import gfile 157from tensorflow.python.platform import tf_logging as logging 158from tensorflow.python.training import checkpoint_utils 159from tensorflow.python.util import nest 160from tensorflow.python.util.tf_export import tf_export 161from tensorflow.python.util.tf_export import tf_export 162 163 164def _internal_input_layer(features, 165 feature_columns, 166 weight_collections=None, 167 trainable=True, 168 cols_to_vars=None, 169 scope=None): 170 """See input_layer. `scope` is a name or variable scope to use.""" 171 172 feature_columns = _clean_feature_columns(feature_columns) 173 for column in feature_columns: 174 if not isinstance(column, _DenseColumn): 175 raise ValueError( 176 'Items of feature_columns must be a _DenseColumn. ' 177 'You can wrap a categorical column with an ' 178 'embedding_column or indicator_column. Given: {}'.format(column)) 179 weight_collections = list(weight_collections or []) 180 if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections: 181 weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES) 182 if ops.GraphKeys.MODEL_VARIABLES not in weight_collections: 183 weight_collections.append(ops.GraphKeys.MODEL_VARIABLES) 184 185 # a non-None `scope` can allow for variable reuse, when, e.g., this function 186 # is wrapped by a `make_template`. 187 with variable_scope.variable_scope( 188 scope, default_name='input_layer', values=features.values()): 189 builder = _LazyBuilder(features) 190 output_tensors = [] 191 ordered_columns = [] 192 for column in sorted(feature_columns, key=lambda x: x.name): 193 ordered_columns.append(column) 194 with variable_scope.variable_scope( 195 None, default_name=column._var_scope_name): # pylint: disable=protected-access 196 tensor = column._get_dense_tensor( # pylint: disable=protected-access 197 builder, 198 weight_collections=weight_collections, 199 trainable=trainable) 200 num_elements = column._variable_shape.num_elements() # pylint: disable=protected-access 201 batch_size = array_ops.shape(tensor)[0] 202 output_tensors.append( 203 array_ops.reshape(tensor, shape=(batch_size, num_elements))) 204 if cols_to_vars is not None: 205 # Retrieve any variables created (some _DenseColumn's don't create 206 # variables, in which case an empty list is returned). 207 cols_to_vars[column] = ops.get_collection( 208 ops.GraphKeys.GLOBAL_VARIABLES, 209 scope=variable_scope.get_variable_scope().name) 210 _verify_static_batch_size_equality(output_tensors, ordered_columns) 211 return array_ops.concat(output_tensors, 1) 212 213 214@tf_export('feature_column.input_layer') 215def input_layer(features, 216 feature_columns, 217 weight_collections=None, 218 trainable=True, 219 cols_to_vars=None): 220 """Returns a dense `Tensor` as input layer based on given `feature_columns`. 221 222 Generally a single example in training data is described with FeatureColumns. 223 At the first layer of the model, this column oriented data should be converted 224 to a single `Tensor`. 225 226 Example: 227 228 ```python 229 price = numeric_column('price') 230 keywords_embedded = embedding_column( 231 categorical_column_with_hash_bucket("keywords", 10K), dimensions=16) 232 columns = [price, keywords_embedded, ...] 233 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 234 dense_tensor = input_layer(features, columns) 235 for units in [128, 64, 32]: 236 dense_tensor = tf.layers.dense(dense_tensor, units, tf.nn.relu) 237 prediction = tf.layers.dense(dense_tensor, 1) 238 ``` 239 240 Args: 241 features: A mapping from key to tensors. `_FeatureColumn`s look up via these 242 keys. For example `numeric_column('price')` will look at 'price' key in 243 this dict. Values can be a `SparseTensor` or a `Tensor` depends on 244 corresponding `_FeatureColumn`. 245 feature_columns: An iterable containing the FeatureColumns to use as inputs 246 to your model. All items should be instances of classes derived from 247 `_DenseColumn` such as `numeric_column`, `embedding_column`, 248 `bucketized_column`, `indicator_column`. If you have categorical features, 249 you can wrap them with an `embedding_column` or `indicator_column`. 250 weight_collections: A list of collection names to which the Variable will be 251 added. Note that variables will also be added to collections 252 `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`. 253 trainable: If `True` also add the variable to the graph collection 254 `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). 255 cols_to_vars: If not `None`, must be a dictionary that will be filled with a 256 mapping from `_FeatureColumn` to list of `Variable`s. For example, after 257 the call, we might have cols_to_vars = 258 {_EmbeddingColumn( 259 categorical_column=_HashedCategoricalColumn( 260 key='sparse_feature', hash_bucket_size=5, dtype=tf.string), 261 dimension=10): [<tf.Variable 'some_variable:0' shape=(5, 10), 262 <tf.Variable 'some_variable:1' shape=(5, 10)]} 263 If a column creates no variables, its value will be an empty list. 264 265 Returns: 266 A `Tensor` which represents input layer of a model. Its shape 267 is (batch_size, first_layer_dimension) and its dtype is `float32`. 268 first_layer_dimension is determined based on given `feature_columns`. 269 270 Raises: 271 ValueError: if an item in `feature_columns` is not a `_DenseColumn`. 272 """ 273 return _internal_input_layer(features, feature_columns, weight_collections, 274 trainable, cols_to_vars) 275 276 277# TODO(akshayka): InputLayer should be a subclass of Layer, and it 278# should implement the logic in input_layer using Layer's build-and-call 279# paradigm; input_layer should create an instance of InputLayer and 280# return the result of inovking its apply method, just as functional layers do. 281class InputLayer(object): 282 """An object-oriented version of `input_layer` that reuses variables.""" 283 284 def __init__(self, 285 feature_columns, 286 weight_collections=None, 287 trainable=True, 288 cols_to_vars=None): 289 """See `input_layer`.""" 290 291 self._feature_columns = feature_columns 292 self._weight_collections = weight_collections 293 self._trainable = trainable 294 self._cols_to_vars = cols_to_vars 295 self._input_layer_template = template.make_template( 296 'feature_column_input_layer', 297 _internal_input_layer, 298 create_scope_now_=True) 299 self._scope = self._input_layer_template.variable_scope 300 301 def __call__(self, features): 302 return self._input_layer_template( 303 features=features, 304 feature_columns=self._feature_columns, 305 weight_collections=self._weight_collections, 306 trainable=self._trainable, 307 cols_to_vars=None, 308 scope=self._scope) 309 310 @property 311 def non_trainable_variables(self): 312 return self._input_layer_template.non_trainable_variables 313 314 @property 315 def non_trainable_weights(self): 316 return self._input_layer_template.non_trainable_weights 317 318 @property 319 def trainable_variables(self): 320 return self._input_layer_template.trainable_variables 321 322 @property 323 def trainable_weights(self): 324 return self._input_layer_template.trainable_weights 325 326 @property 327 def variables(self): 328 return self._input_layer_template.variables 329 330 @property 331 def weights(self): 332 return self._input_layer_template.weights 333 334 335@tf_export('feature_column.linear_model') 336def linear_model(features, 337 feature_columns, 338 units=1, 339 sparse_combiner='sum', 340 weight_collections=None, 341 trainable=True, 342 cols_to_vars=None): 343 """Returns a linear prediction `Tensor` based on given `feature_columns`. 344 345 This function generates a weighted sum based on output dimension `units`. 346 Weighted sum refers to logits in classification problems. It refers to the 347 prediction itself for linear regression problems. 348 349 Note on supported columns: `linear_model` treats categorical columns as 350 `indicator_column`s while `input_layer` explicitly requires wrapping each 351 of them with an `embedding_column` or an `indicator_column`. 352 353 Example: 354 355 ```python 356 price = numeric_column('price') 357 price_buckets = bucketized_column(price, boundaries=[0., 10., 100., 1000.]) 358 keywords = categorical_column_with_hash_bucket("keywords", 10K) 359 keywords_price = crossed_column('keywords', price_buckets, ...) 360 columns = [price_buckets, keywords, keywords_price ...] 361 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 362 prediction = linear_model(features, columns) 363 ``` 364 365 Args: 366 features: A mapping from key to tensors. `_FeatureColumn`s look up via these 367 keys. For example `numeric_column('price')` will look at 'price' key in 368 this dict. Values are `Tensor` or `SparseTensor` depending on 369 corresponding `_FeatureColumn`. 370 feature_columns: An iterable containing the FeatureColumns to use as inputs 371 to your model. All items should be instances of classes derived from 372 `_FeatureColumn`s. 373 units: An integer, dimensionality of the output space. Default value is 1. 374 sparse_combiner: A string specifying how to reduce if a sparse column is 375 multivalent. Currently "mean", "sqrtn" and "sum" are supported, with "sum" 376 the default. "sqrtn" often achieves good accuracy, in particular with 377 bag-of-words columns. It combines each sparse columns independently. 378 * "sum": do not normalize features in the column 379 * "mean": do l1 normalization on features in the column 380 * "sqrtn": do l2 normalization on features in the column 381 weight_collections: A list of collection names to which the Variable will be 382 added. Note that, variables will also be added to collections 383 `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`. 384 trainable: If `True` also add the variable to the graph collection 385 `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). 386 cols_to_vars: If not `None`, must be a dictionary that will be filled with a 387 mapping from `_FeatureColumn` to associated list of `Variable`s. For 388 example, after the call, we might have cols_to_vars = { 389 _NumericColumn( 390 key='numeric_feature1', shape=(1,): 391 [<tf.Variable 'linear_model/price2/weights:0' shape=(1, 1)>], 392 'bias': [<tf.Variable 'linear_model/bias_weights:0' shape=(1,)>], 393 _NumericColumn( 394 key='numeric_feature2', shape=(2,)): 395 [<tf.Variable 'linear_model/price1/weights:0' shape=(2, 1)>]} 396 If a column creates no variables, its value will be an empty list. Note 397 that cols_to_vars will also contain a string key 'bias' that maps to a 398 list of Variables. 399 400 Returns: 401 A `Tensor` which represents predictions/logits of a linear model. Its shape 402 is (batch_size, units) and its dtype is `float32`. 403 404 Raises: 405 ValueError: if an item in `feature_columns` is neither a `_DenseColumn` 406 nor `_CategoricalColumn`. 407 """ 408 feature_columns = _clean_feature_columns(feature_columns) 409 for column in feature_columns: 410 if not isinstance(column, (_DenseColumn, _CategoricalColumn)): 411 raise ValueError('Items of feature_columns must be either a _DenseColumn ' 412 'or _CategoricalColumn. Given: {}'.format(column)) 413 weight_collections = list(weight_collections or []) 414 if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections: 415 weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES) 416 if ops.GraphKeys.MODEL_VARIABLES not in weight_collections: 417 weight_collections.append(ops.GraphKeys.MODEL_VARIABLES) 418 with variable_scope.variable_scope( 419 None, default_name='linear_model', values=features.values()): 420 weighted_sums = [] 421 ordered_columns = [] 422 builder = _LazyBuilder(features) 423 for column in sorted(feature_columns, key=lambda x: x.name): 424 with variable_scope.variable_scope( 425 None, default_name=column._var_scope_name): # pylint: disable=protected-access 426 ordered_columns.append(column) 427 weighted_sum = _create_weighted_sum( 428 column=column, 429 builder=builder, 430 units=units, 431 sparse_combiner=sparse_combiner, 432 weight_collections=weight_collections, 433 trainable=trainable) 434 weighted_sums.append(weighted_sum) 435 if cols_to_vars is not None: 436 # Retrieve the variables created. 437 cols_to_vars[column] = ops.get_collection( 438 ops.GraphKeys.GLOBAL_VARIABLES, 439 scope=variable_scope.get_variable_scope().name) 440 _verify_static_batch_size_equality(weighted_sums, ordered_columns) 441 predictions_no_bias = math_ops.add_n( 442 weighted_sums, name='weighted_sum_no_bias') 443 bias = variable_scope.get_variable( 444 'bias_weights', 445 shape=[units], 446 initializer=init_ops.zeros_initializer(), 447 trainable=trainable, 448 collections=weight_collections) 449 predictions = nn_ops.bias_add( 450 predictions_no_bias, bias, name='weighted_sum') 451 if cols_to_vars is not None: 452 # Add the bias to cols_to_vars as well, converting the Variable or 453 # PartitionedVariable to a list of Variable's. 454 if isinstance(bias, variables.Variable): 455 cols_to_vars['bias'] = [bias] 456 else: # Must be a PartitionedVariable. 457 cols_to_vars['bias'] = list(bias) 458 return predictions 459 460 461def _transform_features(features, feature_columns): 462 """Returns transformed features based on features columns passed in. 463 464 Please note that most probably you would not need to use this function. Please 465 check `input_layer` and `linear_model` to see whether they will 466 satisfy your use case or not. 467 468 Example: 469 470 ```python 471 # Define features and transformations 472 crosses_a_x_b = crossed_column( 473 columns=["sparse_feature_a", "sparse_feature_b"], hash_bucket_size=10000) 474 price_buckets = bucketized_column( 475 source_column=numeric_column("price"), boundaries=[...]) 476 477 columns = [crosses_a_x_b, price_buckets] 478 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 479 transformed = transform_features(features=features, feature_columns=columns) 480 481 assertCountEqual(columns, transformed.keys()) 482 ``` 483 484 Args: 485 features: A mapping from key to tensors. `_FeatureColumn`s look up via these 486 keys. For example `numeric_column('price')` will look at 'price' key in 487 this dict. Values can be a `SparseTensor` or a `Tensor` depends on 488 corresponding `_FeatureColumn`. 489 feature_columns: An iterable containing all the `_FeatureColumn`s. 490 491 Returns: 492 A `dict` mapping `_FeatureColumn` to `Tensor` and `SparseTensor` values. 493 """ 494 feature_columns = _clean_feature_columns(feature_columns) 495 outputs = {} 496 with ops.name_scope( 497 None, default_name='transform_features', values=features.values()): 498 builder = _LazyBuilder(features) 499 for column in sorted(feature_columns, key=lambda x: x.name): 500 with ops.name_scope(None, default_name=column.name): 501 outputs[column] = builder.get(column) 502 return outputs 503 504 505@tf_export('feature_column.make_parse_example_spec') 506def make_parse_example_spec(feature_columns): 507 """Creates parsing spec dictionary from input feature_columns. 508 509 The returned dictionary can be used as arg 'features' in `tf.parse_example`. 510 511 Typical usage example: 512 513 ```python 514 # Define features and transformations 515 feature_a = categorical_column_with_vocabulary_file(...) 516 feature_b = numeric_column(...) 517 feature_c_bucketized = bucketized_column(numeric_column("feature_c"), ...) 518 feature_a_x_feature_c = crossed_column( 519 columns=["feature_a", feature_c_bucketized], ...) 520 521 feature_columns = set( 522 [feature_b, feature_c_bucketized, feature_a_x_feature_c]) 523 features = tf.parse_example( 524 serialized=serialized_examples, 525 features=make_parse_example_spec(feature_columns)) 526 ``` 527 528 For the above example, make_parse_example_spec would return the dict: 529 530 ```python 531 { 532 "feature_a": parsing_ops.VarLenFeature(tf.string), 533 "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32), 534 "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32) 535 } 536 ``` 537 538 Args: 539 feature_columns: An iterable containing all feature columns. All items 540 should be instances of classes derived from `_FeatureColumn`. 541 542 Returns: 543 A dict mapping each feature key to a `FixedLenFeature` or `VarLenFeature` 544 value. 545 546 Raises: 547 ValueError: If any of the given `feature_columns` is not a `_FeatureColumn` 548 instance. 549 """ 550 result = {} 551 for column in feature_columns: 552 if not isinstance(column, _FeatureColumn): 553 raise ValueError( 554 'All feature_columns must be _FeatureColumn instances. ' 555 'Given: {}'.format(column)) 556 config = column._parse_example_spec # pylint: disable=protected-access 557 for key, value in six.iteritems(config): 558 if key in result and value != result[key]: 559 raise ValueError( 560 'feature_columns contain different parse_spec for key ' 561 '{}. Given {} and {}'.format(key, value, result[key])) 562 result.update(config) 563 return result 564 565 566@tf_export('feature_column.embedding_column') 567def embedding_column( 568 categorical_column, dimension, combiner='mean', initializer=None, 569 ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, 570 trainable=True): 571 """`_DenseColumn` that converts from sparse, categorical input. 572 573 Use this when your inputs are sparse, but you want to convert them to a dense 574 representation (e.g., to feed to a DNN). 575 576 Inputs must be a `_CategoricalColumn` created by any of the 577 `categorical_column_*` function. Here is an example of using 578 `embedding_column` with `DNNClassifier`: 579 580 ```python 581 video_id = categorical_column_with_identity( 582 key='video_id', num_buckets=1000000, default_value=0) 583 columns = [embedding_column(video_id, 9),...] 584 585 estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...) 586 587 label_column = ... 588 def input_fn(): 589 features = tf.parse_example( 590 ..., features=make_parse_example_spec(columns + [label_column])) 591 labels = features.pop(label_column.name) 592 return features, labels 593 594 estimator.train(input_fn=input_fn, steps=100) 595 ``` 596 597 Here is an example using `embedding_column` with model_fn: 598 599 ```python 600 def model_fn(features, ...): 601 video_id = categorical_column_with_identity( 602 key='video_id', num_buckets=1000000, default_value=0) 603 columns = [embedding_column(video_id, 9),...] 604 dense_tensor = input_layer(features, columns) 605 # Form DNN layers, calculate loss, and return EstimatorSpec. 606 ... 607 ``` 608 609 Args: 610 categorical_column: A `_CategoricalColumn` created by a 611 `categorical_column_with_*` function. This column produces the sparse IDs 612 that are inputs to the embedding lookup. 613 dimension: An integer specifying dimension of the embedding, must be > 0. 614 combiner: A string specifying how to reduce if there are multiple entries 615 in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with 616 'mean' the default. 'sqrtn' often achieves good accuracy, in particular 617 with bag-of-words columns. Each of this can be thought as example level 618 normalizations on the column. For more information, see 619 `tf.embedding_lookup_sparse`. 620 initializer: A variable initializer function to be used in embedding 621 variable initialization. If not specified, defaults to 622 `tf.truncated_normal_initializer` with mean `0.0` and standard deviation 623 `1/sqrt(dimension)`. 624 ckpt_to_load_from: String representing checkpoint name/pattern from which to 625 restore column weights. Required if `tensor_name_in_ckpt` is not `None`. 626 tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from 627 which to restore the column weights. Required if `ckpt_to_load_from` is 628 not `None`. 629 max_norm: If not `None`, embedding values are l2-normalized to this value. 630 trainable: Whether or not the embedding is trainable. Default is True. 631 632 Returns: 633 `_DenseColumn` that converts from sparse input. 634 635 Raises: 636 ValueError: if `dimension` not > 0. 637 ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt` 638 is specified. 639 ValueError: if `initializer` is specified and is not callable. 640 RuntimeError: If eager execution is enabled. 641 """ 642 if (dimension is None) or (dimension < 1): 643 raise ValueError('Invalid dimension {}.'.format(dimension)) 644 if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None): 645 raise ValueError('Must specify both `ckpt_to_load_from` and ' 646 '`tensor_name_in_ckpt` or none of them.') 647 648 if (initializer is not None) and (not callable(initializer)): 649 raise ValueError('initializer must be callable if specified. ' 650 'Embedding of column_name: {}'.format( 651 categorical_column.name)) 652 if initializer is None: 653 initializer = init_ops.truncated_normal_initializer( 654 mean=0.0, stddev=1 / math.sqrt(dimension)) 655 656 return _EmbeddingColumn( 657 categorical_column=categorical_column, 658 dimension=dimension, 659 combiner=combiner, 660 initializer=initializer, 661 ckpt_to_load_from=ckpt_to_load_from, 662 tensor_name_in_ckpt=tensor_name_in_ckpt, 663 max_norm=max_norm, 664 trainable=trainable) 665 666 667@tf_export('feature_column.shared_embedding_columns') 668def shared_embedding_columns( 669 categorical_columns, dimension, combiner='mean', initializer=None, 670 shared_embedding_collection_name=None, ckpt_to_load_from=None, 671 tensor_name_in_ckpt=None, max_norm=None, trainable=True): 672 """List of dense columns that convert from sparse, categorical input. 673 674 This is similar to `embedding_column`, except that that it produces a list of 675 embedding columns that share the same embedding weights. 676 677 Use this when your inputs are sparse and of the same type (e.g. watched and 678 impression video IDs that share the same vocabulary), and you want to convert 679 them to a dense representation (e.g., to feed to a DNN). 680 681 Inputs must be a list of categorical columns created by any of the 682 `categorical_column_*` function. They must all be of the same type and have 683 the same arguments except `key`. E.g. they can be 684 categorical_column_with_vocabulary_file with the same vocabulary_file. Some or 685 all columns could also be weighted_categorical_column. 686 687 Here is an example embedding of two features for a DNNClassifier model: 688 689 ```python 690 watched_video_id = categorical_column_with_vocabulary_file( 691 'watched_video_id', video_vocabulary_file, video_vocabulary_size) 692 impression_video_id = categorical_column_with_vocabulary_file( 693 'impression_video_id', video_vocabulary_file, video_vocabulary_size) 694 columns = shared_embedding_columns( 695 [watched_video_id, impression_video_id], dimension=10) 696 697 estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...) 698 699 label_column = ... 700 def input_fn(): 701 features = tf.parse_example( 702 ..., features=make_parse_example_spec(columns + [label_column])) 703 labels = features.pop(label_column.name) 704 return features, labels 705 706 estimator.train(input_fn=input_fn, steps=100) 707 ``` 708 709 Here is an example using `shared_embedding_columns` with model_fn: 710 711 ```python 712 def model_fn(features, ...): 713 watched_video_id = categorical_column_with_vocabulary_file( 714 'watched_video_id', video_vocabulary_file, video_vocabulary_size) 715 impression_video_id = categorical_column_with_vocabulary_file( 716 'impression_video_id', video_vocabulary_file, video_vocabulary_size) 717 columns = shared_embedding_columns( 718 [watched_video_id, impression_video_id], dimension=10) 719 dense_tensor = input_layer(features, columns) 720 # Form DNN layers, calculate loss, and return EstimatorSpec. 721 ... 722 ``` 723 724 Args: 725 categorical_columns: List of categorical columns created by a 726 `categorical_column_with_*` function. These columns produce the sparse IDs 727 that are inputs to the embedding lookup. All columns must be of the same 728 type and have the same arguments except `key`. E.g. they can be 729 categorical_column_with_vocabulary_file with the same vocabulary_file. 730 Some or all columns could also be weighted_categorical_column. 731 dimension: An integer specifying dimension of the embedding, must be > 0. 732 combiner: A string specifying how to reduce if there are multiple entries 733 in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with 734 'mean' the default. 'sqrtn' often achieves good accuracy, in particular 735 with bag-of-words columns. Each of this can be thought as example level 736 normalizations on the column. For more information, see 737 `tf.embedding_lookup_sparse`. 738 initializer: A variable initializer function to be used in embedding 739 variable initialization. If not specified, defaults to 740 `tf.truncated_normal_initializer` with mean `0.0` and standard deviation 741 `1/sqrt(dimension)`. 742 shared_embedding_collection_name: Optional name of the collection where 743 shared embedding weights are added. If not given, a reasonable name will 744 be chosen based on the names of `categorical_columns`. This is also used 745 in `variable_scope` when creating shared embedding weights. 746 ckpt_to_load_from: String representing checkpoint name/pattern from which to 747 restore column weights. Required if `tensor_name_in_ckpt` is not `None`. 748 tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from 749 which to restore the column weights. Required if `ckpt_to_load_from` is 750 not `None`. 751 max_norm: If not `None`, embedding values are l2-normalized to this value. 752 trainable: Whether or not the embedding is trainable. Default is True. 753 754 Returns: 755 A list of dense columns that converts from sparse input. The order of 756 results follows the ordering of `categorical_columns`. 757 758 Raises: 759 ValueError: if `dimension` not > 0. 760 ValueError: if any of the given `categorical_columns` is of different type 761 or has different arguments than the others. 762 ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt` 763 is specified. 764 ValueError: if `initializer` is specified and is not callable. 765 """ 766 if (dimension is None) or (dimension < 1): 767 raise ValueError('Invalid dimension {}.'.format(dimension)) 768 if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None): 769 raise ValueError('Must specify both `ckpt_to_load_from` and ' 770 '`tensor_name_in_ckpt` or none of them.') 771 772 if (initializer is not None) and (not callable(initializer)): 773 raise ValueError('initializer must be callable if specified.') 774 if initializer is None: 775 initializer = init_ops.truncated_normal_initializer( 776 mean=0.0, stddev=1. / math.sqrt(dimension)) 777 778 # Sort the columns so the default collection name is deterministic even if the 779 # user passes columns from an unsorted collection, such as dict.values(). 780 sorted_columns = sorted(categorical_columns, key=lambda x: x.name) 781 782 c0 = sorted_columns[0] 783 if not isinstance(c0, _CategoricalColumn): 784 raise ValueError( 785 'All categorical_columns must be subclasses of _CategoricalColumn. ' 786 'Given: {}, of type: {}'.format(c0, type(c0))) 787 if isinstance(c0, _WeightedCategoricalColumn): 788 c0 = c0.categorical_column 789 for c in sorted_columns[1:]: 790 if isinstance(c, _WeightedCategoricalColumn): 791 c = c.categorical_column 792 if not isinstance(c, type(c0)): 793 raise ValueError( 794 'To use shared_embedding_column, all categorical_columns must have ' 795 'the same type, or be weighted_categorical_column of the same type. ' 796 'Given column: {} of type: {} does not match given column: {} of ' 797 'type: {}'.format(c0, type(c0), c, type(c))) 798 799 if not shared_embedding_collection_name: 800 shared_embedding_collection_name = '_'.join(c.name for c in sorted_columns) 801 shared_embedding_collection_name += '_shared_embedding' 802 803 result = [] 804 for column in categorical_columns: 805 result.append(_SharedEmbeddingColumn( 806 categorical_column=column, 807 dimension=dimension, 808 combiner=combiner, 809 initializer=initializer, 810 shared_embedding_collection_name=shared_embedding_collection_name, 811 ckpt_to_load_from=ckpt_to_load_from, 812 tensor_name_in_ckpt=tensor_name_in_ckpt, 813 max_norm=max_norm, 814 trainable=trainable)) 815 return result 816 817 818@tf_export('feature_column.numeric_column') 819def numeric_column(key, 820 shape=(1,), 821 default_value=None, 822 dtype=dtypes.float32, 823 normalizer_fn=None): 824 """Represents real valued or numerical features. 825 826 Example: 827 828 ```python 829 price = numeric_column('price') 830 columns = [price, ...] 831 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 832 dense_tensor = input_layer(features, columns) 833 834 # or 835 bucketized_price = bucketized_column(price, boundaries=[...]) 836 columns = [bucketized_price, ...] 837 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 838 linear_prediction = linear_model(features, columns) 839 ``` 840 841 Args: 842 key: A unique string identifying the input feature. It is used as the 843 column name and the dictionary key for feature parsing configs, feature 844 `Tensor` objects, and feature columns. 845 shape: An iterable of integers specifies the shape of the `Tensor`. An 846 integer can be given which means a single dimension `Tensor` with given 847 width. The `Tensor` representing the column will have the shape of 848 [batch_size] + `shape`. 849 default_value: A single value compatible with `dtype` or an iterable of 850 values compatible with `dtype` which the column takes on during 851 `tf.Example` parsing if data is missing. A default value of `None` will 852 cause `tf.parse_example` to fail if an example does not contain this 853 column. If a single value is provided, the same value will be applied as 854 the default value for every item. If an iterable of values is provided, 855 the shape of the `default_value` should be equal to the given `shape`. 856 dtype: defines the type of values. Default value is `tf.float32`. Must be a 857 non-quantized, real integer or floating point type. 858 normalizer_fn: If not `None`, a function that can be used to normalize the 859 value of the tensor after `default_value` is applied for parsing. 860 Normalizer function takes the input `Tensor` as its argument, and returns 861 the output `Tensor`. (e.g. lambda x: (x - 3.0) / 4.2). Please note that 862 even though the most common use case of this function is normalization, it 863 can be used for any kind of Tensorflow transformations. 864 865 Returns: 866 A `_NumericColumn`. 867 868 Raises: 869 TypeError: if any dimension in shape is not an int 870 ValueError: if any dimension in shape is not a positive integer 871 TypeError: if `default_value` is an iterable but not compatible with `shape` 872 TypeError: if `default_value` is not compatible with `dtype`. 873 ValueError: if `dtype` is not convertible to `tf.float32`. 874 """ 875 shape = _check_shape(shape, key) 876 if not (dtype.is_integer or dtype.is_floating): 877 raise ValueError('dtype must be convertible to float. ' 878 'dtype: {}, key: {}'.format(dtype, key)) 879 default_value = _check_default_value(shape, default_value, dtype, key) 880 881 if normalizer_fn is not None and not callable(normalizer_fn): 882 raise TypeError( 883 'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn)) 884 885 return _NumericColumn( 886 key, 887 shape=shape, 888 default_value=default_value, 889 dtype=dtype, 890 normalizer_fn=normalizer_fn) 891 892 893@tf_export('feature_column.bucketized_column') 894def bucketized_column(source_column, boundaries): 895 """Represents discretized dense input. 896 897 Buckets include the left boundary, and exclude the right boundary. Namely, 898 `boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`, 899 `[1., 2.)`, and `[2., +inf)`. 900 901 For example, if the inputs are 902 903 ```python 904 boundaries = [0, 10, 100] 905 input tensor = [[-5, 10000] 906 [150, 10] 907 [5, 100]] 908 ``` 909 910 then the output will be 911 912 ```python 913 output = [[0, 3] 914 [3, 2] 915 [1, 3]] 916 ``` 917 918 Example: 919 920 ```python 921 price = numeric_column('price') 922 bucketized_price = bucketized_column(price, boundaries=[...]) 923 columns = [bucketized_price, ...] 924 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 925 linear_prediction = linear_model(features, columns) 926 927 # or 928 columns = [bucketized_price, ...] 929 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 930 dense_tensor = input_layer(features, columns) 931 ``` 932 933 `bucketized_column` can also be crossed with another categorical column using 934 `crossed_column`: 935 936 ```python 937 price = numeric_column('price') 938 # bucketized_column converts numerical feature to a categorical one. 939 bucketized_price = bucketized_column(price, boundaries=[...]) 940 # 'keywords' is a string feature. 941 price_x_keywords = crossed_column([bucketized_price, 'keywords'], 50K) 942 columns = [price_x_keywords, ...] 943 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 944 linear_prediction = linear_model(features, columns) 945 ``` 946 947 Args: 948 source_column: A one-dimensional dense column which is generated with 949 `numeric_column`. 950 boundaries: A sorted list or tuple of floats specifying the boundaries. 951 952 Returns: 953 A `_BucketizedColumn`. 954 955 Raises: 956 ValueError: If `source_column` is not a numeric column, or if it is not 957 one-dimensional. 958 ValueError: If `boundaries` is not a sorted list or tuple. 959 """ 960 if not isinstance(source_column, _NumericColumn): 961 raise ValueError( 962 'source_column must be a column generated with numeric_column(). ' 963 'Given: {}'.format(source_column)) 964 if len(source_column.shape) > 1: 965 raise ValueError( 966 'source_column must be one-dimensional column. ' 967 'Given: {}'.format(source_column)) 968 if (not boundaries or 969 not (isinstance(boundaries, list) or isinstance(boundaries, tuple))): 970 raise ValueError('boundaries must be a sorted list.') 971 for i in range(len(boundaries) - 1): 972 if boundaries[i] >= boundaries[i + 1]: 973 raise ValueError('boundaries must be a sorted list.') 974 return _BucketizedColumn(source_column, tuple(boundaries)) 975 976 977def _assert_string_or_int(dtype, prefix): 978 if (dtype != dtypes.string) and (not dtype.is_integer): 979 raise ValueError( 980 '{} dtype must be string or integer. dtype: {}.'.format(prefix, dtype)) 981 982 983@tf_export('feature_column.categorical_column_with_hash_bucket') 984def categorical_column_with_hash_bucket(key, 985 hash_bucket_size, 986 dtype=dtypes.string): 987 """Represents sparse feature where ids are set by hashing. 988 989 Use this when your sparse features are in string or integer format, and you 990 want to distribute your inputs into a finite number of buckets by hashing. 991 output_id = Hash(input_feature_string) % bucket_size 992 993 For input dictionary `features`, `features[key]` is either `Tensor` or 994 `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int 995 and `''` for string. Note that these values are independent of the 996 `default_value` argument. 997 998 Example: 999 1000 ```python 1001 keywords = categorical_column_with_hash_bucket("keywords", 10K) 1002 columns = [keywords, ...] 1003 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1004 linear_prediction = linear_model(features, columns) 1005 1006 # or 1007 keywords_embedded = embedding_column(keywords, 16) 1008 columns = [keywords_embedded, ...] 1009 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1010 dense_tensor = input_layer(features, columns) 1011 ``` 1012 1013 Args: 1014 key: A unique string identifying the input feature. It is used as the 1015 column name and the dictionary key for feature parsing configs, feature 1016 `Tensor` objects, and feature columns. 1017 hash_bucket_size: An int > 1. The number of buckets. 1018 dtype: The type of features. Only string and integer types are supported. 1019 1020 Returns: 1021 A `_HashedCategoricalColumn`. 1022 1023 Raises: 1024 ValueError: `hash_bucket_size` is not greater than 1. 1025 ValueError: `dtype` is neither string nor integer. 1026 """ 1027 if hash_bucket_size is None: 1028 raise ValueError('hash_bucket_size must be set. ' 'key: {}'.format(key)) 1029 1030 if hash_bucket_size < 1: 1031 raise ValueError('hash_bucket_size must be at least 1. ' 1032 'hash_bucket_size: {}, key: {}'.format( 1033 hash_bucket_size, key)) 1034 1035 _assert_string_or_int(dtype, prefix='column_name: {}'.format(key)) 1036 1037 return _HashedCategoricalColumn(key, hash_bucket_size, dtype) 1038 1039 1040@tf_export('feature_column.categorical_column_with_vocabulary_file') 1041def categorical_column_with_vocabulary_file(key, 1042 vocabulary_file, 1043 vocabulary_size=None, 1044 num_oov_buckets=0, 1045 default_value=None, 1046 dtype=dtypes.string): 1047 """A `_CategoricalColumn` with a vocabulary file. 1048 1049 Use this when your inputs are in string or integer format, and you have a 1050 vocabulary file that maps each value to an integer ID. By default, 1051 out-of-vocabulary values are ignored. Use either (but not both) of 1052 `num_oov_buckets` and `default_value` to specify how to include 1053 out-of-vocabulary values. 1054 1055 For input dictionary `features`, `features[key]` is either `Tensor` or 1056 `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int 1057 and `''` for string. Note that these values are independent of the 1058 `default_value` argument. 1059 1060 Example with `num_oov_buckets`: 1061 File '/us/states.txt' contains 50 lines, each with a 2-character U.S. state 1062 abbreviation. All inputs with values in that file are assigned an ID 0-49, 1063 corresponding to its line number. All other values are hashed and assigned an 1064 ID 50-54. 1065 1066 ```python 1067 states = categorical_column_with_vocabulary_file( 1068 key='states', vocabulary_file='/us/states.txt', vocabulary_size=50, 1069 num_oov_buckets=5) 1070 columns = [states, ...] 1071 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1072 linear_prediction = linear_model(features, columns) 1073 ``` 1074 1075 Example with `default_value`: 1076 File '/us/states.txt' contains 51 lines - the first line is 'XX', and the 1077 other 50 each have a 2-character U.S. state abbreviation. Both a literal 'XX' 1078 in input, and other values missing from the file, will be assigned ID 0. All 1079 others are assigned the corresponding line number 1-50. 1080 1081 ```python 1082 states = categorical_column_with_vocabulary_file( 1083 key='states', vocabulary_file='/us/states.txt', vocabulary_size=51, 1084 default_value=0) 1085 columns = [states, ...] 1086 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1087 linear_prediction, _, _ = linear_model(features, columns) 1088 ``` 1089 1090 And to make an embedding with either: 1091 1092 ```python 1093 columns = [embedding_column(states, 3),...] 1094 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1095 dense_tensor = input_layer(features, columns) 1096 ``` 1097 1098 Args: 1099 key: A unique string identifying the input feature. It is used as the 1100 column name and the dictionary key for feature parsing configs, feature 1101 `Tensor` objects, and feature columns. 1102 vocabulary_file: The vocabulary file name. 1103 vocabulary_size: Number of the elements in the vocabulary. This must be no 1104 greater than length of `vocabulary_file`, if less than length, later 1105 values are ignored. If None, it is set to the length of `vocabulary_file`. 1106 num_oov_buckets: Non-negative integer, the number of out-of-vocabulary 1107 buckets. All out-of-vocabulary inputs will be assigned IDs in the range 1108 `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of 1109 the input value. A positive `num_oov_buckets` can not be specified with 1110 `default_value`. 1111 default_value: The integer ID value to return for out-of-vocabulary feature 1112 values, defaults to `-1`. This can not be specified with a positive 1113 `num_oov_buckets`. 1114 dtype: The type of features. Only string and integer types are supported. 1115 1116 Returns: 1117 A `_CategoricalColumn` with a vocabulary file. 1118 1119 Raises: 1120 ValueError: `vocabulary_file` is missing or cannot be opened. 1121 ValueError: `vocabulary_size` is missing or < 1. 1122 ValueError: `num_oov_buckets` is a negative integer. 1123 ValueError: `num_oov_buckets` and `default_value` are both specified. 1124 ValueError: `dtype` is neither string nor integer. 1125 """ 1126 if not vocabulary_file: 1127 raise ValueError('Missing vocabulary_file in {}.'.format(key)) 1128 1129 if vocabulary_size is None: 1130 if not gfile.Exists(vocabulary_file): 1131 raise ValueError('vocabulary_file in {} does not exist.'.format(key)) 1132 1133 with gfile.GFile(vocabulary_file) as f: 1134 vocabulary_size = sum(1 for _ in f) 1135 logging.info( 1136 'vocabulary_size = %d in %s is inferred from the number of elements ' 1137 'in the vocabulary_file %s.', vocabulary_size, key, vocabulary_file) 1138 1139 # `vocabulary_size` isn't required for lookup, but it is for `_num_buckets`. 1140 if vocabulary_size < 1: 1141 raise ValueError('Invalid vocabulary_size in {}.'.format(key)) 1142 if num_oov_buckets: 1143 if default_value is not None: 1144 raise ValueError( 1145 'Can\'t specify both num_oov_buckets and default_value in {}.'.format( 1146 key)) 1147 if num_oov_buckets < 0: 1148 raise ValueError('Invalid num_oov_buckets {} in {}.'.format( 1149 num_oov_buckets, key)) 1150 _assert_string_or_int(dtype, prefix='column_name: {}'.format(key)) 1151 return _VocabularyFileCategoricalColumn( 1152 key=key, 1153 vocabulary_file=vocabulary_file, 1154 vocabulary_size=vocabulary_size, 1155 num_oov_buckets=0 if num_oov_buckets is None else num_oov_buckets, 1156 default_value=-1 if default_value is None else default_value, 1157 dtype=dtype) 1158 1159 1160@tf_export('feature_column.categorical_column_with_vocabulary_list') 1161def categorical_column_with_vocabulary_list( 1162 key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0): 1163 """A `_CategoricalColumn` with in-memory vocabulary. 1164 1165 Use this when your inputs are in string or integer format, and you have an 1166 in-memory vocabulary mapping each value to an integer ID. By default, 1167 out-of-vocabulary values are ignored. Use either (but not both) of 1168 `num_oov_buckets` and `default_value` to specify how to include 1169 out-of-vocabulary values. 1170 1171 For input dictionary `features`, `features[key]` is either `Tensor` or 1172 `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int 1173 and `''` for string. Note that these values are independent of the 1174 `default_value` argument. 1175 1176 Example with `num_oov_buckets`: 1177 In the following example, each input in `vocabulary_list` is assigned an ID 1178 0-3 corresponding to its index (e.g., input 'B' produces output 2). All other 1179 inputs are hashed and assigned an ID 4-5. 1180 1181 ```python 1182 colors = categorical_column_with_vocabulary_list( 1183 key='colors', vocabulary_list=('R', 'G', 'B', 'Y'), 1184 num_oov_buckets=2) 1185 columns = [colors, ...] 1186 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1187 linear_prediction, _, _ = linear_model(features, columns) 1188 ``` 1189 1190 Example with `default_value`: 1191 In the following example, each input in `vocabulary_list` is assigned an ID 1192 0-4 corresponding to its index (e.g., input 'B' produces output 3). All other 1193 inputs are assigned `default_value` 0. 1194 1195 1196 ```python 1197 colors = categorical_column_with_vocabulary_list( 1198 key='colors', vocabulary_list=('X', 'R', 'G', 'B', 'Y'), default_value=0) 1199 columns = [colors, ...] 1200 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1201 linear_prediction, _, _ = linear_model(features, columns) 1202 ``` 1203 1204 And to make an embedding with either: 1205 1206 ```python 1207 columns = [embedding_column(colors, 3),...] 1208 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1209 dense_tensor = input_layer(features, columns) 1210 ``` 1211 1212 Args: 1213 key: A unique string identifying the input feature. It is used as the 1214 column name and the dictionary key for feature parsing configs, feature 1215 `Tensor` objects, and feature columns. 1216 vocabulary_list: An ordered iterable defining the vocabulary. Each feature 1217 is mapped to the index of its value (if present) in `vocabulary_list`. 1218 Must be castable to `dtype`. 1219 dtype: The type of features. Only string and integer types are supported. 1220 If `None`, it will be inferred from `vocabulary_list`. 1221 default_value: The integer ID value to return for out-of-vocabulary feature 1222 values, defaults to `-1`. This can not be specified with a positive 1223 `num_oov_buckets`. 1224 num_oov_buckets: Non-negative integer, the number of out-of-vocabulary 1225 buckets. All out-of-vocabulary inputs will be assigned IDs in the range 1226 `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on a 1227 hash of the input value. A positive `num_oov_buckets` can not be specified 1228 with `default_value`. 1229 1230 Returns: 1231 A `_CategoricalColumn` with in-memory vocabulary. 1232 1233 Raises: 1234 ValueError: if `vocabulary_list` is empty, or contains duplicate keys. 1235 ValueError: `num_oov_buckets` is a negative integer. 1236 ValueError: `num_oov_buckets` and `default_value` are both specified. 1237 ValueError: if `dtype` is not integer or string. 1238 """ 1239 if (vocabulary_list is None) or (len(vocabulary_list) < 1): 1240 raise ValueError( 1241 'vocabulary_list {} must be non-empty, column_name: {}'.format( 1242 vocabulary_list, key)) 1243 if len(set(vocabulary_list)) != len(vocabulary_list): 1244 raise ValueError( 1245 'Duplicate keys in vocabulary_list {}, column_name: {}'.format( 1246 vocabulary_list, key)) 1247 vocabulary_dtype = dtypes.as_dtype(np.array(vocabulary_list).dtype) 1248 if num_oov_buckets: 1249 if default_value != -1: 1250 raise ValueError( 1251 'Can\'t specify both num_oov_buckets and default_value in {}.'.format( 1252 key)) 1253 if num_oov_buckets < 0: 1254 raise ValueError('Invalid num_oov_buckets {} in {}.'.format( 1255 num_oov_buckets, key)) 1256 _assert_string_or_int( 1257 vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key)) 1258 if dtype is None: 1259 dtype = vocabulary_dtype 1260 elif dtype.is_integer != vocabulary_dtype.is_integer: 1261 raise ValueError( 1262 'dtype {} and vocabulary dtype {} do not match, column_name: {}'.format( 1263 dtype, vocabulary_dtype, key)) 1264 _assert_string_or_int(dtype, prefix='column_name: {}'.format(key)) 1265 1266 return _VocabularyListCategoricalColumn( 1267 key=key, vocabulary_list=tuple(vocabulary_list), dtype=dtype, 1268 default_value=default_value, num_oov_buckets=num_oov_buckets) 1269 1270 1271@tf_export('feature_column.categorical_column_with_identity') 1272def categorical_column_with_identity(key, num_buckets, default_value=None): 1273 """A `_CategoricalColumn` that returns identity values. 1274 1275 Use this when your inputs are integers in the range `[0, num_buckets)`, and 1276 you want to use the input value itself as the categorical ID. Values outside 1277 this range will result in `default_value` if specified, otherwise it will 1278 fail. 1279 1280 Typically, this is used for contiguous ranges of integer indexes, but 1281 it doesn't have to be. This might be inefficient, however, if many of IDs 1282 are unused. Consider `categorical_column_with_hash_bucket` in that case. 1283 1284 For input dictionary `features`, `features[key]` is either `Tensor` or 1285 `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int 1286 and `''` for string. Note that these values are independent of the 1287 `default_value` argument. 1288 1289 In the following examples, each input in the range `[0, 1000000)` is assigned 1290 the same value. All other inputs are assigned `default_value` 0. Note that a 1291 literal 0 in inputs will result in the same default ID. 1292 1293 Linear model: 1294 1295 ```python 1296 video_id = categorical_column_with_identity( 1297 key='video_id', num_buckets=1000000, default_value=0) 1298 columns = [video_id, ...] 1299 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1300 linear_prediction, _, _ = linear_model(features, columns) 1301 ``` 1302 1303 Embedding for a DNN model: 1304 1305 ```python 1306 columns = [embedding_column(video_id, 9),...] 1307 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1308 dense_tensor = input_layer(features, columns) 1309 ``` 1310 1311 Args: 1312 key: A unique string identifying the input feature. It is used as the 1313 column name and the dictionary key for feature parsing configs, feature 1314 `Tensor` objects, and feature columns. 1315 num_buckets: Range of inputs and outputs is `[0, num_buckets)`. 1316 default_value: If `None`, this column's graph operations will fail for 1317 out-of-range inputs. Otherwise, this value must be in the range 1318 `[0, num_buckets)`, and will replace inputs in that range. 1319 1320 Returns: 1321 A `_CategoricalColumn` that returns identity values. 1322 1323 Raises: 1324 ValueError: if `num_buckets` is less than one. 1325 ValueError: if `default_value` is not in range `[0, num_buckets)`. 1326 """ 1327 if num_buckets < 1: 1328 raise ValueError( 1329 'num_buckets {} < 1, column_name {}'.format(num_buckets, key)) 1330 if (default_value is not None) and ( 1331 (default_value < 0) or (default_value >= num_buckets)): 1332 raise ValueError( 1333 'default_value {} not in range [0, {}), column_name {}'.format( 1334 default_value, num_buckets, key)) 1335 return _IdentityCategoricalColumn( 1336 key=key, num_buckets=num_buckets, default_value=default_value) 1337 1338 1339@tf_export('feature_column.indicator_column') 1340def indicator_column(categorical_column): 1341 """Represents multi-hot representation of given categorical column. 1342 1343 Used to wrap any `categorical_column_*` (e.g., to feed to DNN). Use 1344 `embedding_column` if the inputs are sparse. 1345 1346 ```python 1347 name = indicator_column(categorical_column_with_vocabulary_list( 1348 'name', ['bob', 'george', 'wanda']) 1349 columns = [name, ...] 1350 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1351 dense_tensor = input_layer(features, columns) 1352 1353 dense_tensor == [[1, 0, 0]] # If "name" bytes_list is ["bob"] 1354 dense_tensor == [[1, 0, 1]] # If "name" bytes_list is ["bob", "wanda"] 1355 dense_tensor == [[2, 0, 0]] # If "name" bytes_list is ["bob", "bob"] 1356 ``` 1357 1358 Args: 1359 categorical_column: A `_CategoricalColumn` which is created by 1360 `categorical_column_with_*` or `crossed_column` functions. 1361 1362 Returns: 1363 An `_IndicatorColumn`. 1364 """ 1365 return _IndicatorColumn(categorical_column) 1366 1367 1368@tf_export('feature_column.weighted_categorical_column') 1369def weighted_categorical_column( 1370 categorical_column, weight_feature_key, dtype=dtypes.float32): 1371 """Applies weight values to a `_CategoricalColumn`. 1372 1373 Use this when each of your sparse inputs has both an ID and a value. For 1374 example, if you're representing text documents as a collection of word 1375 frequencies, you can provide 2 parallel sparse input features ('terms' and 1376 'frequencies' below). 1377 1378 Example: 1379 1380 Input `tf.Example` objects: 1381 1382 ```proto 1383 [ 1384 features { 1385 feature { 1386 key: "terms" 1387 value {bytes_list {value: "very" value: "model"}} 1388 } 1389 feature { 1390 key: "frequencies" 1391 value {float_list {value: 0.3 value: 0.1}} 1392 } 1393 }, 1394 features { 1395 feature { 1396 key: "terms" 1397 value {bytes_list {value: "when" value: "course" value: "human"}} 1398 } 1399 feature { 1400 key: "frequencies" 1401 value {float_list {value: 0.4 value: 0.1 value: 0.2}} 1402 } 1403 } 1404 ] 1405 ``` 1406 1407 ```python 1408 categorical_column = categorical_column_with_hash_bucket( 1409 column_name='terms', hash_bucket_size=1000) 1410 weighted_column = weighted_categorical_column( 1411 categorical_column=categorical_column, weight_feature_key='frequencies') 1412 columns = [weighted_column, ...] 1413 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1414 linear_prediction, _, _ = linear_model(features, columns) 1415 ``` 1416 1417 This assumes the input dictionary contains a `SparseTensor` for key 1418 'terms', and a `SparseTensor` for key 'frequencies'. These 2 tensors must have 1419 the same indices and dense shape. 1420 1421 Args: 1422 categorical_column: A `_CategoricalColumn` created by 1423 `categorical_column_with_*` functions. 1424 weight_feature_key: String key for weight values. 1425 dtype: Type of weights, such as `tf.float32`. Only float and integer weights 1426 are supported. 1427 1428 Returns: 1429 A `_CategoricalColumn` composed of two sparse features: one represents id, 1430 the other represents weight (value) of the id feature in that example. 1431 1432 Raises: 1433 ValueError: if `dtype` is not convertible to float. 1434 """ 1435 if (dtype is None) or not (dtype.is_integer or dtype.is_floating): 1436 raise ValueError('dtype {} is not convertible to float.'.format(dtype)) 1437 return _WeightedCategoricalColumn( 1438 categorical_column=categorical_column, 1439 weight_feature_key=weight_feature_key, 1440 dtype=dtype) 1441 1442 1443@tf_export('feature_column.crossed_column') 1444def crossed_column(keys, hash_bucket_size, hash_key=None): 1445 """Returns a column for performing crosses of categorical features. 1446 1447 Crossed features will be hashed according to `hash_bucket_size`. Conceptually, 1448 the transformation can be thought of as: 1449 Hash(cartesian product of features) % `hash_bucket_size` 1450 1451 For example, if the input features are: 1452 1453 * SparseTensor referred by first key: 1454 1455 ```python 1456 shape = [2, 2] 1457 { 1458 [0, 0]: "a" 1459 [1, 0]: "b" 1460 [1, 1]: "c" 1461 } 1462 ``` 1463 1464 * SparseTensor referred by second key: 1465 1466 ```python 1467 shape = [2, 1] 1468 { 1469 [0, 0]: "d" 1470 [1, 0]: "e" 1471 } 1472 ``` 1473 1474 then crossed feature will look like: 1475 1476 ```python 1477 shape = [2, 2] 1478 { 1479 [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size 1480 [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size 1481 [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size 1482 } 1483 ``` 1484 1485 Here is an example to create a linear model with crosses of string features: 1486 1487 ```python 1488 keywords_x_doc_terms = crossed_column(['keywords', 'doc_terms'], 50K) 1489 columns = [keywords_x_doc_terms, ...] 1490 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1491 linear_prediction = linear_model(features, columns) 1492 ``` 1493 1494 You could also use vocabulary lookup before crossing: 1495 1496 ```python 1497 keywords = categorical_column_with_vocabulary_file( 1498 'keywords', '/path/to/vocabulary/file', vocabulary_size=1K) 1499 keywords_x_doc_terms = crossed_column([keywords, 'doc_terms'], 50K) 1500 columns = [keywords_x_doc_terms, ...] 1501 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1502 linear_prediction = linear_model(features, columns) 1503 ``` 1504 1505 If an input feature is of numeric type, you can use 1506 `categorical_column_with_identity`, or `bucketized_column`, as in the example: 1507 1508 ```python 1509 # vertical_id is an integer categorical feature. 1510 vertical_id = categorical_column_with_identity('vertical_id', 10K) 1511 price = numeric_column('price') 1512 # bucketized_column converts numerical feature to a categorical one. 1513 bucketized_price = bucketized_column(price, boundaries=[...]) 1514 vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50K) 1515 columns = [vertical_id_x_price, ...] 1516 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1517 linear_prediction = linear_model(features, columns) 1518 ``` 1519 1520 To use crossed column in DNN model, you need to add it in an embedding column 1521 as in this example: 1522 1523 ```python 1524 vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50K) 1525 vertical_id_x_price_embedded = embedding_column(vertical_id_x_price, 10) 1526 dense_tensor = input_layer(features, [vertical_id_x_price_embedded, ...]) 1527 ``` 1528 1529 Args: 1530 keys: An iterable identifying the features to be crossed. Each element can 1531 be either: 1532 * string: Will use the corresponding feature which must be of string type. 1533 * `_CategoricalColumn`: Will use the transformed tensor produced by this 1534 column. Does not support hashed categorical column. 1535 hash_bucket_size: An int > 1. The number of buckets. 1536 hash_key: Specify the hash_key that will be used by the `FingerprintCat64` 1537 function to combine the crosses fingerprints on SparseCrossOp (optional). 1538 1539 Returns: 1540 A `_CrossedColumn`. 1541 1542 Raises: 1543 ValueError: If `len(keys) < 2`. 1544 ValueError: If any of the keys is neither a string nor `_CategoricalColumn`. 1545 ValueError: If any of the keys is `_HashedCategoricalColumn`. 1546 ValueError: If `hash_bucket_size < 1`. 1547 """ 1548 if not hash_bucket_size or hash_bucket_size < 1: 1549 raise ValueError('hash_bucket_size must be > 1. ' 1550 'hash_bucket_size: {}'.format(hash_bucket_size)) 1551 if not keys or len(keys) < 2: 1552 raise ValueError( 1553 'keys must be a list with length > 1. Given: {}'.format(keys)) 1554 for key in keys: 1555 if (not isinstance(key, six.string_types) and 1556 not isinstance(key, _CategoricalColumn)): 1557 raise ValueError( 1558 'Unsupported key type. All keys must be either string, or ' 1559 'categorical column except _HashedCategoricalColumn. ' 1560 'Given: {}'.format(key)) 1561 if isinstance(key, _HashedCategoricalColumn): 1562 raise ValueError( 1563 'categorical_column_with_hash_bucket is not supported for crossing. ' 1564 'Hashing before crossing will increase probability of collision. ' 1565 'Instead, use the feature name as a string. Given: {}'.format(key)) 1566 return _CrossedColumn( 1567 keys=tuple(keys), hash_bucket_size=hash_bucket_size, 1568 hash_key=hash_key) 1569 1570 1571class _FeatureColumn(object): 1572 """Represents a feature column abstraction. 1573 1574 WARNING: Do not subclass this layer unless you know what you are doing: 1575 the API is subject to future changes. 1576 1577 To distinguish the concept of a feature family and a specific binary feature 1578 within a family, we refer to a feature family like "country" as a feature 1579 column. Following is an example feature in a `tf.Example` format: 1580 {key: "country", value: [ "US" ]} 1581 In this example the value of feature is "US" and "country" refers to the 1582 column of the feature. 1583 1584 This class is an abstract class. User should not create instances of this. 1585 """ 1586 __metaclass__ = abc.ABCMeta 1587 1588 @abc.abstractproperty 1589 def name(self): 1590 """Returns string. Used for naming and for name_scope.""" 1591 pass 1592 1593 @property 1594 def _var_scope_name(self): 1595 """Returns string. Used for variable_scope. Defaults to self.name.""" 1596 return self.name 1597 1598 @abc.abstractmethod 1599 def _transform_feature(self, inputs): 1600 """Returns intermediate representation (usually a `Tensor`). 1601 1602 Uses `inputs` to create an intermediate representation (usually a `Tensor`) 1603 that other feature columns can use. 1604 1605 Example usage of `inputs`: 1606 Let's say a Feature column depends on raw feature ('raw') and another 1607 `_FeatureColumn` (input_fc). To access corresponding `Tensor`s, inputs will 1608 be used as follows: 1609 1610 ```python 1611 raw_tensor = inputs.get('raw') 1612 fc_tensor = inputs.get(input_fc) 1613 ``` 1614 1615 Args: 1616 inputs: A `_LazyBuilder` object to access inputs. 1617 1618 Returns: 1619 Transformed feature `Tensor`. 1620 """ 1621 pass 1622 1623 @abc.abstractproperty 1624 def _parse_example_spec(self): 1625 """Returns a `tf.Example` parsing spec as dict. 1626 1627 It is used for get_parsing_spec for `tf.parse_example`. Returned spec is a 1628 dict from keys ('string') to `VarLenFeature`, `FixedLenFeature`, and other 1629 supported objects. Please check documentation of ${tf.parse_example} for all 1630 supported spec objects. 1631 1632 Let's say a Feature column depends on raw feature ('raw') and another 1633 `_FeatureColumn` (input_fc). One possible implementation of 1634 _parse_example_spec is as follows: 1635 1636 ```python 1637 spec = {'raw': tf.FixedLenFeature(...)} 1638 spec.update(input_fc._parse_example_spec) 1639 return spec 1640 ``` 1641 """ 1642 pass 1643 1644 1645class _DenseColumn(_FeatureColumn): 1646 """Represents a column which can be represented as `Tensor`. 1647 1648 WARNING: Do not subclass this layer unless you know what you are doing: 1649 the API is subject to future changes. 1650 1651 Some examples of this type are: numeric_column, embedding_column, 1652 indicator_column. 1653 """ 1654 1655 __metaclass__ = abc.ABCMeta 1656 1657 @abc.abstractproperty 1658 def _variable_shape(self): 1659 """`TensorShape` of `_get_dense_tensor`, without batch dimension.""" 1660 pass 1661 1662 @abc.abstractmethod 1663 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 1664 """Returns a `Tensor`. 1665 1666 The output of this function will be used by model-builder-functions. For 1667 example the pseudo code of `input_layer` will be like: 1668 1669 ```python 1670 def input_layer(features, feature_columns, ...): 1671 outputs = [fc._get_dense_tensor(...) for fc in feature_columns] 1672 return tf.concat(outputs) 1673 ``` 1674 1675 Args: 1676 inputs: A `_LazyBuilder` object to access inputs. 1677 weight_collections: List of graph collections to which Variables (if any 1678 will be created) are added. 1679 trainable: If `True` also add variables to the graph collection 1680 `GraphKeys.TRAINABLE_VARIABLES` (see ${tf.Variable}). 1681 1682 Returns: 1683 `Tensor` of shape [batch_size] + `_variable_shape`. 1684 """ 1685 pass 1686 1687 1688def _create_weighted_sum( 1689 column, 1690 builder, 1691 units, 1692 sparse_combiner, 1693 weight_collections, 1694 trainable): 1695 """Creates a weighted sum for a dense or sparse column for linear_model.""" 1696 if isinstance(column, _CategoricalColumn): 1697 return _create_categorical_column_weighted_sum( 1698 column=column, 1699 builder=builder, 1700 units=units, 1701 sparse_combiner=sparse_combiner, 1702 weight_collections=weight_collections, 1703 trainable=trainable) 1704 else: 1705 return _create_dense_column_weighted_sum( 1706 column=column, 1707 builder=builder, 1708 units=units, 1709 weight_collections=weight_collections, 1710 trainable=trainable) 1711 1712 1713def _create_dense_column_weighted_sum( 1714 column, builder, units, weight_collections, trainable): 1715 """Create a weighted sum of a dense column for linear_model.""" 1716 tensor = column._get_dense_tensor( # pylint: disable=protected-access 1717 builder, 1718 weight_collections=weight_collections, 1719 trainable=trainable) 1720 num_elements = column._variable_shape.num_elements() # pylint: disable=protected-access 1721 batch_size = array_ops.shape(tensor)[0] 1722 tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements)) 1723 weight = variable_scope.get_variable( 1724 name='weights', 1725 shape=[num_elements, units], 1726 initializer=init_ops.zeros_initializer(), 1727 trainable=trainable, 1728 collections=weight_collections) 1729 return math_ops.matmul(tensor, weight, name='weighted_sum') 1730 1731 1732class _CategoricalColumn(_FeatureColumn): 1733 """Represents a categorical feature. 1734 1735 WARNING: Do not subclass this layer unless you know what you are doing: 1736 the API is subject to future changes. 1737 1738 A categorical feature typically handled with a ${tf.SparseTensor} of IDs. 1739 """ 1740 __metaclass__ = abc.ABCMeta 1741 1742 IdWeightPair = collections.namedtuple( # pylint: disable=invalid-name 1743 'IdWeightPair', ['id_tensor', 'weight_tensor']) 1744 1745 @abc.abstractproperty 1746 def _num_buckets(self): 1747 """Returns number of buckets in this sparse feature.""" 1748 pass 1749 1750 @abc.abstractmethod 1751 def _get_sparse_tensors(self, 1752 inputs, 1753 weight_collections=None, 1754 trainable=None): 1755 """Returns an IdWeightPair. 1756 1757 `IdWeightPair` is a pair of `SparseTensor`s which represents ids and 1758 weights. 1759 1760 `IdWeightPair.id_tensor` is typically a `batch_size` x `num_buckets` 1761 `SparseTensor` of `int64`. `IdWeightPair.weight_tensor` is either a 1762 `SparseTensor` of `float` or `None` to indicate all weights should be 1763 taken to be 1. If specified, `weight_tensor` must have exactly the same 1764 shape and indices as `sp_ids`. Expected `SparseTensor` is same as parsing 1765 output of a `VarLenFeature` which is a ragged matrix. 1766 1767 Args: 1768 inputs: A `LazyBuilder` as a cache to get input tensors required to 1769 create `IdWeightPair`. 1770 weight_collections: List of graph collections to which variables (if any 1771 will be created) are added. 1772 trainable: If `True` also add variables to the graph collection 1773 `GraphKeys.TRAINABLE_VARIABLES` (see ${tf.get_variable}). 1774 """ 1775 pass 1776 1777 1778def _create_categorical_column_weighted_sum( 1779 column, builder, units, sparse_combiner, weight_collections, trainable): 1780 """Create a weighted sum of a categorical column for linear_model.""" 1781 sparse_tensors = column._get_sparse_tensors( # pylint: disable=protected-access 1782 builder, 1783 weight_collections=weight_collections, 1784 trainable=trainable) 1785 id_tensor = sparse_ops.sparse_reshape(sparse_tensors.id_tensor, [ 1786 array_ops.shape(sparse_tensors.id_tensor)[0], -1 1787 ]) 1788 weight_tensor = sparse_tensors.weight_tensor 1789 if weight_tensor is not None: 1790 weight_tensor = sparse_ops.sparse_reshape( 1791 weight_tensor, [array_ops.shape(weight_tensor)[0], -1]) 1792 1793 weight = variable_scope.get_variable( 1794 name='weights', 1795 shape=(column._num_buckets, units), # pylint: disable=protected-access 1796 initializer=init_ops.zeros_initializer(), 1797 trainable=trainable, 1798 collections=weight_collections) 1799 return _safe_embedding_lookup_sparse( 1800 weight, 1801 id_tensor, 1802 sparse_weights=weight_tensor, 1803 combiner=sparse_combiner, 1804 name='weighted_sum') 1805 1806 1807class _LazyBuilder(object): 1808 """Handles caching of transformations while building the model. 1809 1810 `_FeatureColumn` specifies how to digest an input column to the network. Some 1811 feature columns require data transformations. This class caches those 1812 transformations. 1813 1814 Some features may be used in more than one place. For example, one can use a 1815 bucketized feature by itself and a cross with it. In that case we 1816 should create only one bucketization op instead of creating ops for each 1817 feature column separately. To handle re-use of transformed columns, 1818 `_LazyBuilder` caches all previously transformed columns. 1819 1820 Example: 1821 We're trying to use the following `_FeatureColumn`s: 1822 1823 ```python 1824 bucketized_age = fc.bucketized_column(fc.numeric_column("age"), ...) 1825 keywords = fc.categorical_column_with_hash_buckets("keywords", ...) 1826 age_X_keywords = fc.crossed_column([bucketized_age, "keywords"]) 1827 ... = linear_model(features, 1828 [bucketized_age, keywords, age_X_keywords] 1829 ``` 1830 1831 If we transform each column independently, then we'll get duplication of 1832 bucketization (one for cross, one for bucketization itself). 1833 The `_LazyBuilder` eliminates this duplication. 1834 """ 1835 1836 def __init__(self, features): 1837 """Creates a `_LazyBuilder`. 1838 1839 Args: 1840 features: A mapping from feature column to objects that are `Tensor` or 1841 `SparseTensor`, or can be converted to same via 1842 `sparse_tensor.convert_to_tensor_or_sparse_tensor`. A `string` key 1843 signifies a base feature (not-transformed). A `_FeatureColumn` key 1844 means that this `Tensor` is the output of an existing `_FeatureColumn` 1845 which can be reused. 1846 """ 1847 self._features = features.copy() 1848 self._feature_tensors = {} 1849 1850 def get(self, key): 1851 """Returns a `Tensor` for the given key. 1852 1853 A `str` key is used to access a base feature (not-transformed). When a 1854 `_FeatureColumn` is passed, the transformed feature is returned if it 1855 already exists, otherwise the given `_FeatureColumn` is asked to provide its 1856 transformed output, which is then cached. 1857 1858 Args: 1859 key: a `str` or a `_FeatureColumn`. 1860 1861 Returns: 1862 The transformed `Tensor` corresponding to the `key`. 1863 1864 Raises: 1865 ValueError: if key is not found or a transformed `Tensor` cannot be 1866 computed. 1867 """ 1868 if key in self._feature_tensors: 1869 # FeatureColumn is already transformed or converted. 1870 return self._feature_tensors[key] 1871 1872 if key in self._features: 1873 feature_tensor = self._get_raw_feature_as_tensor(key) 1874 self._feature_tensors[key] = feature_tensor 1875 return feature_tensor 1876 1877 if not isinstance(key, (str, _FeatureColumn)): 1878 raise TypeError('"key" must be either a "str" or "_FeatureColumn". ' 1879 'Provided: {}'.format(key)) 1880 1881 if not isinstance(key, _FeatureColumn): 1882 raise ValueError('Feature {} is not in features dictionary.'.format(key)) 1883 1884 column = key 1885 logging.debug('Transforming feature_column %s.', column) 1886 transformed = column._transform_feature(self) # pylint: disable=protected-access 1887 if transformed is None: 1888 raise ValueError('Column {} is not supported.'.format(column.name)) 1889 self._feature_tensors[column] = transformed 1890 return transformed 1891 1892 def _get_raw_feature_as_tensor(self, key): 1893 """Gets the raw_feature (keyed by `key`) as `tensor`. 1894 1895 The raw feature is converted to (sparse) tensor and maybe expand dim. 1896 1897 For both `Tensor` and `SparseTensor`, the rank will be expanded (to 2) if 1898 the rank is 1. This supports dynamic rank also. For rank 0 raw feature, will 1899 error out as it is not supported. 1900 1901 Args: 1902 key: A `str` key to access the raw feature. 1903 1904 Returns: 1905 A `Tensor` or `SparseTensor`. 1906 1907 Raises: 1908 ValueError: if the raw feature has rank 0. 1909 """ 1910 raw_feature = self._features[key] 1911 feature_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor( 1912 raw_feature) 1913 1914 def expand_dims(input_tensor): 1915 # Input_tensor must have rank 1. 1916 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor): 1917 return sparse_ops.sparse_reshape( 1918 input_tensor, [array_ops.shape(input_tensor)[0], -1]) 1919 else: 1920 return array_ops.expand_dims(input_tensor, -1) 1921 1922 rank = feature_tensor.get_shape().ndims 1923 if rank is not None: 1924 if rank == 0: 1925 raise ValueError( 1926 'Feature (key: {}) cannot have rank 0. Give: {}'.format( 1927 key, feature_tensor)) 1928 return feature_tensor if rank != 1 else expand_dims(feature_tensor) 1929 1930 # Handle dynamic rank. 1931 with ops.control_dependencies([ 1932 check_ops.assert_positive( 1933 array_ops.rank(feature_tensor), 1934 message='Feature (key: {}) cannot have rank 0. Given: {}'.format( 1935 key, feature_tensor))]): 1936 return control_flow_ops.cond( 1937 math_ops.equal(1, array_ops.rank(feature_tensor)), 1938 lambda: expand_dims(feature_tensor), 1939 lambda: feature_tensor) 1940 1941 1942# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py 1943def _shape_offsets(shape): 1944 """Returns moving offset for each dimension given shape.""" 1945 offsets = [] 1946 for dim in reversed(shape): 1947 if offsets: 1948 offsets.append(dim * offsets[-1]) 1949 else: 1950 offsets.append(dim) 1951 offsets.reverse() 1952 return offsets 1953 1954 1955# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py 1956def _to_sparse_input(input_tensor, ignore_value=None): 1957 """Converts a `Tensor` to a `SparseTensor`, dropping ignore_value cells. 1958 1959 If `input_tensor` is already a `SparseTensor`, just return it. 1960 1961 Args: 1962 input_tensor: A string or integer `Tensor`. 1963 ignore_value: Entries in `dense_tensor` equal to this value will be 1964 absent from the resulting `SparseTensor`. If `None`, default value of 1965 `dense_tensor`'s dtype will be used ('' for `str`, -1 for `int`). 1966 1967 Returns: 1968 A `SparseTensor` with the same shape as `input_tensor`. 1969 1970 Raises: 1971 ValueError: when `input_tensor`'s rank is `None`. 1972 """ 1973 input_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor( 1974 input_tensor) 1975 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor): 1976 return input_tensor 1977 with ops.name_scope(None, 'to_sparse_input', (input_tensor, ignore_value,)): 1978 if ignore_value is None: 1979 if input_tensor.dtype == dtypes.string: 1980 # Exception due to TF strings are converted to numpy objects by default. 1981 ignore_value = '' 1982 elif input_tensor.dtype.is_integer: 1983 ignore_value = -1 # -1 has a special meaning of missing feature 1984 else: 1985 # NOTE: `as_numpy_dtype` is a property, so with the parentheses this is 1986 # constructing a new numpy object of the given type, which yields the 1987 # default value for that type. 1988 ignore_value = input_tensor.dtype.as_numpy_dtype() 1989 ignore_value = math_ops.cast( 1990 ignore_value, input_tensor.dtype, name='ignore_value') 1991 indices = array_ops.where( 1992 math_ops.not_equal(input_tensor, ignore_value), name='indices') 1993 return sparse_tensor_lib.SparseTensor( 1994 indices=indices, 1995 values=array_ops.gather_nd(input_tensor, indices, name='values'), 1996 dense_shape=array_ops.shape( 1997 input_tensor, out_type=dtypes.int64, name='dense_shape')) 1998 1999 2000def _clean_feature_columns(feature_columns): 2001 """Verifies and normalizes `feature_columns` input.""" 2002 if isinstance(feature_columns, _FeatureColumn): 2003 feature_columns = [feature_columns] 2004 2005 if isinstance(feature_columns, collections.Iterator): 2006 feature_columns = list(feature_columns) 2007 2008 if isinstance(feature_columns, dict): 2009 raise ValueError('Expected feature_columns to be iterable, found dict.') 2010 2011 for column in feature_columns: 2012 if not isinstance(column, _FeatureColumn): 2013 raise ValueError('Items of feature_columns must be a _FeatureColumn. ' 2014 'Given (type {}): {}.'.format(type(column), column)) 2015 if not feature_columns: 2016 raise ValueError('feature_columns must not be empty.') 2017 name_to_column = dict() 2018 for column in feature_columns: 2019 if column.name in name_to_column: 2020 raise ValueError('Duplicate feature column name found for columns: {} ' 2021 'and {}. This usually means that these columns refer to ' 2022 'same base feature. Either one must be discarded or a ' 2023 'duplicated but renamed item must be inserted in ' 2024 'features dict.'.format(column, 2025 name_to_column[column.name])) 2026 name_to_column[column.name] = column 2027 2028 return feature_columns 2029 2030 2031class _NumericColumn(_DenseColumn, 2032 collections.namedtuple('_NumericColumn', [ 2033 'key', 'shape', 'default_value', 'dtype', 2034 'normalizer_fn' 2035 ])): 2036 """see `numeric_column`.""" 2037 2038 @property 2039 def name(self): 2040 return self.key 2041 2042 @property 2043 def _parse_example_spec(self): 2044 return { 2045 self.key: 2046 parsing_ops.FixedLenFeature(self.shape, self.dtype, 2047 self.default_value) 2048 } 2049 2050 def _transform_feature(self, inputs): 2051 input_tensor = inputs.get(self.key) 2052 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor): 2053 raise ValueError( 2054 'The corresponding Tensor of numerical column must be a Tensor. ' 2055 'SparseTensor is not supported. key: {}'.format(self.key)) 2056 if self.normalizer_fn is not None: 2057 input_tensor = self.normalizer_fn(input_tensor) 2058 return math_ops.to_float(input_tensor) 2059 2060 @property 2061 def _variable_shape(self): 2062 return tensor_shape.TensorShape(self.shape) 2063 2064 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 2065 """Returns dense `Tensor` representing numeric feature. 2066 2067 Args: 2068 inputs: A `_LazyBuilder` object to access inputs. 2069 weight_collections: Unused `weight_collections` since no variables are 2070 created in this function. 2071 trainable: Unused `trainable` bool since no variables are created in 2072 this function. 2073 2074 Returns: 2075 Dense `Tensor` created within `_transform_feature`. 2076 """ 2077 # Do nothing with weight_collections and trainable since no variables are 2078 # created in this function. 2079 del weight_collections 2080 del trainable 2081 # Feature has been already transformed. Return the intermediate 2082 # representation created by _transform_feature. 2083 return inputs.get(self) 2084 2085 2086class _BucketizedColumn(_DenseColumn, _CategoricalColumn, 2087 collections.namedtuple('_BucketizedColumn', [ 2088 'source_column', 'boundaries'])): 2089 """See `bucketized_column`.""" 2090 2091 @property 2092 def name(self): 2093 return '{}_bucketized'.format(self.source_column.name) 2094 2095 @property 2096 def _parse_example_spec(self): 2097 return self.source_column._parse_example_spec # pylint: disable=protected-access 2098 2099 def _transform_feature(self, inputs): 2100 source_tensor = inputs.get(self.source_column) 2101 return math_ops._bucketize( # pylint: disable=protected-access 2102 source_tensor, 2103 boundaries=self.boundaries) 2104 2105 @property 2106 def _variable_shape(self): 2107 return tensor_shape.TensorShape( 2108 tuple(self.source_column.shape) + (len(self.boundaries) + 1,)) 2109 2110 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 2111 del weight_collections 2112 del trainable 2113 input_tensor = inputs.get(self) 2114 return array_ops.one_hot( 2115 indices=math_ops.to_int64(input_tensor), 2116 depth=len(self.boundaries) + 1, 2117 on_value=1., 2118 off_value=0.) 2119 2120 @property 2121 def _num_buckets(self): 2122 # By construction, source_column is always one-dimensional. 2123 return (len(self.boundaries) + 1) * self.source_column.shape[0] 2124 2125 def _get_sparse_tensors(self, inputs, weight_collections=None, 2126 trainable=None): 2127 input_tensor = inputs.get(self) 2128 batch_size = array_ops.shape(input_tensor)[0] 2129 # By construction, source_column is always one-dimensional. 2130 source_dimension = self.source_column.shape[0] 2131 2132 i1 = array_ops.reshape( 2133 array_ops.tile( 2134 array_ops.expand_dims(math_ops.range(0, batch_size), 1), 2135 [1, source_dimension]), 2136 (-1,)) 2137 i2 = array_ops.tile(math_ops.range(0, source_dimension), [batch_size]) 2138 # Flatten the bucket indices and unique them across dimensions 2139 # E.g. 2nd dimension indices will range from k to 2*k-1 with k buckets 2140 bucket_indices = ( 2141 array_ops.reshape(input_tensor, (-1,)) + 2142 (len(self.boundaries) + 1) * i2) 2143 2144 indices = math_ops.to_int64(array_ops.transpose(array_ops.stack((i1, i2)))) 2145 dense_shape = math_ops.to_int64(array_ops.stack( 2146 [batch_size, source_dimension])) 2147 sparse_tensor = sparse_tensor_lib.SparseTensor( 2148 indices=indices, 2149 values=bucket_indices, 2150 dense_shape=dense_shape) 2151 return _CategoricalColumn.IdWeightPair(sparse_tensor, None) 2152 2153 2154class _EmbeddingColumn( 2155 _DenseColumn, 2156 collections.namedtuple('_EmbeddingColumn', ( 2157 'categorical_column', 'dimension', 'combiner', 'initializer', 2158 'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable' 2159 ))): 2160 """See `embedding_column`.""" 2161 2162 @property 2163 def name(self): 2164 if not hasattr(self, '_name'): 2165 self._name = '{}_embedding'.format(self.categorical_column.name) 2166 return self._name 2167 2168 @property 2169 def _parse_example_spec(self): 2170 return self.categorical_column._parse_example_spec # pylint: disable=protected-access 2171 2172 def _transform_feature(self, inputs): 2173 return inputs.get(self.categorical_column) 2174 2175 @property 2176 def _variable_shape(self): 2177 if not hasattr(self, '_shape'): 2178 self._shape = tensor_shape.vector(self.dimension) 2179 return self._shape 2180 2181 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 2182 # Get sparse IDs and weights. 2183 sparse_tensors = self.categorical_column._get_sparse_tensors( # pylint: disable=protected-access 2184 inputs, weight_collections=weight_collections, trainable=trainable) 2185 sparse_ids = sparse_tensors.id_tensor 2186 sparse_weights = sparse_tensors.weight_tensor 2187 2188 embedding_shape = (self.categorical_column._num_buckets, self.dimension) # pylint: disable=protected-access 2189 embedding_weights = variable_scope.get_variable( 2190 name='embedding_weights', 2191 shape=embedding_shape, 2192 dtype=dtypes.float32, 2193 initializer=self.initializer, 2194 trainable=self.trainable and trainable, 2195 collections=weight_collections) 2196 if self.ckpt_to_load_from is not None: 2197 to_restore = embedding_weights 2198 if isinstance(to_restore, variables.PartitionedVariable): 2199 to_restore = to_restore._get_variable_list() # pylint: disable=protected-access 2200 checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, { 2201 self.tensor_name_in_ckpt: to_restore 2202 }) 2203 2204 # Return embedding lookup result. 2205 return _safe_embedding_lookup_sparse( 2206 embedding_weights=embedding_weights, 2207 sparse_ids=sparse_ids, 2208 sparse_weights=sparse_weights, 2209 combiner=self.combiner, 2210 name='%s_weights' % self.name, 2211 max_norm=self.max_norm) 2212 2213 2214class _SharedEmbeddingColumn( 2215 _DenseColumn, 2216 collections.namedtuple('_SharedEmbeddingColumn', ( 2217 'categorical_column', 'dimension', 'combiner', 'initializer', 2218 'shared_embedding_collection_name', 'ckpt_to_load_from', 2219 'tensor_name_in_ckpt', 'max_norm', 'trainable' 2220 ))): 2221 """See `embedding_column`.""" 2222 2223 @property 2224 def name(self): 2225 if not hasattr(self, '_name'): 2226 self._name = '{}_shared_embedding'.format(self.categorical_column.name) 2227 return self._name 2228 2229 @property 2230 def _var_scope_name(self): 2231 return self.shared_embedding_collection_name 2232 2233 @property 2234 def _parse_example_spec(self): 2235 return self.categorical_column._parse_example_spec # pylint: disable=protected-access 2236 2237 def _transform_feature(self, inputs): 2238 return inputs.get(self.categorical_column) 2239 2240 @property 2241 def _variable_shape(self): 2242 if not hasattr(self, '_shape'): 2243 self._shape = tensor_shape.vector(self.dimension) 2244 return self._shape 2245 2246 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 2247 # This method is called from a variable_scope with name _var_scope_name, 2248 # which is shared among all shared embeddings. Open a name_scope here, so 2249 # that the ops for different columns have distinct names. 2250 with ops.name_scope(None, default_name=self.name): 2251 # Get sparse IDs and weights. 2252 sparse_tensors = self.categorical_column._get_sparse_tensors( # pylint: disable=protected-access 2253 inputs, weight_collections=weight_collections, trainable=trainable) 2254 sparse_ids = sparse_tensors.id_tensor 2255 sparse_weights = sparse_tensors.weight_tensor 2256 2257 embedding_shape = (self.categorical_column._num_buckets, self.dimension) # pylint: disable=protected-access 2258 shared_embedding_collection = ops.get_collection( 2259 self.shared_embedding_collection_name) 2260 if shared_embedding_collection: 2261 if len(shared_embedding_collection) > 1: 2262 raise ValueError( 2263 'Collection {} can only contain one variable. ' 2264 'Suggested fix A: Choose a unique name for this collection. ' 2265 'Suggested fix B: Do not add any variables to this collection. ' 2266 'The feature_column library already adds a variable under the ' 2267 'hood.'.format(shared_embedding_collection)) 2268 embedding_weights = shared_embedding_collection[0] 2269 if embedding_weights.get_shape() != embedding_shape: 2270 raise ValueError( 2271 'Shared embedding collection {} contains variable {} of ' 2272 'unexpected shape {}. Expected shape is {}. ' 2273 'Suggested fix A: Choose a unique name for this collection. ' 2274 'Suggested fix B: Do not add any variables to this collection. ' 2275 'The feature_column library already adds a variable under the ' 2276 'hood.'.format( 2277 self.shared_embedding_collection_name, embedding_weights.name, 2278 embedding_weights.get_shape(), embedding_shape)) 2279 else: 2280 embedding_weights = variable_scope.get_variable( 2281 name='embedding_weights', 2282 shape=embedding_shape, 2283 dtype=dtypes.float32, 2284 initializer=self.initializer, 2285 trainable=self.trainable and trainable, 2286 collections=weight_collections) 2287 ops.add_to_collection( 2288 self.shared_embedding_collection_name, embedding_weights) 2289 if self.ckpt_to_load_from is not None: 2290 to_restore = embedding_weights 2291 if isinstance(to_restore, variables.PartitionedVariable): 2292 to_restore = to_restore._get_variable_list() # pylint: disable=protected-access 2293 checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, { 2294 self.tensor_name_in_ckpt: to_restore 2295 }) 2296 2297 # Return embedding lookup result. 2298 return _safe_embedding_lookup_sparse( 2299 embedding_weights=embedding_weights, 2300 sparse_ids=sparse_ids, 2301 sparse_weights=sparse_weights, 2302 combiner=self.combiner, 2303 name='%s_weights' % self.name, 2304 max_norm=self.max_norm) 2305 2306 2307def _create_tuple(shape, value): 2308 """Returns a tuple with given shape and filled with value.""" 2309 if shape: 2310 return tuple([_create_tuple(shape[1:], value) for _ in range(shape[0])]) 2311 return value 2312 2313 2314def _as_tuple(value): 2315 if not nest.is_sequence(value): 2316 return value 2317 return tuple([_as_tuple(v) for v in value]) 2318 2319 2320def _check_shape(shape, key): 2321 """Returns shape if it's valid, raises error otherwise.""" 2322 assert shape is not None 2323 if not nest.is_sequence(shape): 2324 shape = [shape] 2325 shape = tuple(shape) 2326 for dimension in shape: 2327 if not isinstance(dimension, int): 2328 raise TypeError('shape dimensions must be integer. ' 2329 'shape: {}, key: {}'.format(shape, key)) 2330 if dimension < 1: 2331 raise ValueError('shape dimensions must be greater than 0. ' 2332 'shape: {}, key: {}'.format(shape, key)) 2333 return shape 2334 2335 2336def _is_shape_and_default_value_compatible(default_value, shape): 2337 """Verifies compatibility of shape and default_value.""" 2338 # Invalid condition: 2339 # * if default_value is not a scalar and shape is empty 2340 # * or if default_value is an iterable and shape is not empty 2341 if nest.is_sequence(default_value) != bool(shape): 2342 return False 2343 if not shape: 2344 return True 2345 if len(default_value) != shape[0]: 2346 return False 2347 for i in range(shape[0]): 2348 if not _is_shape_and_default_value_compatible(default_value[i], shape[1:]): 2349 return False 2350 return True 2351 2352 2353def _check_default_value(shape, default_value, dtype, key): 2354 """Returns default value as tuple if it's valid, otherwise raises errors. 2355 2356 This function verifies that `default_value` is compatible with both `shape` 2357 and `dtype`. If it is not compatible, it raises an error. If it is compatible, 2358 it casts default_value to a tuple and returns it. `key` is used only 2359 for error message. 2360 2361 Args: 2362 shape: An iterable of integers specifies the shape of the `Tensor`. 2363 default_value: If a single value is provided, the same value will be applied 2364 as the default value for every item. If an iterable of values is 2365 provided, the shape of the `default_value` should be equal to the given 2366 `shape`. 2367 dtype: defines the type of values. Default value is `tf.float32`. Must be a 2368 non-quantized, real integer or floating point type. 2369 key: Column name, used only for error messages. 2370 2371 Returns: 2372 A tuple which will be used as default value. 2373 2374 Raises: 2375 TypeError: if `default_value` is an iterable but not compatible with `shape` 2376 TypeError: if `default_value` is not compatible with `dtype`. 2377 ValueError: if `dtype` is not convertible to `tf.float32`. 2378 """ 2379 if default_value is None: 2380 return None 2381 2382 if isinstance(default_value, int): 2383 return _create_tuple(shape, default_value) 2384 2385 if isinstance(default_value, float) and dtype.is_floating: 2386 return _create_tuple(shape, default_value) 2387 2388 if callable(getattr(default_value, 'tolist', None)): # Handles numpy arrays 2389 default_value = default_value.tolist() 2390 2391 if nest.is_sequence(default_value): 2392 if not _is_shape_and_default_value_compatible(default_value, shape): 2393 raise ValueError( 2394 'The shape of default_value must be equal to given shape. ' 2395 'default_value: {}, shape: {}, key: {}'.format( 2396 default_value, shape, key)) 2397 # Check if the values in the list are all integers or are convertible to 2398 # floats. 2399 is_list_all_int = all( 2400 isinstance(v, int) for v in nest.flatten(default_value)) 2401 is_list_has_float = any( 2402 isinstance(v, float) for v in nest.flatten(default_value)) 2403 if is_list_all_int: 2404 return _as_tuple(default_value) 2405 if is_list_has_float and dtype.is_floating: 2406 return _as_tuple(default_value) 2407 raise TypeError('default_value must be compatible with dtype. ' 2408 'default_value: {}, dtype: {}, key: {}'.format( 2409 default_value, dtype, key)) 2410 2411 2412class _HashedCategoricalColumn( 2413 _CategoricalColumn, 2414 collections.namedtuple('_HashedCategoricalColumn', 2415 ['key', 'hash_bucket_size', 'dtype'])): 2416 """see `categorical_column_with_hash_bucket`.""" 2417 2418 @property 2419 def name(self): 2420 return self.key 2421 2422 @property 2423 def _parse_example_spec(self): 2424 return {self.key: parsing_ops.VarLenFeature(self.dtype)} 2425 2426 def _transform_feature(self, inputs): 2427 input_tensor = _to_sparse_input(inputs.get(self.key)) 2428 if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor): 2429 raise ValueError('SparseColumn input must be a SparseTensor.') 2430 2431 _assert_string_or_int( 2432 input_tensor.dtype, 2433 prefix='column_name: {} input_tensor'.format(self.key)) 2434 2435 if self.dtype.is_integer != input_tensor.dtype.is_integer: 2436 raise ValueError( 2437 'Column dtype and SparseTensors dtype must be compatible. ' 2438 'key: {}, column dtype: {}, tensor dtype: {}'.format( 2439 self.key, self.dtype, input_tensor.dtype)) 2440 2441 if self.dtype == dtypes.string: 2442 sparse_values = input_tensor.values 2443 else: 2444 sparse_values = string_ops.as_string(input_tensor.values) 2445 2446 sparse_id_values = string_ops.string_to_hash_bucket_fast( 2447 sparse_values, self.hash_bucket_size, name='lookup') 2448 return sparse_tensor_lib.SparseTensor( 2449 input_tensor.indices, sparse_id_values, input_tensor.dense_shape) 2450 2451 @property 2452 def _num_buckets(self): 2453 """Returns number of buckets in this sparse feature.""" 2454 return self.hash_bucket_size 2455 2456 def _get_sparse_tensors(self, inputs, weight_collections=None, 2457 trainable=None): 2458 return _CategoricalColumn.IdWeightPair(inputs.get(self), None) 2459 2460 2461class _VocabularyFileCategoricalColumn( 2462 _CategoricalColumn, 2463 collections.namedtuple('_VocabularyFileCategoricalColumn', ( 2464 'key', 'vocabulary_file', 'vocabulary_size', 'num_oov_buckets', 'dtype', 2465 'default_value' 2466 ))): 2467 """See `categorical_column_with_vocabulary_file`.""" 2468 2469 @property 2470 def name(self): 2471 return self.key 2472 2473 @property 2474 def _parse_example_spec(self): 2475 return {self.key: parsing_ops.VarLenFeature(self.dtype)} 2476 2477 def _transform_feature(self, inputs): 2478 input_tensor = _to_sparse_input(inputs.get(self.key)) 2479 2480 if self.dtype.is_integer != input_tensor.dtype.is_integer: 2481 raise ValueError( 2482 'Column dtype and SparseTensors dtype must be compatible. ' 2483 'key: {}, column dtype: {}, tensor dtype: {}'.format( 2484 self.key, self.dtype, input_tensor.dtype)) 2485 2486 _assert_string_or_int( 2487 input_tensor.dtype, 2488 prefix='column_name: {} input_tensor'.format(self.key)) 2489 2490 key_dtype = self.dtype 2491 if input_tensor.dtype.is_integer: 2492 # `index_table_from_file` requires 64-bit integer keys. 2493 key_dtype = dtypes.int64 2494 input_tensor = math_ops.to_int64(input_tensor) 2495 2496 return lookup_ops.index_table_from_file( 2497 vocabulary_file=self.vocabulary_file, 2498 num_oov_buckets=self.num_oov_buckets, 2499 vocab_size=self.vocabulary_size, 2500 default_value=self.default_value, 2501 key_dtype=key_dtype, 2502 name='{}_lookup'.format(self.key)).lookup(input_tensor) 2503 2504 @property 2505 def _num_buckets(self): 2506 """Returns number of buckets in this sparse feature.""" 2507 return self.vocabulary_size + self.num_oov_buckets 2508 2509 def _get_sparse_tensors( 2510 self, inputs, weight_collections=None, trainable=None): 2511 return _CategoricalColumn.IdWeightPair(inputs.get(self), None) 2512 2513 2514class _VocabularyListCategoricalColumn( 2515 _CategoricalColumn, 2516 collections.namedtuple('_VocabularyListCategoricalColumn', ( 2517 'key', 'vocabulary_list', 'dtype', 'default_value', 'num_oov_buckets' 2518 ))): 2519 """See `categorical_column_with_vocabulary_list`.""" 2520 2521 @property 2522 def name(self): 2523 return self.key 2524 2525 @property 2526 def _parse_example_spec(self): 2527 return {self.key: parsing_ops.VarLenFeature(self.dtype)} 2528 2529 def _transform_feature(self, inputs): 2530 input_tensor = _to_sparse_input(inputs.get(self.key)) 2531 2532 if self.dtype.is_integer != input_tensor.dtype.is_integer: 2533 raise ValueError( 2534 'Column dtype and SparseTensors dtype must be compatible. ' 2535 'key: {}, column dtype: {}, tensor dtype: {}'.format( 2536 self.key, self.dtype, input_tensor.dtype)) 2537 2538 _assert_string_or_int( 2539 input_tensor.dtype, 2540 prefix='column_name: {} input_tensor'.format(self.key)) 2541 2542 key_dtype = self.dtype 2543 if input_tensor.dtype.is_integer: 2544 # `index_table_from_tensor` requires 64-bit integer keys. 2545 key_dtype = dtypes.int64 2546 input_tensor = math_ops.to_int64(input_tensor) 2547 2548 return lookup_ops.index_table_from_tensor( 2549 vocabulary_list=tuple(self.vocabulary_list), 2550 default_value=self.default_value, 2551 num_oov_buckets=self.num_oov_buckets, 2552 dtype=key_dtype, 2553 name='{}_lookup'.format(self.key)).lookup(input_tensor) 2554 2555 @property 2556 def _num_buckets(self): 2557 """Returns number of buckets in this sparse feature.""" 2558 return len(self.vocabulary_list) + self.num_oov_buckets 2559 2560 def _get_sparse_tensors( 2561 self, inputs, weight_collections=None, trainable=None): 2562 return _CategoricalColumn.IdWeightPair(inputs.get(self), None) 2563 2564 2565class _IdentityCategoricalColumn( 2566 _CategoricalColumn, 2567 collections.namedtuple('_IdentityCategoricalColumn', ( 2568 'key', 'num_buckets', 'default_value' 2569 ))): 2570 2571 """See `categorical_column_with_identity`.""" 2572 2573 @property 2574 def name(self): 2575 return self.key 2576 2577 @property 2578 def _parse_example_spec(self): 2579 return {self.key: parsing_ops.VarLenFeature(dtypes.int64)} 2580 2581 def _transform_feature(self, inputs): 2582 input_tensor = _to_sparse_input(inputs.get(self.key)) 2583 2584 if not input_tensor.dtype.is_integer: 2585 raise ValueError( 2586 'Invalid input, not integer. key: {} dtype: {}'.format( 2587 self.key, input_tensor.dtype)) 2588 2589 values = math_ops.to_int64(input_tensor.values, name='values') 2590 num_buckets = math_ops.to_int64(self.num_buckets, name='num_buckets') 2591 zero = math_ops.to_int64(0, name='zero') 2592 if self.default_value is None: 2593 # Fail if values are out-of-range. 2594 assert_less = check_ops.assert_less( 2595 values, num_buckets, data=(values, num_buckets), 2596 name='assert_less_than_num_buckets') 2597 assert_greater = check_ops.assert_greater_equal( 2598 values, zero, data=(values,), 2599 name='assert_greater_or_equal_0') 2600 with ops.control_dependencies((assert_less, assert_greater)): 2601 values = array_ops.identity(values) 2602 else: 2603 # Assign default for out-of-range values. 2604 values = array_ops.where( 2605 math_ops.logical_or( 2606 values < zero, values >= num_buckets, name='out_of_range'), 2607 array_ops.fill( 2608 dims=array_ops.shape(values), 2609 value=math_ops.to_int64(self.default_value), 2610 name='default_values'), 2611 values) 2612 2613 return sparse_tensor_lib.SparseTensor( 2614 indices=input_tensor.indices, 2615 values=values, 2616 dense_shape=input_tensor.dense_shape) 2617 2618 @property 2619 def _num_buckets(self): 2620 """Returns number of buckets in this sparse feature.""" 2621 return self.num_buckets 2622 2623 def _get_sparse_tensors( 2624 self, inputs, weight_collections=None, trainable=None): 2625 return _CategoricalColumn.IdWeightPair(inputs.get(self), None) 2626 2627 2628class _WeightedCategoricalColumn( 2629 _CategoricalColumn, 2630 collections.namedtuple('_WeightedCategoricalColumn', ( 2631 'categorical_column', 'weight_feature_key', 'dtype' 2632 ))): 2633 """See `weighted_categorical_column`.""" 2634 2635 @property 2636 def name(self): 2637 return '{}_weighted_by_{}'.format( 2638 self.categorical_column.name, self.weight_feature_key) 2639 2640 @property 2641 def _parse_example_spec(self): 2642 config = self.categorical_column._parse_example_spec # pylint: disable=protected-access 2643 if self.weight_feature_key in config: 2644 raise ValueError('Parse config {} already exists for {}.'.format( 2645 config[self.weight_feature_key], self.weight_feature_key)) 2646 config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype) 2647 return config 2648 2649 @property 2650 def _num_buckets(self): 2651 return self.categorical_column._num_buckets # pylint: disable=protected-access 2652 2653 def _transform_feature(self, inputs): 2654 weight_tensor = inputs.get(self.weight_feature_key) 2655 if weight_tensor is None: 2656 raise ValueError('Missing weights {}.'.format(self.weight_feature_key)) 2657 weight_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor( 2658 weight_tensor) 2659 if self.dtype != weight_tensor.dtype.base_dtype: 2660 raise ValueError('Bad dtype, expected {}, but got {}.'.format( 2661 self.dtype, weight_tensor.dtype)) 2662 if not isinstance(weight_tensor, sparse_tensor_lib.SparseTensor): 2663 # The weight tensor can be a regular Tensor. In this case, sparsify it. 2664 weight_tensor = _to_sparse_input(weight_tensor, ignore_value=0.0) 2665 if not weight_tensor.dtype.is_floating: 2666 weight_tensor = math_ops.to_float(weight_tensor) 2667 return (inputs.get(self.categorical_column), weight_tensor) 2668 2669 def _get_sparse_tensors( 2670 self, inputs, weight_collections=None, trainable=None): 2671 del weight_collections 2672 del trainable 2673 tensors = inputs.get(self) 2674 return _CategoricalColumn.IdWeightPair(tensors[0], tensors[1]) 2675 2676 2677class _CrossedColumn( 2678 _CategoricalColumn, 2679 collections.namedtuple('_CrossedColumn', 2680 ['keys', 'hash_bucket_size', 'hash_key'])): 2681 """See `crossed_column`.""" 2682 2683 @property 2684 def name(self): 2685 feature_names = [] 2686 for key in _collect_leaf_level_keys(self): 2687 if isinstance(key, _FeatureColumn): 2688 feature_names.append(key.name) 2689 else: # key must be a string 2690 feature_names.append(key) 2691 return '_X_'.join(sorted(feature_names)) 2692 2693 @property 2694 def _parse_example_spec(self): 2695 config = {} 2696 for key in self.keys: 2697 if isinstance(key, _FeatureColumn): 2698 config.update(key._parse_example_spec) # pylint: disable=protected-access 2699 else: # key must be a string 2700 config.update({key: parsing_ops.VarLenFeature(dtypes.string)}) 2701 return config 2702 2703 def _transform_feature(self, inputs): 2704 feature_tensors = [] 2705 for key in _collect_leaf_level_keys(self): 2706 if isinstance(key, six.string_types): 2707 feature_tensors.append(inputs.get(key)) 2708 elif isinstance(key, _CategoricalColumn): 2709 ids_and_weights = key._get_sparse_tensors(inputs) # pylint: disable=protected-access 2710 if ids_and_weights.weight_tensor is not None: 2711 raise ValueError( 2712 'crossed_column does not support weight_tensor, but the given ' 2713 'column populates weight_tensor. ' 2714 'Given column: {}'.format(key.name)) 2715 feature_tensors.append(ids_and_weights.id_tensor) 2716 else: 2717 raise ValueError('Unsupported column type. Given: {}'.format(key)) 2718 return sparse_ops._sparse_cross_hashed( # pylint: disable=protected-access 2719 inputs=feature_tensors, 2720 num_buckets=self.hash_bucket_size, 2721 hash_key=self.hash_key) 2722 2723 @property 2724 def _num_buckets(self): 2725 """Returns number of buckets in this sparse feature.""" 2726 return self.hash_bucket_size 2727 2728 def _get_sparse_tensors(self, inputs, weight_collections=None, 2729 trainable=None): 2730 return _CategoricalColumn.IdWeightPair(inputs.get(self), None) 2731 2732 2733def _collect_leaf_level_keys(cross): 2734 """Collects base keys by expanding all nested crosses. 2735 2736 Args: 2737 cross: A `_CrossedColumn`. 2738 2739 Returns: 2740 A list of strings or `_CategoricalColumn` instances. 2741 """ 2742 leaf_level_keys = [] 2743 for k in cross.keys: 2744 if isinstance(k, _CrossedColumn): 2745 leaf_level_keys.extend(_collect_leaf_level_keys(k)) 2746 else: 2747 leaf_level_keys.append(k) 2748 return leaf_level_keys 2749 2750 2751# TODO(zakaria): Move this to embedding_ops and make it public. 2752def _safe_embedding_lookup_sparse(embedding_weights, 2753 sparse_ids, 2754 sparse_weights=None, 2755 combiner='mean', 2756 default_id=None, 2757 name=None, 2758 partition_strategy='div', 2759 max_norm=None): 2760 """Lookup embedding results, accounting for invalid IDs and empty features. 2761 2762 The partitioned embedding in `embedding_weights` must all be the same shape 2763 except for the first dimension. The first dimension is allowed to vary as the 2764 vocabulary size is not necessarily a multiple of `P`. `embedding_weights` 2765 may be a `PartitionedVariable` as returned by using `tf.get_variable()` with a 2766 partitioner. 2767 2768 Invalid IDs (< 0) are pruned from input IDs and weights, as well as any IDs 2769 with non-positive weight. For an entry with no features, the embedding vector 2770 for `default_id` is returned, or the 0-vector if `default_id` is not supplied. 2771 2772 The ids and weights may be multi-dimensional. Embeddings are always aggregated 2773 along the last dimension. 2774 2775 Args: 2776 embedding_weights: A list of `P` float `Tensor`s or values representing 2777 partitioned embedding `Tensor`s. Alternatively, a `PartitionedVariable` 2778 created by partitioning along dimension 0. The total unpartitioned 2779 shape should be `[e_0, e_1, ..., e_m]`, where `e_0` represents the 2780 vocab size and `e_1, ..., e_m` are the embedding dimensions. 2781 sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the 2782 ids. `d_0` is typically batch size. 2783 sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing 2784 float weights corresponding to `sparse_ids`, or `None` if all weights 2785 are be assumed to be 1.0. 2786 combiner: A string specifying how to combine embedding results for each 2787 entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean" 2788 the default. 2789 default_id: The id to use for an entry with no features. 2790 name: A name for this operation (optional). 2791 partition_strategy: A string specifying the partitioning strategy. 2792 Currently `"div"` and `"mod"` are supported. Default is `"div"`. 2793 max_norm: If not `None`, all embeddings are l2-normalized to max_norm before 2794 combining. 2795 2796 2797 Returns: 2798 Dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`. 2799 2800 Raises: 2801 ValueError: if `embedding_weights` is empty. 2802 """ 2803 if embedding_weights is None: 2804 raise ValueError('Missing embedding_weights %s.' % embedding_weights) 2805 if isinstance(embedding_weights, variables.PartitionedVariable): 2806 embedding_weights = list(embedding_weights) # get underlying Variables. 2807 if not isinstance(embedding_weights, list): 2808 embedding_weights = [embedding_weights] 2809 if len(embedding_weights) < 1: 2810 raise ValueError('Missing embedding_weights %s.' % embedding_weights) 2811 2812 dtype = sparse_weights.dtype if sparse_weights is not None else None 2813 embedding_weights = [ 2814 ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights 2815 ] 2816 2817 with ops.name_scope(name, 'embedding_lookup', 2818 embedding_weights + [sparse_ids, 2819 sparse_weights]) as scope: 2820 # Reshape higher-rank sparse ids and weights to linear segment ids. 2821 original_shape = sparse_ids.dense_shape 2822 original_rank_dim = sparse_ids.dense_shape.get_shape()[0] 2823 original_rank = ( 2824 array_ops.size(original_shape) 2825 if original_rank_dim.value is None 2826 else original_rank_dim.value) 2827 sparse_ids = sparse_ops.sparse_reshape(sparse_ids, [ 2828 math_ops.reduce_prod( 2829 array_ops.slice(original_shape, [0], [original_rank - 1])), 2830 array_ops.gather(original_shape, original_rank - 1)]) 2831 if sparse_weights is not None: 2832 sparse_weights = sparse_tensor_lib.SparseTensor( 2833 sparse_ids.indices, 2834 sparse_weights.values, sparse_ids.dense_shape) 2835 2836 # Prune invalid ids and weights. 2837 sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights) 2838 2839 # Fill in dummy values for empty features, if necessary. 2840 sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(sparse_ids, 2841 default_id or 2842 0) 2843 if sparse_weights is not None: 2844 sparse_weights, _ = sparse_ops.sparse_fill_empty_rows(sparse_weights, 1.0) 2845 2846 result = embedding_ops.embedding_lookup_sparse( 2847 embedding_weights, 2848 sparse_ids, 2849 sparse_weights, 2850 combiner=combiner, 2851 partition_strategy=partition_strategy, 2852 name=None if default_id is None else scope, 2853 max_norm=max_norm) 2854 2855 if default_id is None: 2856 # Broadcast is_row_empty to the same shape as embedding_lookup_result, 2857 # for use in Select. 2858 is_row_empty = array_ops.tile( 2859 array_ops.reshape(is_row_empty, [-1, 1]), 2860 array_ops.stack([1, array_ops.shape(result)[1]])) 2861 2862 result = array_ops.where(is_row_empty, 2863 array_ops.zeros_like(result), 2864 result, 2865 name=scope) 2866 2867 # Reshape back from linear ids back into higher-dimensional dense result. 2868 final_result = array_ops.reshape( 2869 result, 2870 array_ops.concat([ 2871 array_ops.slice( 2872 math_ops.cast(original_shape, dtypes.int32), [0], 2873 [original_rank - 1]), 2874 array_ops.slice(array_ops.shape(result), [1], [-1]) 2875 ], 0)) 2876 final_result.set_shape(tensor_shape.unknown_shape( 2877 (original_rank_dim - 1).value).concatenate(result.get_shape()[1:])) 2878 return final_result 2879 2880 2881def _prune_invalid_ids(sparse_ids, sparse_weights): 2882 """Prune invalid IDs (< 0) from the input ids and weights.""" 2883 is_id_valid = math_ops.greater_equal(sparse_ids.values, 0) 2884 if sparse_weights is not None: 2885 is_id_valid = math_ops.logical_and( 2886 is_id_valid, math_ops.greater(sparse_weights.values, 0)) 2887 sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_id_valid) 2888 if sparse_weights is not None: 2889 sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_id_valid) 2890 return sparse_ids, sparse_weights 2891 2892 2893class _IndicatorColumn(_DenseColumn, 2894 collections.namedtuple('_IndicatorColumn', 2895 ['categorical_column'])): 2896 """Represents a one-hot column for use in deep networks. 2897 2898 Args: 2899 categorical_column: A `_CategoricalColumn` which is created by 2900 `categorical_column_with_*` function. 2901 """ 2902 2903 @property 2904 def name(self): 2905 return '{}_indicator'.format(self.categorical_column.name) 2906 2907 def _transform_feature(self, inputs): 2908 """Returns dense `Tensor` representing feature. 2909 2910 Args: 2911 inputs: A `_LazyBuilder` object to access inputs. 2912 2913 Returns: 2914 Transformed feature `Tensor`. 2915 2916 Raises: 2917 ValueError: if input rank is not known at graph building time. 2918 """ 2919 id_weight_pair = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access 2920 id_tensor = id_weight_pair.id_tensor 2921 weight_tensor = id_weight_pair.weight_tensor 2922 2923 # If the underlying column is weighted, return the input as a dense tensor. 2924 if weight_tensor is not None: 2925 weighted_column = sparse_ops.sparse_merge( 2926 sp_ids=id_tensor, 2927 sp_values=weight_tensor, 2928 vocab_size=int(self._variable_shape[-1])) 2929 # Remove (?, -1) index 2930 weighted_column = sparse_ops.sparse_slice(weighted_column, [0, 0], 2931 weighted_column.dense_shape) 2932 return sparse_ops.sparse_tensor_to_dense(weighted_column) 2933 2934 dense_id_tensor = sparse_ops.sparse_tensor_to_dense( 2935 id_tensor, default_value=-1) 2936 2937 # One hot must be float for tf.concat reasons since all other inputs to 2938 # input_layer are float32. 2939 one_hot_id_tensor = array_ops.one_hot( 2940 dense_id_tensor, 2941 depth=self._variable_shape[-1], 2942 on_value=1.0, 2943 off_value=0.0) 2944 2945 # Reduce to get a multi-hot per example. 2946 return math_ops.reduce_sum(one_hot_id_tensor, axis=[-2]) 2947 2948 @property 2949 def _parse_example_spec(self): 2950 return self.categorical_column._parse_example_spec # pylint: disable=protected-access 2951 2952 @property 2953 def _variable_shape(self): 2954 """Returns a `TensorShape` representing the shape of the dense `Tensor`.""" 2955 return tensor_shape.TensorShape([1, self.categorical_column._num_buckets]) # pylint: disable=protected-access 2956 2957 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 2958 """Returns dense `Tensor` representing feature. 2959 2960 Args: 2961 inputs: A `_LazyBuilder` object to access inputs. 2962 weight_collections: Unused `weight_collections` since no variables are 2963 created in this function. 2964 trainable: Unused `trainable` bool since no variables are created in 2965 this function. 2966 2967 Returns: 2968 Dense `Tensor` created within `_transform_feature`. 2969 """ 2970 # Do nothing with weight_collections and trainable since no variables are 2971 # created in this function. 2972 del weight_collections 2973 del trainable 2974 # Feature has been already transformed. Return the intermediate 2975 # representation created by _transform_feature. 2976 return inputs.get(self) 2977 2978 2979def _verify_static_batch_size_equality(tensors, columns): 2980 # bath_size is a tf.Dimension object. 2981 expected_batch_size = None 2982 for i in range(0, len(tensors)): 2983 if tensors[i].shape[0].value is not None: 2984 if expected_batch_size is None: 2985 bath_size_column_index = i 2986 expected_batch_size = tensors[i].shape[0] 2987 elif not expected_batch_size.is_compatible_with(tensors[i].shape[0]): 2988 raise ValueError( 2989 'Batch size (first dimension) of each feature must be same. ' 2990 'Batch size of columns ({}, {}): ({}, {})'.format( 2991 columns[bath_size_column_index].name, columns[i].name, 2992 expected_batch_size, tensors[i].shape[0])) 2993