1# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""This API defines FeatureColumn abstraction.
16
FeatureColumns provide a high-level abstraction for ingesting and representing
features. FeatureColumns are also the primary way of encoding features for
canned `tf.estimator.Estimator`s.
20
21When using FeatureColumns with `Estimators`, the type of feature column you
22should choose depends on (1) the feature type and (2) the model type.
23
241. Feature type:
25
26  * Continuous features can be represented by `numeric_column`.
27  * Categorical features can be represented by any `categorical_column_with_*`
28  column:
29    - `categorical_column_with_vocabulary_list`
30    - `categorical_column_with_vocabulary_file`
31    - `categorical_column_with_hash_bucket`
32    - `categorical_column_with_identity`
33    - `weighted_categorical_column`
34
352. Model type:
36
37  * Deep neural network models (`DNNClassifier`, `DNNRegressor`).
38
39    Continuous features can be directly fed into deep neural network models.
40
41      age_column = numeric_column("age")
42
43    To feed sparse features into DNN models, wrap the column with
    `embedding_column` or `indicator_column`. `indicator_column` is recommended
    for features with only a few possible values. For features with many
    possible values, `embedding_column` is recommended to keep the model size
    small.
48
49      embedded_dept_column = embedding_column(
50          categorical_column_with_vocabulary_list(
51              "department", ["math", "philosphy", ...]), dimension=10)
52
53  * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).
54
55    Sparse features can be fed directly into linear models. They behave like an
56    indicator column but with an efficient implementation.
57
58      dept_column = categorical_column_with_vocabulary_list("department",
59          ["math", "philosophy", "english"])
60
61    It is recommended that continuous features be bucketized before being
62    fed into linear models.
63
64      bucketized_age_column = bucketized_column(
65          source_column=age_column,
66          boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
67
    Sparse features can be crossed (also known as conjoined or combined) in
    order to form non-linearities, and then fed into linear models.
70
71      cross_dept_age_column = crossed_column(
72          columns=["department", bucketized_age_column],
73          hash_bucket_size=1000)
74
75Example of building canned `Estimator`s using FeatureColumns:
76
77  ```python
78  # Define features and transformations
79  deep_feature_columns = [age_column, embedded_dept_column]
80  wide_feature_columns = [dept_column, bucketized_age_column,
81      cross_dept_age_column]
82
83  # Build deep model
84  estimator = DNNClassifier(
85      feature_columns=deep_feature_columns,
86      hidden_units=[500, 250, 50])
87  estimator.train(...)
88
89  # Or build a wide model
90  estimator = LinearClassifier(
91      feature_columns=wide_feature_columns)
92  estimator.train(...)
93
94  # Or build a wide and deep model!
95  estimator = DNNLinearCombinedClassifier(
96      linear_feature_columns=wide_feature_columns,
97      dnn_feature_columns=deep_feature_columns,
98      dnn_hidden_units=[500, 250, 50])
99  estimator.train(...)
100  ```
101
102
103FeatureColumns can also be transformed into a generic input layer for
104custom models using `input_layer`.
105
Example of building a model using FeatureColumns; this can be used in a
`model_fn`, which is given to the `tf.estimator.Estimator`:
108
109  ```python
110  # Building model via layers
111
112  deep_feature_columns = [age_column, embedded_dept_column]
  columns_to_tensor = tf.parse_example(
      serialized=my_data,
      features=make_parse_example_spec(deep_feature_columns))
116  first_layer = input_layer(
117      features=columns_to_tensor,
118      feature_columns=deep_feature_columns)
  second_layer = tf.layers.dense(first_layer, ...)
120  ```
121
122NOTE: Functions prefixed with "_" indicate experimental or private parts of
123the API subject to change, and should not be relied upon!
124"""
125
126from __future__ import absolute_import
127from __future__ import division
128from __future__ import print_function
129
130import abc
131import collections
132import math
133
134import numpy as np
135import six
136
137
138from tensorflow.python.framework import dtypes
139from tensorflow.python.framework import ops
140from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
141from tensorflow.python.framework import tensor_shape
142from tensorflow.python.ops import array_ops
143from tensorflow.python.ops import check_ops
144from tensorflow.python.ops import control_flow_ops
145from tensorflow.python.ops import embedding_ops
146from tensorflow.python.ops import init_ops
147from tensorflow.python.ops import lookup_ops
148from tensorflow.python.ops import math_ops
149from tensorflow.python.ops import nn_ops
150from tensorflow.python.ops import parsing_ops
151from tensorflow.python.ops import sparse_ops
152from tensorflow.python.ops import string_ops
153from tensorflow.python.ops import template
154from tensorflow.python.ops import variable_scope
155from tensorflow.python.ops import variables
156from tensorflow.python.platform import gfile
157from tensorflow.python.platform import tf_logging as logging
158from tensorflow.python.training import checkpoint_utils
159from tensorflow.python.util import nest
160from tensorflow.python.util.tf_export import tf_export
162
163
164def _internal_input_layer(features,
165                          feature_columns,
166                          weight_collections=None,
167                          trainable=True,
168                          cols_to_vars=None,
169                          scope=None):
170  """See input_layer. `scope` is a name or variable scope to use."""
171
172  feature_columns = _clean_feature_columns(feature_columns)
173  for column in feature_columns:
174    if not isinstance(column, _DenseColumn):
175      raise ValueError(
176          'Items of feature_columns must be a _DenseColumn. '
177          'You can wrap a categorical column with an '
178          'embedding_column or indicator_column. Given: {}'.format(column))
179  weight_collections = list(weight_collections or [])
180  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
181    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
182  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
183    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
184
  # A non-None `scope` allows variable reuse when, e.g., this function is
  # wrapped by a `make_template`.
187  with variable_scope.variable_scope(
188      scope, default_name='input_layer', values=features.values()):
189    builder = _LazyBuilder(features)
190    output_tensors = []
191    ordered_columns = []
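    # Process columns in name order so that variable creation (and hence
    # checkpoint layout) is deterministic regardless of the order of the
    # `feature_columns` iterable.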
192    for column in sorted(feature_columns, key=lambda x: x.name):
193      ordered_columns.append(column)
194      with variable_scope.variable_scope(
195          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
196        tensor = column._get_dense_tensor(  # pylint: disable=protected-access
197            builder,
198            weight_collections=weight_collections,
199            trainable=trainable)
200        num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
201        batch_size = array_ops.shape(tensor)[0]
202        output_tensors.append(
203            array_ops.reshape(tensor, shape=(batch_size, num_elements)))
204        if cols_to_vars is not None:
          # Retrieve any variables created (some _DenseColumns don't create
          # variables, in which case an empty list is returned).
207          cols_to_vars[column] = ops.get_collection(
208              ops.GraphKeys.GLOBAL_VARIABLES,
209              scope=variable_scope.get_variable_scope().name)
210    _verify_static_batch_size_equality(output_tensors, ordered_columns)
211    return array_ops.concat(output_tensors, 1)
212
213
214@tf_export('feature_column.input_layer')
215def input_layer(features,
216                feature_columns,
217                weight_collections=None,
218                trainable=True,
219                cols_to_vars=None):
220  """Returns a dense `Tensor` as input layer based on given `feature_columns`.
221
  Generally a single example in training data is described with FeatureColumns.
  At the first layer of the model, this column-oriented data should be
  converted to a single `Tensor`.
225
226  Example:
227
228  ```python
229  price = numeric_column('price')
  keywords_embedded = embedding_column(
      categorical_column_with_hash_bucket("keywords", 10000), dimension=16)
232  columns = [price, keywords_embedded, ...]
233  features = tf.parse_example(..., features=make_parse_example_spec(columns))
234  dense_tensor = input_layer(features, columns)
235  for units in [128, 64, 32]:
236    dense_tensor = tf.layers.dense(dense_tensor, units, tf.nn.relu)
237  prediction = tf.layers.dense(dense_tensor, 1)
238  ```
239
240  Args:
241    features: A mapping from key to tensors. `_FeatureColumn`s look up via these
242      keys. For example `numeric_column('price')` will look at 'price' key in
      this dict. Values can be a `SparseTensor` or a `Tensor`, depending on the
      corresponding `_FeatureColumn`.
245    feature_columns: An iterable containing the FeatureColumns to use as inputs
246      to your model. All items should be instances of classes derived from
247      `_DenseColumn` such as `numeric_column`, `embedding_column`,
248      `bucketized_column`, `indicator_column`. If you have categorical features,
249      you can wrap them with an `embedding_column` or `indicator_column`.
    weight_collections: A list of collection names to which the Variable will
      be added. Note that variables will also be added to collections
      `tf.GraphKeys.GLOBAL_VARIABLES` and `tf.GraphKeys.MODEL_VARIABLES`.
253    trainable: If `True` also add the variable to the graph collection
254      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
255    cols_to_vars: If not `None`, must be a dictionary that will be filled with a
256      mapping from `_FeatureColumn` to list of `Variable`s.  For example, after
257      the call, we might have cols_to_vars =
258      {_EmbeddingColumn(
259        categorical_column=_HashedCategoricalColumn(
260          key='sparse_feature', hash_bucket_size=5, dtype=tf.string),
        dimension=10): [<tf.Variable 'some_variable:0' shape=(5, 10)>,
                        <tf.Variable 'some_variable:1' shape=(5, 10)>]}
263      If a column creates no variables, its value will be an empty list.
264
265  Returns:
266    A `Tensor` which represents input layer of a model. Its shape
267    is (batch_size, first_layer_dimension) and its dtype is `float32`.
268    first_layer_dimension is determined based on given `feature_columns`.
269
270  Raises:
271    ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
272  """
273  return _internal_input_layer(features, feature_columns, weight_collections,
274                               trainable, cols_to_vars)
275
276
277# TODO(akshayka): InputLayer should be a subclass of Layer, and it
278# should implement the logic in input_layer using Layer's build-and-call
279# paradigm; input_layer should create an instance of InputLayer and
# return the result of invoking its apply method, just as functional layers do.
281class InputLayer(object):
282  """An object-oriented version of `input_layer` that reuses variables."""
283
284  def __init__(self,
285               feature_columns,
286               weight_collections=None,
287               trainable=True,
288               cols_to_vars=None):
289    """See `input_layer`."""
290
291    self._feature_columns = feature_columns
292    self._weight_collections = weight_collections
293    self._trainable = trainable
294    self._cols_to_vars = cols_to_vars
295    self._input_layer_template = template.make_template(
296        'feature_column_input_layer',
297        _internal_input_layer,
298        create_scope_now_=True)
299    self._scope = self._input_layer_template.variable_scope
300
301  def __call__(self, features):
302    return self._input_layer_template(
303        features=features,
304        feature_columns=self._feature_columns,
305        weight_collections=self._weight_collections,
306        trainable=self._trainable,
307        cols_to_vars=None,
308        scope=self._scope)
309
310  @property
311  def non_trainable_variables(self):
312    return self._input_layer_template.non_trainable_variables
313
314  @property
315  def non_trainable_weights(self):
316    return self._input_layer_template.non_trainable_weights
317
318  @property
319  def trainable_variables(self):
320    return self._input_layer_template.trainable_variables
321
322  @property
323  def trainable_weights(self):
324    return self._input_layer_template.trainable_weights
325
326  @property
327  def variables(self):
328    return self._input_layer_template.variables
329
330  @property
331  def weights(self):
332    return self._input_layer_template.weights
333
334
335@tf_export('feature_column.linear_model')
336def linear_model(features,
337                 feature_columns,
338                 units=1,
339                 sparse_combiner='sum',
340                 weight_collections=None,
341                 trainable=True,
342                 cols_to_vars=None):
343  """Returns a linear prediction `Tensor` based on given `feature_columns`.
344
345  This function generates a weighted sum based on output dimension `units`.
346  Weighted sum refers to logits in classification problems. It refers to the
347  prediction itself for linear regression problems.
348
349  Note on supported columns: `linear_model` treats categorical columns as
350  `indicator_column`s while `input_layer` explicitly requires wrapping each
351  of them with an `embedding_column` or an `indicator_column`.
352
353  Example:
354
355  ```python
356  price = numeric_column('price')
357  price_buckets = bucketized_column(price, boundaries=[0., 10., 100., 1000.])
  keywords = categorical_column_with_hash_bucket("keywords", 10000)
  keywords_price = crossed_column([price_buckets, 'keywords'], 50000)
  columns = [price_buckets, keywords, keywords_price, ...]
361  features = tf.parse_example(..., features=make_parse_example_spec(columns))
362  prediction = linear_model(features, columns)
363  ```
364
365  Args:
366    features: A mapping from key to tensors. `_FeatureColumn`s look up via these
367      keys. For example `numeric_column('price')` will look at 'price' key in
      this dict. Values are `Tensor` or `SparseTensor` depending on the
      corresponding `_FeatureColumn`.
370    feature_columns: An iterable containing the FeatureColumns to use as inputs
371      to your model. All items should be instances of classes derived from
      `_FeatureColumn`.
373    units: An integer, dimensionality of the output space. Default value is 1.
    sparse_combiner: A string specifying how to reduce if a sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with
      "sum" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns. Each sparse column is combined independently:
378        * "sum": do not normalize features in the column
379        * "mean": do l1 normalization on features in the column
380        * "sqrtn": do l2 normalization on features in the column
    weight_collections: A list of collection names to which the Variable will
      be added. Note that variables will also be added to collections
      `tf.GraphKeys.GLOBAL_VARIABLES` and `tf.GraphKeys.MODEL_VARIABLES`.
384    trainable: If `True` also add the variable to the graph collection
385      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
386    cols_to_vars: If not `None`, must be a dictionary that will be filled with a
387      mapping from `_FeatureColumn` to associated list of `Variable`s.  For
388      example, after the call, we might have cols_to_vars = {
        _NumericColumn(
          key='numeric_feature1', shape=(1,)):
391        [<tf.Variable 'linear_model/price2/weights:0' shape=(1, 1)>],
392        'bias': [<tf.Variable 'linear_model/bias_weights:0' shape=(1,)>],
393        _NumericColumn(
394          key='numeric_feature2', shape=(2,)):
395        [<tf.Variable 'linear_model/price1/weights:0' shape=(2, 1)>]}
396      If a column creates no variables, its value will be an empty list. Note
397      that cols_to_vars will also contain a string key 'bias' that maps to a
398      list of Variables.
399
400  Returns:
401    A `Tensor` which represents predictions/logits of a linear model. Its shape
402    is (batch_size, units) and its dtype is `float32`.
403
404  Raises:
405    ValueError: if an item in `feature_columns` is neither a `_DenseColumn`
406      nor `_CategoricalColumn`.
407  """
408  feature_columns = _clean_feature_columns(feature_columns)
409  for column in feature_columns:
410    if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
411      raise ValueError('Items of feature_columns must be either a _DenseColumn '
412                       'or _CategoricalColumn. Given: {}'.format(column))
413  weight_collections = list(weight_collections or [])
414  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
415    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
416  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
417    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
418  with variable_scope.variable_scope(
419      None, default_name='linear_model', values=features.values()):
420    weighted_sums = []
421    ordered_columns = []
422    builder = _LazyBuilder(features)
423    for column in sorted(feature_columns, key=lambda x: x.name):
424      with variable_scope.variable_scope(
425          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
426        ordered_columns.append(column)
427        weighted_sum = _create_weighted_sum(
428            column=column,
429            builder=builder,
430            units=units,
431            sparse_combiner=sparse_combiner,
432            weight_collections=weight_collections,
433            trainable=trainable)
434        weighted_sums.append(weighted_sum)
435        if cols_to_vars is not None:
436          # Retrieve the variables created.
437          cols_to_vars[column] = ops.get_collection(
438              ops.GraphKeys.GLOBAL_VARIABLES,
439              scope=variable_scope.get_variable_scope().name)
440    _verify_static_batch_size_equality(weighted_sums, ordered_columns)
441    predictions_no_bias = math_ops.add_n(
442        weighted_sums, name='weighted_sum_no_bias')
443    bias = variable_scope.get_variable(
444        'bias_weights',
445        shape=[units],
446        initializer=init_ops.zeros_initializer(),
447        trainable=trainable,
448        collections=weight_collections)
449    predictions = nn_ops.bias_add(
450        predictions_no_bias, bias, name='weighted_sum')
451    if cols_to_vars is not None:
      # Add the bias to cols_to_vars as well, converting the Variable or
      # PartitionedVariable to a list of Variables.
454      if isinstance(bias, variables.Variable):
455        cols_to_vars['bias'] = [bias]
456      else:  # Must be a PartitionedVariable.
457        cols_to_vars['bias'] = list(bias)
458    return predictions
459
460
461def _transform_features(features, feature_columns):
462  """Returns transformed features based on features columns passed in.
463
464  Please note that most probably you would not need to use this function. Please
465  check `input_layer` and `linear_model` to see whether they will
466  satisfy your use case or not.
467
468  Example:
469
470  ```python
471  # Define features and transformations
472  crosses_a_x_b = crossed_column(
473      columns=["sparse_feature_a", "sparse_feature_b"], hash_bucket_size=10000)
474  price_buckets = bucketized_column(
475      source_column=numeric_column("price"), boundaries=[...])
476
477  columns = [crosses_a_x_b, price_buckets]
478  features = tf.parse_example(..., features=make_parse_example_spec(columns))
479  transformed = transform_features(features=features, feature_columns=columns)
480
481  assertCountEqual(columns, transformed.keys())
482  ```
483
484  Args:
485    features: A mapping from key to tensors. `_FeatureColumn`s look up via these
486      keys. For example `numeric_column('price')` will look at 'price' key in
      this dict. Values can be a `SparseTensor` or a `Tensor`, depending on the
      corresponding `_FeatureColumn`.
489    feature_columns: An iterable containing all the `_FeatureColumn`s.
490
491  Returns:
492    A `dict` mapping `_FeatureColumn` to `Tensor` and `SparseTensor` values.
493  """
494  feature_columns = _clean_feature_columns(feature_columns)
495  outputs = {}
496  with ops.name_scope(
497      None, default_name='transform_features', values=features.values()):
498    builder = _LazyBuilder(features)
499    for column in sorted(feature_columns, key=lambda x: x.name):
500      with ops.name_scope(None, default_name=column.name):
501        outputs[column] = builder.get(column)
502  return outputs
503
504
505@tf_export('feature_column.make_parse_example_spec')
506def make_parse_example_spec(feature_columns):
507  """Creates parsing spec dictionary from input feature_columns.
508
509  The returned dictionary can be used as arg 'features' in `tf.parse_example`.
510
511  Typical usage example:
512
513  ```python
514  # Define features and transformations
515  feature_a = categorical_column_with_vocabulary_file(...)
516  feature_b = numeric_column(...)
517  feature_c_bucketized = bucketized_column(numeric_column("feature_c"), ...)
518  feature_a_x_feature_c = crossed_column(
519      columns=["feature_a", feature_c_bucketized], ...)
520
521  feature_columns = set(
522      [feature_b, feature_c_bucketized, feature_a_x_feature_c])
523  features = tf.parse_example(
524      serialized=serialized_examples,
525      features=make_parse_example_spec(feature_columns))
526  ```
527
528  For the above example, make_parse_example_spec would return the dict:
529
530  ```python
531  {
532      "feature_a": parsing_ops.VarLenFeature(tf.string),
533      "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
534      "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
535  }
536  ```
537
538  Args:
539    feature_columns: An iterable containing all feature columns. All items
540      should be instances of classes derived from `_FeatureColumn`.
541
542  Returns:
543    A dict mapping each feature key to a `FixedLenFeature` or `VarLenFeature`
544    value.
545
546  Raises:
547    ValueError: If any of the given `feature_columns` is not a `_FeatureColumn`
548      instance.
549  """
550  result = {}
551  for column in feature_columns:
552    if not isinstance(column, _FeatureColumn):
553      raise ValueError(
554          'All feature_columns must be _FeatureColumn instances. '
555          'Given: {}'.format(column))
556    config = column._parse_example_spec  # pylint: disable=protected-access
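    # Merge this column's spec into the result, verifying that any key shared
    # with a previously processed column maps to an identical spec.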
557    for key, value in six.iteritems(config):
558      if key in result and value != result[key]:
559        raise ValueError(
560            'feature_columns contain different parse_spec for key '
561            '{}. Given {} and {}'.format(key, value, result[key]))
562    result.update(config)
563  return result
564
565
566@tf_export('feature_column.embedding_column')
567def embedding_column(
568    categorical_column, dimension, combiner='mean', initializer=None,
569    ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None,
570    trainable=True):
571  """`_DenseColumn` that converts from sparse, categorical input.
572
573  Use this when your inputs are sparse, but you want to convert them to a dense
574  representation (e.g., to feed to a DNN).
575
  Inputs must be a `_CategoricalColumn` created by any of the
  `categorical_column_*` functions. Here is an example of using
578  `embedding_column` with `DNNClassifier`:
579
580  ```python
581  video_id = categorical_column_with_identity(
582      key='video_id', num_buckets=1000000, default_value=0)
583  columns = [embedding_column(video_id, 9),...]
584
585  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)
586
587  label_column = ...
588  def input_fn():
589    features = tf.parse_example(
590        ..., features=make_parse_example_spec(columns + [label_column]))
591    labels = features.pop(label_column.name)
592    return features, labels
593
594  estimator.train(input_fn=input_fn, steps=100)
595  ```
596
597  Here is an example using `embedding_column` with model_fn:
598
599  ```python
600  def model_fn(features, ...):
601    video_id = categorical_column_with_identity(
602        key='video_id', num_buckets=1000000, default_value=0)
603    columns = [embedding_column(video_id, 9),...]
604    dense_tensor = input_layer(features, columns)
605    # Form DNN layers, calculate loss, and return EstimatorSpec.
606    ...
607  ```
608
609  Args:
610    categorical_column: A `_CategoricalColumn` created by a
611      `categorical_column_with_*` function. This column produces the sparse IDs
612      that are inputs to the embedding lookup.
613    dimension: An integer specifying dimension of the embedding, must be > 0.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an
      example-level normalization on the column. For more information, see
      `tf.nn.embedding_lookup_sparse`.
620    initializer: A variable initializer function to be used in embedding
621      variable initialization. If not specified, defaults to
622      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
623      `1/sqrt(dimension)`.
624    ckpt_to_load_from: String representing checkpoint name/pattern from which to
625      restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
626    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
627      which to restore the column weights. Required if `ckpt_to_load_from` is
628      not `None`.
629    max_norm: If not `None`, embedding values are l2-normalized to this value.
630    trainable: Whether or not the embedding is trainable. Default is True.
631
632  Returns:
633    `_DenseColumn` that converts from sparse input.
634
635  Raises:
    ValueError: if `dimension` is not > 0.
637    ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
638      is specified.
639    ValueError: if `initializer` is specified and is not callable.
640    RuntimeError: If eager execution is enabled.
641  """
642  if (dimension is None) or (dimension < 1):
643    raise ValueError('Invalid dimension {}.'.format(dimension))
644  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
645    raise ValueError('Must specify both `ckpt_to_load_from` and '
646                     '`tensor_name_in_ckpt` or none of them.')
647
648  if (initializer is not None) and (not callable(initializer)):
649    raise ValueError('initializer must be callable if specified. '
650                     'Embedding of column_name: {}'.format(
651                         categorical_column.name))
652  if initializer is None:
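    # A stddev of 1/sqrt(dimension) keeps the expected norm of each randomly
    # initialized embedding vector close to 1, independent of `dimension`.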
653    initializer = init_ops.truncated_normal_initializer(
654        mean=0.0, stddev=1 / math.sqrt(dimension))
655
656  return _EmbeddingColumn(
657      categorical_column=categorical_column,
658      dimension=dimension,
659      combiner=combiner,
660      initializer=initializer,
661      ckpt_to_load_from=ckpt_to_load_from,
662      tensor_name_in_ckpt=tensor_name_in_ckpt,
663      max_norm=max_norm,
664      trainable=trainable)
665
666
667@tf_export('feature_column.shared_embedding_columns')
668def shared_embedding_columns(
669    categorical_columns, dimension, combiner='mean', initializer=None,
670    shared_embedding_collection_name=None, ckpt_to_load_from=None,
671    tensor_name_in_ckpt=None, max_norm=None, trainable=True):
672  """List of dense columns that convert from sparse, categorical input.
673
  This is similar to `embedding_column`, except that it produces a list of
  embedding columns that share the same embedding weights.
676
677  Use this when your inputs are sparse and of the same type (e.g. watched and
678  impression video IDs that share the same vocabulary), and you want to convert
679  them to a dense representation (e.g., to feed to a DNN).
680
  Inputs must be a list of categorical columns created by any of the
  `categorical_column_*` functions. They must all be of the same type and have
  the same arguments except `key`. E.g. they can all be
  `categorical_column_with_vocabulary_file` with the same vocabulary_file. Some
  or all columns could also be `weighted_categorical_column`s.
686
687  Here is an example embedding of two features for a DNNClassifier model:
688
689  ```python
690  watched_video_id = categorical_column_with_vocabulary_file(
691      'watched_video_id', video_vocabulary_file, video_vocabulary_size)
692  impression_video_id = categorical_column_with_vocabulary_file(
693      'impression_video_id', video_vocabulary_file, video_vocabulary_size)
694  columns = shared_embedding_columns(
695      [watched_video_id, impression_video_id], dimension=10)
696
697  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)
698
699  label_column = ...
700  def input_fn():
701    features = tf.parse_example(
702        ..., features=make_parse_example_spec(columns + [label_column]))
703    labels = features.pop(label_column.name)
704    return features, labels
705
706  estimator.train(input_fn=input_fn, steps=100)
707  ```
708
709  Here is an example using `shared_embedding_columns` with model_fn:
710
711  ```python
712  def model_fn(features, ...):
713    watched_video_id = categorical_column_with_vocabulary_file(
714        'watched_video_id', video_vocabulary_file, video_vocabulary_size)
715    impression_video_id = categorical_column_with_vocabulary_file(
716        'impression_video_id', video_vocabulary_file, video_vocabulary_size)
717    columns = shared_embedding_columns(
718        [watched_video_id, impression_video_id], dimension=10)
719    dense_tensor = input_layer(features, columns)
720    # Form DNN layers, calculate loss, and return EstimatorSpec.
721    ...
722  ```
723
724  Args:
725    categorical_columns: List of categorical columns created by a
726      `categorical_column_with_*` function. These columns produce the sparse IDs
727      that are inputs to the embedding lookup. All columns must be of the same
      type and have the same arguments except `key`. E.g. they can all be
      `categorical_column_with_vocabulary_file` with the same vocabulary_file.
      Some or all columns could also be `weighted_categorical_column`s.
731    dimension: An integer specifying dimension of the embedding, must be > 0.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an
      example-level normalization on the column. For more information, see
      `tf.nn.embedding_lookup_sparse`.
738    initializer: A variable initializer function to be used in embedding
739      variable initialization. If not specified, defaults to
740      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
741      `1/sqrt(dimension)`.
742    shared_embedding_collection_name: Optional name of the collection where
743      shared embedding weights are added. If not given, a reasonable name will
744      be chosen based on the names of `categorical_columns`. This is also used
745      in `variable_scope` when creating shared embedding weights.
746    ckpt_to_load_from: String representing checkpoint name/pattern from which to
747      restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
748    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
749      which to restore the column weights. Required if `ckpt_to_load_from` is
750      not `None`.
751    max_norm: If not `None`, embedding values are l2-normalized to this value.
752    trainable: Whether or not the embedding is trainable. Default is True.
753
754  Returns:
755    A list of dense columns that converts from sparse input. The order of
756    results follows the ordering of `categorical_columns`.
757
758  Raises:
    ValueError: if `dimension` is not > 0.
760    ValueError: if any of the given `categorical_columns` is of different type
761      or has different arguments than the others.
762    ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
763      is specified.
764    ValueError: if `initializer` is specified and is not callable.
765  """
766  if (dimension is None) or (dimension < 1):
767    raise ValueError('Invalid dimension {}.'.format(dimension))
768  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
769    raise ValueError('Must specify both `ckpt_to_load_from` and '
770                     '`tensor_name_in_ckpt` or none of them.')
771
772  if (initializer is not None) and (not callable(initializer)):
773    raise ValueError('initializer must be callable if specified.')
774  if initializer is None:
775    initializer = init_ops.truncated_normal_initializer(
776        mean=0.0, stddev=1. / math.sqrt(dimension))
777
778  # Sort the columns so the default collection name is deterministic even if the
779  # user passes columns from an unsorted collection, such as dict.values().
780  sorted_columns = sorted(categorical_columns, key=lambda x: x.name)
781
782  c0 = sorted_columns[0]
783  if not isinstance(c0, _CategoricalColumn):
784    raise ValueError(
785        'All categorical_columns must be subclasses of _CategoricalColumn. '
786        'Given: {}, of type: {}'.format(c0, type(c0)))
787  if isinstance(c0, _WeightedCategoricalColumn):
788    c0 = c0.categorical_column
789  for c in sorted_columns[1:]:
790    if isinstance(c, _WeightedCategoricalColumn):
791      c = c.categorical_column
792    if not isinstance(c, type(c0)):
793      raise ValueError(
794          'To use shared_embedding_column, all categorical_columns must have '
795          'the same type, or be weighted_categorical_column of the same type. '
796          'Given column: {} of type: {} does not match given column: {} of '
797          'type: {}'.format(c0, type(c0), c, type(c)))
798
799  if not shared_embedding_collection_name:
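    # E.g. columns named 'impression_video_id' and 'watched_video_id' yield
    # 'impression_video_id_watched_video_id_shared_embedding'.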
800    shared_embedding_collection_name = '_'.join(c.name for c in sorted_columns)
801    shared_embedding_collection_name += '_shared_embedding'
802
803  result = []
804  for column in categorical_columns:
805    result.append(_SharedEmbeddingColumn(
806        categorical_column=column,
807        dimension=dimension,
808        combiner=combiner,
809        initializer=initializer,
810        shared_embedding_collection_name=shared_embedding_collection_name,
811        ckpt_to_load_from=ckpt_to_load_from,
812        tensor_name_in_ckpt=tensor_name_in_ckpt,
813        max_norm=max_norm,
814        trainable=trainable))
815  return result
816
817
818@tf_export('feature_column.numeric_column')
819def numeric_column(key,
820                   shape=(1,),
821                   default_value=None,
822                   dtype=dtypes.float32,
823                   normalizer_fn=None):
824  """Represents real valued or numerical features.
825
826  Example:
827
828  ```python
829  price = numeric_column('price')
830  columns = [price, ...]
831  features = tf.parse_example(..., features=make_parse_example_spec(columns))
832  dense_tensor = input_layer(features, columns)
833
834  # or
835  bucketized_price = bucketized_column(price, boundaries=[...])
836  columns = [bucketized_price, ...]
837  features = tf.parse_example(..., features=make_parse_example_spec(columns))
838  linear_prediction = linear_model(features, columns)
839  ```
840
841  Args:
842    key: A unique string identifying the input feature. It is used as the
843      column name and the dictionary key for feature parsing configs, feature
844      `Tensor` objects, and feature columns.
    shape: An iterable of integers specifying the shape of the `Tensor`. A
      single integer can also be given, meaning a one-dimensional `Tensor` with
      the given width. The `Tensor` representing the column will have the shape
      of [batch_size] + `shape`.
849    default_value: A single value compatible with `dtype` or an iterable of
850      values compatible with `dtype` which the column takes on during
851      `tf.Example` parsing if data is missing. A default value of `None` will
852      cause `tf.parse_example` to fail if an example does not contain this
853      column. If a single value is provided, the same value will be applied as
854      the default value for every item. If an iterable of values is provided,
855      the shape of the `default_value` should be equal to the given `shape`.
    dtype: The type of values. Default value is `tf.float32`. Must be a
      non-quantized, real integer or floating point type.
    normalizer_fn: If not `None`, a function that can be used to normalize the
      value of the tensor after `default_value` is applied for parsing.
      Normalizer function takes the input `Tensor` as its argument, and returns
      the output `Tensor` (e.g. lambda x: (x - 3.0) / 4.2). Please note that
      even though the most common use case of this function is normalization,
      it can be used for any kind of TensorFlow transformation.
864
865  Returns:
866    A `_NumericColumn`.
867
868  Raises:
869    TypeError: if any dimension in shape is not an int
870    ValueError: if any dimension in shape is not a positive integer
871    TypeError: if `default_value` is an iterable but not compatible with `shape`
872    TypeError: if `default_value` is not compatible with `dtype`.
873    ValueError: if `dtype` is not convertible to `tf.float32`.
874  """
875  shape = _check_shape(shape, key)
876  if not (dtype.is_integer or dtype.is_floating):
877    raise ValueError('dtype must be convertible to float. '
878                     'dtype: {}, key: {}'.format(dtype, key))
879  default_value = _check_default_value(shape, default_value, dtype, key)
880
881  if normalizer_fn is not None and not callable(normalizer_fn):
882    raise TypeError(
883        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))
884
885  return _NumericColumn(
886      key,
887      shape=shape,
888      default_value=default_value,
889      dtype=dtype,
890      normalizer_fn=normalizer_fn)
891
892
893@tf_export('feature_column.bucketized_column')
894def bucketized_column(source_column, boundaries):
895  """Represents discretized dense input.
896
897  Buckets include the left boundary, and exclude the right boundary. Namely,
898  `boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`,
899  `[1., 2.)`, and `[2., +inf)`.
900
901  For example, if the inputs are
902
903  ```python
  boundaries = [0, 10, 100]
  input tensor = [[-5, 10000],
                  [150,   10],
                  [5,    100]]
908  ```
909
910  then the output will be
911
912  ```python
  output = [[0, 3],
            [3, 2],
            [1, 3]]
916  ```
917
918  Example:
919
920  ```python
921  price = numeric_column('price')
922  bucketized_price = bucketized_column(price, boundaries=[...])
923  columns = [bucketized_price, ...]
924  features = tf.parse_example(..., features=make_parse_example_spec(columns))
925  linear_prediction = linear_model(features, columns)
926
927  # or
928  columns = [bucketized_price, ...]
929  features = tf.parse_example(..., features=make_parse_example_spec(columns))
930  dense_tensor = input_layer(features, columns)
931  ```
932
933  `bucketized_column` can also be crossed with another categorical column using
934  `crossed_column`:
935
936  ```python
937  price = numeric_column('price')
  # bucketized_column converts a numerical feature to a categorical one.
939  bucketized_price = bucketized_column(price, boundaries=[...])
940  # 'keywords' is a string feature.
  price_x_keywords = crossed_column([bucketized_price, 'keywords'], 50000)
942  columns = [price_x_keywords, ...]
943  features = tf.parse_example(..., features=make_parse_example_spec(columns))
944  linear_prediction = linear_model(features, columns)
945  ```
946
947  Args:
948    source_column: A one-dimensional dense column which is generated with
949      `numeric_column`.
950    boundaries: A sorted list or tuple of floats specifying the boundaries.
951
952  Returns:
953    A `_BucketizedColumn`.
954
955  Raises:
956    ValueError: If `source_column` is not a numeric column, or if it is not
957      one-dimensional.
958    ValueError: If `boundaries` is not a sorted list or tuple.
959  """
960  if not isinstance(source_column, _NumericColumn):
961    raise ValueError(
962        'source_column must be a column generated with numeric_column(). '
963        'Given: {}'.format(source_column))
964  if len(source_column.shape) > 1:
965    raise ValueError(
966        'source_column must be one-dimensional column. '
967        'Given: {}'.format(source_column))
  if not boundaries or not isinstance(boundaries, (list, tuple)):
970    raise ValueError('boundaries must be a sorted list.')
971  for i in range(len(boundaries) - 1):
972    if boundaries[i] >= boundaries[i + 1]:
973      raise ValueError('boundaries must be a sorted list.')
974  return _BucketizedColumn(source_column, tuple(boundaries))
975
976
def _assert_string_or_int(dtype, prefix):
  """Raises ValueError if `dtype` is neither string nor integer."""
  if (dtype != dtypes.string) and (not dtype.is_integer):
979    raise ValueError(
980        '{} dtype must be string or integer. dtype: {}.'.format(prefix, dtype))
981
982
983@tf_export('feature_column.categorical_column_with_hash_bucket')
984def categorical_column_with_hash_bucket(key,
985                                        hash_bucket_size,
986                                        dtype=dtypes.string):
987  """Represents sparse feature where ids are set by hashing.
988
989  Use this when your sparse features are in string or integer format, and you
990  want to distribute your inputs into a finite number of buckets by hashing.
991  output_id = Hash(input_feature_string) % bucket_size
992
993  For input dictionary `features`, `features[key]` is either `Tensor` or
994  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
995  and `''` for string. Note that these values are independent of the
996  `default_value` argument.
997
998  Example:
999
1000  ```python
  keywords = categorical_column_with_hash_bucket("keywords", 10000)
1002  columns = [keywords, ...]
1003  features = tf.parse_example(..., features=make_parse_example_spec(columns))
1004  linear_prediction = linear_model(features, columns)
1005
1006  # or
1007  keywords_embedded = embedding_column(keywords, 16)
1008  columns = [keywords_embedded, ...]
1009  features = tf.parse_example(..., features=make_parse_example_spec(columns))
1010  dense_tensor = input_layer(features, columns)
1011  ```
1012
1013  Args:
1014    key: A unique string identifying the input feature. It is used as the
1015      column name and the dictionary key for feature parsing configs, feature
1016      `Tensor` objects, and feature columns.
    hash_bucket_size: An int >= 1. The number of buckets.
1018    dtype: The type of features. Only string and integer types are supported.
1019
1020  Returns:
1021    A `_HashedCategoricalColumn`.
1022
1023  Raises:
    ValueError: `hash_bucket_size` is less than 1.
1025    ValueError: `dtype` is neither string nor integer.
1026  """
1027  if hash_bucket_size is None:
    raise ValueError('hash_bucket_size must be set. key: {}'.format(key))
1029
1030  if hash_bucket_size < 1:
1031    raise ValueError('hash_bucket_size must be at least 1. '
1032                     'hash_bucket_size: {}, key: {}'.format(
1033                         hash_bucket_size, key))
1034
1035  _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
1036
1037  return _HashedCategoricalColumn(key, hash_bucket_size, dtype)
1038
1039
1040@tf_export('feature_column.categorical_column_with_vocabulary_file')
1041def categorical_column_with_vocabulary_file(key,
1042                                            vocabulary_file,
1043                                            vocabulary_size=None,
1044                                            num_oov_buckets=0,
1045                                            default_value=None,
1046                                            dtype=dtypes.string):
1047  """A `_CategoricalColumn` with a vocabulary file.
1048
1049  Use this when your inputs are in string or integer format, and you have a
1050  vocabulary file that maps each value to an integer ID. By default,
1051  out-of-vocabulary values are ignored. Use either (but not both) of
1052  `num_oov_buckets` and `default_value` to specify how to include
1053  out-of-vocabulary values.
1054
1055  For input dictionary `features`, `features[key]` is either `Tensor` or
1056  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
1057  and `''` for string. Note that these values are independent of the
1058  `default_value` argument.
1059
1060  Example with `num_oov_buckets`:
  File '/us/states.txt' contains 50 lines, each with a 2-character U.S. state
  abbreviation. All inputs with values in that file are assigned an ID 0-49,
  corresponding to their line number. All other values are hashed and assigned
  an ID 50-54.
1065
1066  ```python
1067  states = categorical_column_with_vocabulary_file(
1068      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
1069      num_oov_buckets=5)
1070  columns = [states, ...]
1071  features = tf.parse_example(..., features=make_parse_example_spec(columns))
1072  linear_prediction = linear_model(features, columns)
1073  ```
1074
1075  Example with `default_value`:
1076  File '/us/states.txt' contains 51 lines - the first line is 'XX', and the
1077  other 50 each have a 2-character U.S. state abbreviation. Both a literal 'XX'
1078  in input, and other values missing from the file, will be assigned ID 0. All
1079  others are assigned the corresponding line number 1-50.
1080
1081  ```python
1082  states = categorical_column_with_vocabulary_file(
1083      key='states', vocabulary_file='/us/states.txt', vocabulary_size=51,
1084      default_value=0)
1085  columns = [states, ...]
1086  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
1088  ```
1089
  And to make an embedding with either of the above:
1091
1092  ```python
1093  columns = [embedding_column(states, 3),...]
1094  features = tf.parse_example(..., features=make_parse_example_spec(columns))
1095  dense_tensor = input_layer(features, columns)
1096  ```
1097
1098  Args:
1099    key: A unique string identifying the input feature. It is used as the
1100      column name and the dictionary key for feature parsing configs, feature
1101      `Tensor` objects, and feature columns.
1102    vocabulary_file: The vocabulary file name.
    vocabulary_size: Number of the elements in the vocabulary. This must be no
      greater than the length of `vocabulary_file`; if less, later values are
      ignored. If `None`, it is set to the length of `vocabulary_file`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
      the input value. A positive `num_oov_buckets` cannot be specified with
      `default_value`.
    default_value: The integer ID value to return for out-of-vocabulary feature
      values, defaults to `-1`. This cannot be specified with a positive
      `num_oov_buckets`.
1114    dtype: The type of features. Only string and integer types are supported.
1115
1116  Returns:
1117    A `_CategoricalColumn` with a vocabulary file.
1118
1119  Raises:
1120    ValueError: `vocabulary_file` is missing or cannot be opened.
1121    ValueError: `vocabulary_size` is missing or < 1.
1122    ValueError: `num_oov_buckets` is a negative integer.
1123    ValueError: `num_oov_buckets` and `default_value` are both specified.
1124    ValueError: `dtype` is neither string nor integer.
1125  """
1126  if not vocabulary_file:
1127    raise ValueError('Missing vocabulary_file in {}.'.format(key))
1128
1129  if vocabulary_size is None:
1130    if not gfile.Exists(vocabulary_file):
1131      raise ValueError('vocabulary_file in {} does not exist.'.format(key))
1132
1133    with gfile.GFile(vocabulary_file) as f:
1134      vocabulary_size = sum(1 for _ in f)
1135    logging.info(
1136        'vocabulary_size = %d in %s is inferred from the number of elements '
1137        'in the vocabulary_file %s.', vocabulary_size, key, vocabulary_file)
1138
1139  # `vocabulary_size` isn't required for lookup, but it is for `_num_buckets`.
1140  if vocabulary_size < 1:
1141    raise ValueError('Invalid vocabulary_size in {}.'.format(key))
1142  if num_oov_buckets:
1143    if default_value is not None:
1144      raise ValueError(
1145          'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
1146              key))
1147    if num_oov_buckets < 0:
1148      raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
1149          num_oov_buckets, key))
1150  _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
1151  return _VocabularyFileCategoricalColumn(
1152      key=key,
1153      vocabulary_file=vocabulary_file,
1154      vocabulary_size=vocabulary_size,
1155      num_oov_buckets=0 if num_oov_buckets is None else num_oov_buckets,
1156      default_value=-1 if default_value is None else default_value,
1157      dtype=dtype)
1158
1159
1160@tf_export('feature_column.categorical_column_with_vocabulary_list')
1161def categorical_column_with_vocabulary_list(
1162    key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
1163  """A `_CategoricalColumn` with in-memory vocabulary.
1164
1165  Use this when your inputs are in string or integer format, and you have an
1166  in-memory vocabulary mapping each value to an integer ID. By default,
1167  out-of-vocabulary values are ignored. Use either (but not both) of
1168  `num_oov_buckets` and `default_value` to specify how to include
1169  out-of-vocabulary values.
1170
1171  For input dictionary `features`, `features[key]` is either `Tensor` or
1172  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
1173  and `''` for string. Note that these values are independent of the
1174  `default_value` argument.
1175
1176  Example with `num_oov_buckets`:
1177  In the following example, each input in `vocabulary_list` is assigned an ID
1178  0-3 corresponding to its index (e.g., input 'B' produces output 2). All other
1179  inputs are hashed and assigned an ID 4-5.
1180
1181  ```python
1182  colors = categorical_column_with_vocabulary_list(
1183      key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
1184      num_oov_buckets=2)
1185  columns = [colors, ...]
1186  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
1188  ```
1189
1190  Example with `default_value`:
1191  In the following example, each input in `vocabulary_list` is assigned an ID
1192  0-4 corresponding to its index (e.g., input 'B' produces output 3). All other
1193  inputs are assigned `default_value` 0.
1194
1195
1196  ```python
1197  colors = categorical_column_with_vocabulary_list(
1198      key='colors', vocabulary_list=('X', 'R', 'G', 'B', 'Y'), default_value=0)
1199  columns = [colors, ...]
1200  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
1202  ```
1203
  And to make an embedding with either of the above:
1205
1206  ```python
1207  columns = [embedding_column(colors, 3),...]
1208  features = tf.parse_example(..., features=make_parse_example_spec(columns))
1209  dense_tensor = input_layer(features, columns)
1210  ```
1211
1212  Args:
1213    key: A unique string identifying the input feature. It is used as the
1214      column name and the dictionary key for feature parsing configs, feature
1215      `Tensor` objects, and feature columns.
1216    vocabulary_list: An ordered iterable defining the vocabulary. Each feature
1217      is mapped to the index of its value (if present) in `vocabulary_list`.
1218      Must be castable to `dtype`.
1219    dtype: The type of features. Only string and integer types are supported.
1220      If `None`, it will be inferred from `vocabulary_list`.
    default_value: The integer ID value to return for out-of-vocabulary feature
      values, defaults to `-1`. This cannot be specified with a positive
      `num_oov_buckets`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on a
      hash of the input value. A positive `num_oov_buckets` cannot be specified
      with `default_value`.
1229
1230  Returns:
1231    A `_CategoricalColumn` with in-memory vocabulary.
1232
1233  Raises:
1234    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
1235    ValueError: `num_oov_buckets` is a negative integer.
1236    ValueError: `num_oov_buckets` and `default_value` are both specified.
1237    ValueError: if `dtype` is not integer or string.
1238  """
1239  if (vocabulary_list is None) or (len(vocabulary_list) < 1):
1240    raise ValueError(
1241        'vocabulary_list {} must be non-empty, column_name: {}'.format(
1242            vocabulary_list, key))
1243  if len(set(vocabulary_list)) != len(vocabulary_list):
1244    raise ValueError(
1245        'Duplicate keys in vocabulary_list {}, column_name: {}'.format(
1246            vocabulary_list, key))
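  # Infer the vocabulary dtype via numpy so that Python ints, numpy ints, and
  # strings are all handled uniformly.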
1247  vocabulary_dtype = dtypes.as_dtype(np.array(vocabulary_list).dtype)
1248  if num_oov_buckets:
1249    if default_value != -1:
1250      raise ValueError(
1251          'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
1252              key))
1253    if num_oov_buckets < 0:
1254      raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
1255          num_oov_buckets, key))
1256  _assert_string_or_int(
1257      vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key))
1258  if dtype is None:
1259    dtype = vocabulary_dtype
1260  elif dtype.is_integer != vocabulary_dtype.is_integer:
1261    raise ValueError(
1262        'dtype {} and vocabulary dtype {} do not match, column_name: {}'.format(
1263            dtype, vocabulary_dtype, key))
1264  _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
1265
1266  return _VocabularyListCategoricalColumn(
1267      key=key, vocabulary_list=tuple(vocabulary_list), dtype=dtype,
1268      default_value=default_value, num_oov_buckets=num_oov_buckets)
1269
1270
1271@tf_export('feature_column.categorical_column_with_identity')
1272def categorical_column_with_identity(key, num_buckets, default_value=None):
1273  """A `_CategoricalColumn` that returns identity values.
1274
  Use this when your inputs are integers in the range `[0, num_buckets)`, and
  you want to use the input value itself as the categorical ID. Values outside
  this range will be mapped to `default_value` if it is specified; otherwise
  the op will fail.
1279
  Typically, this is used for contiguous ranges of integer indexes, but
  it doesn't have to be. This might be inefficient, however, if many IDs
  are unused. Consider `categorical_column_with_hash_bucket` in that case.
1283
  For the input dictionary `features`, `features[key]` is either a `Tensor` or
  a `SparseTensor`. If it is a `Tensor`, missing values can be represented by
  `-1` for int and `''` for string. Note that these values are independent of
  the `default_value` argument.
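
  A hedged illustration of the missing-value handling (values are made up):

  ```python
  # A dense int input [[3, -1], [5, 2]] with num_buckets=10 is converted to a
  # SparseTensor internally; the -1 entry is treated as missing and dropped,
  # leaving ids 3, 5 and 2 at their original positions.
  ```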
1288
  In the following examples, each input in the range `[0, 1000000)` is assigned
  its own value as the categorical ID. All other inputs are assigned the
  `default_value` 0. Note that a literal 0 in the inputs will result in the same
  default ID.
1292
1293  Linear model:
1294
1295  ```python
1296  video_id = categorical_column_with_identity(
1297      key='video_id', num_buckets=1000000, default_value=0)
1298  columns = [video_id, ...]
1299  features = tf.parse_example(..., features=make_parse_example_spec(columns))
1300  linear_prediction, _, _ = linear_model(features, columns)
1301  ```
1302
1303  Embedding for a DNN model:
1304
1305  ```python
  columns = [embedding_column(video_id, 9), ...]
1307  features = tf.parse_example(..., features=make_parse_example_spec(columns))
1308  dense_tensor = input_layer(features, columns)
1309  ```
1310
1311  Args:
1312    key: A unique string identifying the input feature. It is used as the
1313      column name and the dictionary key for feature parsing configs, feature
1314      `Tensor` objects, and feature columns.
1315    num_buckets: Range of inputs and outputs is `[0, num_buckets)`.
    default_value: If `None`, this column's graph operations will fail for
      out-of-range inputs. Otherwise, this value must be in the range
      `[0, num_buckets)`, and out-of-range inputs will be replaced with it.
1319
1320  Returns:
1321    A `_CategoricalColumn` that returns identity values.
1322
1323  Raises:
1324    ValueError: if `num_buckets` is less than one.
1325    ValueError: if `default_value` is not in range `[0, num_buckets)`.
1326  """
1327  if num_buckets < 1:
1328    raise ValueError(
1329        'num_buckets {} < 1, column_name {}'.format(num_buckets, key))
1330  if (default_value is not None) and (
1331      (default_value < 0) or (default_value >= num_buckets)):
1332    raise ValueError(
1333        'default_value {} not in range [0, {}), column_name {}'.format(
1334            default_value, num_buckets, key))
1335  return _IdentityCategoricalColumn(
1336      key=key, num_buckets=num_buckets, default_value=default_value)
1337
1338
1339@tf_export('feature_column.indicator_column')
1340def indicator_column(categorical_column):
1341  """Represents multi-hot representation of given categorical column.
1342
1343  Used to wrap any `categorical_column_*` (e.g., to feed to DNN). Use
1344  `embedding_column` if the inputs are sparse.
1345
1346  ```python
  name = indicator_column(categorical_column_with_vocabulary_list(
      'name', ['bob', 'george', 'wanda']))
1349  columns = [name, ...]
1350  features = tf.parse_example(..., features=make_parse_example_spec(columns))
1351  dense_tensor = input_layer(features, columns)
1352
1353  dense_tensor == [[1, 0, 0]]  # If "name" bytes_list is ["bob"]
1354  dense_tensor == [[1, 0, 1]]  # If "name" bytes_list is ["bob", "wanda"]
1355  dense_tensor == [[2, 0, 0]]  # If "name" bytes_list is ["bob", "bob"]
1356  ```
1357
1358  Args:
1359    categorical_column: A `_CategoricalColumn` which is created by
1360      `categorical_column_with_*` or `crossed_column` functions.
1361
1362  Returns:
1363    An `_IndicatorColumn`.
1364  """
1365  return _IndicatorColumn(categorical_column)
1366
1367
1368@tf_export('feature_column.weighted_categorical_column')
1369def weighted_categorical_column(
1370    categorical_column, weight_feature_key, dtype=dtypes.float32):
1371  """Applies weight values to a `_CategoricalColumn`.
1372
1373  Use this when each of your sparse inputs has both an ID and a value. For
1374  example, if you're representing text documents as a collection of word
1375  frequencies, you can provide 2 parallel sparse input features ('terms' and
1376  'frequencies' below).
1377
1378  Example:
1379
1380  Input `tf.Example` objects:
1381
1382  ```proto
1383  [
1384    features {
1385      feature {
1386        key: "terms"
1387        value {bytes_list {value: "very" value: "model"}}
1388      }
1389      feature {
1390        key: "frequencies"
1391        value {float_list {value: 0.3 value: 0.1}}
1392      }
1393    },
1394    features {
1395      feature {
1396        key: "terms"
1397        value {bytes_list {value: "when" value: "course" value: "human"}}
1398      }
1399      feature {
1400        key: "frequencies"
1401        value {float_list {value: 0.4 value: 0.1 value: 0.2}}
1402      }
1403    }
1404  ]
1405  ```
1406
1407  ```python
  categorical_column = categorical_column_with_hash_bucket(
      key='terms', hash_bucket_size=1000)
1410  weighted_column = weighted_categorical_column(
1411      categorical_column=categorical_column, weight_feature_key='frequencies')
1412  columns = [weighted_column, ...]
1413  features = tf.parse_example(..., features=make_parse_example_spec(columns))
1414  linear_prediction, _, _ = linear_model(features, columns)
1415  ```
1416
  This assumes the input dictionary contains a `SparseTensor` for key
  'terms' and a `SparseTensor` for key 'frequencies'. These two tensors must
  have the same indices and dense shape.
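
  A hedged sketch of the parsed tensors for the first `tf.Example` above
  (batch of one):

  ```python
  # features['terms']       -> SparseTensor(indices=[[0, 0], [0, 1]],
  #                                         values=['very', 'model'],
  #                                         dense_shape=[1, 2])
  # features['frequencies'] -> SparseTensor(indices=[[0, 0], [0, 1]],
  #                                         values=[0.3, 0.1],
  #                                         dense_shape=[1, 2])
  ```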
1420
1421  Args:
1422    categorical_column: A `_CategoricalColumn` created by
1423      `categorical_column_with_*` functions.
1424    weight_feature_key: String key for weight values.
1425    dtype: Type of weights, such as `tf.float32`. Only float and integer weights
1426      are supported.
1427
1428  Returns:
    A `_CategoricalColumn` composed of two sparse features: one represents the
    IDs, the other represents the weight (value) of each ID in that example.
1431
1432  Raises:
1433    ValueError: if `dtype` is not convertible to float.
1434  """
1435  if (dtype is None) or not (dtype.is_integer or dtype.is_floating):
1436    raise ValueError('dtype {} is not convertible to float.'.format(dtype))
1437  return _WeightedCategoricalColumn(
1438      categorical_column=categorical_column,
1439      weight_feature_key=weight_feature_key,
1440      dtype=dtype)
1441
1442
1443@tf_export('feature_column.crossed_column')
1444def crossed_column(keys, hash_bucket_size, hash_key=None):
1445  """Returns a column for performing crosses of categorical features.
1446
1447  Crossed features will be hashed according to `hash_bucket_size`. Conceptually,
1448  the transformation can be thought of as:
1449    Hash(cartesian product of features) % `hash_bucket_size`
1450
1451  For example, if the input features are:
1452
1453  * SparseTensor referred by first key:
1454
1455    ```python
1456    shape = [2, 2]
1457    {
1458        [0, 0]: "a"
1459        [1, 0]: "b"
1460        [1, 1]: "c"
1461    }
1462    ```
1463
1464  * SparseTensor referred by second key:
1465
1466    ```python
1467    shape = [2, 1]
1468    {
1469        [0, 0]: "d"
1470        [1, 0]: "e"
1471    }
1472    ```
1473
  then the crossed feature will look like:

  ```python
  shape = [2, 2]
  {
      [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size
      [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size
      [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size
  }
  ```
1484
1485  Here is an example to create a linear model with crosses of string features:
1486
1487  ```python
  keywords_x_doc_terms = crossed_column(['keywords', 'doc_terms'], 50000)
1489  columns = [keywords_x_doc_terms, ...]
1490  features = tf.parse_example(..., features=make_parse_example_spec(columns))
1491  linear_prediction = linear_model(features, columns)
1492  ```
1493
1494  You could also use vocabulary lookup before crossing:
1495
1496  ```python
  keywords = categorical_column_with_vocabulary_file(
      'keywords', '/path/to/vocabulary/file', vocabulary_size=1000)
  keywords_x_doc_terms = crossed_column([keywords, 'doc_terms'], 50000)
1500  columns = [keywords_x_doc_terms, ...]
1501  features = tf.parse_example(..., features=make_parse_example_spec(columns))
1502  linear_prediction = linear_model(features, columns)
1503  ```
1504
1505  If an input feature is of numeric type, you can use
1506  `categorical_column_with_identity`, or `bucketized_column`, as in the example:
1507
1508  ```python
1509  # vertical_id is an integer categorical feature.
  vertical_id = categorical_column_with_identity('vertical_id', 10000)
1511  price = numeric_column('price')
1512  # bucketized_column converts numerical feature to a categorical one.
1513  bucketized_price = bucketized_column(price, boundaries=[...])
  vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000)
1515  columns = [vertical_id_x_price, ...]
1516  features = tf.parse_example(..., features=make_parse_example_spec(columns))
1517  linear_prediction = linear_model(features, columns)
1518  ```
1519
  To use a crossed column in a DNN model, you need to wrap it in an embedding
  column, as in this example:
1522
1523  ```python
  vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000)
1525  vertical_id_x_price_embedded = embedding_column(vertical_id_x_price, 10)
1526  dense_tensor = input_layer(features, [vertical_id_x_price_embedded, ...])
1527  ```
1528
1529  Args:
1530    keys: An iterable identifying the features to be crossed. Each element can
1531      be either:
1532      * string: Will use the corresponding feature which must be of string type.
1533      * `_CategoricalColumn`: Will use the transformed tensor produced by this
1534        column. Does not support hashed categorical column.
    hash_bucket_size: An int > 1. The number of buckets.
    hash_key: Optional hash key, which will be used by the `FingerprintCat64`
      function to combine the fingerprints of the cross in `SparseCrossOp`.
1538
1539  Returns:
1540    A `_CrossedColumn`.
1541
1542  Raises:
1543    ValueError: If `len(keys) < 2`.
1544    ValueError: If any of the keys is neither a string nor `_CategoricalColumn`.
1545    ValueError: If any of the keys is `_HashedCategoricalColumn`.
1546    ValueError: If `hash_bucket_size < 1`.
1547  """
  if not hash_bucket_size or hash_bucket_size < 1:
    raise ValueError('hash_bucket_size must be at least 1. '
                     'hash_bucket_size: {}'.format(hash_bucket_size))
1551  if not keys or len(keys) < 2:
1552    raise ValueError(
1553        'keys must be a list with length > 1. Given: {}'.format(keys))
1554  for key in keys:
1555    if (not isinstance(key, six.string_types) and
1556        not isinstance(key, _CategoricalColumn)):
      raise ValueError(
          'Unsupported key type. All keys must be either a string or a '
          'categorical column, except _HashedCategoricalColumn. '
          'Given: {}'.format(key))
1561    if isinstance(key, _HashedCategoricalColumn):
1562      raise ValueError(
1563          'categorical_column_with_hash_bucket is not supported for crossing. '
1564          'Hashing before crossing will increase probability of collision. '
1565          'Instead, use the feature name as a string. Given: {}'.format(key))
1566  return _CrossedColumn(
1567      keys=tuple(keys), hash_bucket_size=hash_bucket_size,
1568      hash_key=hash_key)
1569
1570
1571class _FeatureColumn(object):
1572  """Represents a feature column abstraction.
1573
1574  WARNING: Do not subclass this layer unless you know what you are doing:
1575  the API is subject to future changes.
1576
  To distinguish between the concept of a feature family and a specific binary
  feature within a family, we refer to a feature family like "country" as a
  feature column. The following is an example feature in the `tf.Example`
  format:
    {key: "country", value: [ "US" ]}
  In this example the value of the feature is "US" and "country" is the feature
  column.

  This is an abstract class. Users should not create instances of it.
1585  """
1586  __metaclass__ = abc.ABCMeta
1587
1588  @abc.abstractproperty
1589  def name(self):
1590    """Returns string. Used for naming and for name_scope."""
1591    pass
1592
1593  @property
1594  def _var_scope_name(self):
1595    """Returns string. Used for variable_scope. Defaults to self.name."""
1596    return self.name
1597
1598  @abc.abstractmethod
1599  def _transform_feature(self, inputs):
1600    """Returns intermediate representation (usually a `Tensor`).
1601
1602    Uses `inputs` to create an intermediate representation (usually a `Tensor`)
1603    that other feature columns can use.
1604
    Example usage of `inputs`:
    Let's say a feature column depends on a raw feature ('raw') and another
    `_FeatureColumn` (input_fc). To access the corresponding `Tensor`s, `inputs`
    will be used as follows:
1609
1610    ```python
1611    raw_tensor = inputs.get('raw')
1612    fc_tensor = inputs.get(input_fc)
1613    ```
1614
1615    Args:
1616      inputs: A `_LazyBuilder` object to access inputs.
1617
1618    Returns:
1619      Transformed feature `Tensor`.
1620    """
1621    pass
1622
1623  @abc.abstractproperty
1624  def _parse_example_spec(self):
1625    """Returns a `tf.Example` parsing spec as dict.
1626
    This is used to generate the parsing spec for `tf.parse_example`. The
    returned spec is a dict from keys ('string') to `VarLenFeature`,
    `FixedLenFeature`, and other supported objects. Please check the
    documentation of ${tf.parse_example} for all supported spec objects.
1631
    Let's say a feature column depends on a raw feature ('raw') and another
    `_FeatureColumn` (input_fc). One possible implementation of
    `_parse_example_spec` is as follows:
1635
1636    ```python
1637    spec = {'raw': tf.FixedLenFeature(...)}
1638    spec.update(input_fc._parse_example_spec)
1639    return spec
1640    ```
1641    """
1642    pass
1643
1644
1645class _DenseColumn(_FeatureColumn):
1646  """Represents a column which can be represented as `Tensor`.
1647
1648  WARNING: Do not subclass this layer unless you know what you are doing:
1649  the API is subject to future changes.
1650
  Some examples of this type are: `numeric_column`, `embedding_column`,
  `indicator_column`.
1653  """
1654
1655  __metaclass__ = abc.ABCMeta
1656
1657  @abc.abstractproperty
1658  def _variable_shape(self):
1659    """`TensorShape` of `_get_dense_tensor`, without batch dimension."""
1660    pass
1661
1662  @abc.abstractmethod
1663  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
1664    """Returns a `Tensor`.
1665
    The output of this function will be used by model-builder functions. For
    example, the pseudocode of `input_layer` looks like:
1668
1669    ```python
1670    def input_layer(features, feature_columns, ...):
1671      outputs = [fc._get_dense_tensor(...) for fc in feature_columns]
1672      return tf.concat(outputs)
1673    ```
1674
1675    Args:
1676      inputs: A `_LazyBuilder` object to access inputs.
      weight_collections: List of graph collections to which Variables (if any
        are created) are added.
1679      trainable: If `True` also add variables to the graph collection
1680        `GraphKeys.TRAINABLE_VARIABLES` (see ${tf.Variable}).
1681
1682    Returns:
1683      `Tensor` of shape [batch_size] + `_variable_shape`.
1684    """
1685    pass
1686
1687
1688def _create_weighted_sum(
1689    column,
1690    builder,
1691    units,
1692    sparse_combiner,
1693    weight_collections,
1694    trainable):
1695  """Creates a weighted sum for a dense or sparse column for linear_model."""
1696  if isinstance(column, _CategoricalColumn):
1697    return _create_categorical_column_weighted_sum(
1698        column=column,
1699        builder=builder,
1700        units=units,
1701        sparse_combiner=sparse_combiner,
1702        weight_collections=weight_collections,
1703        trainable=trainable)
1704  else:
1705    return _create_dense_column_weighted_sum(
1706        column=column,
1707        builder=builder,
1708        units=units,
1709        weight_collections=weight_collections,
1710        trainable=trainable)
1711
1712
1713def _create_dense_column_weighted_sum(
1714    column, builder, units, weight_collections, trainable):
1715  """Create a weighted sum of a dense column for linear_model."""
1716  tensor = column._get_dense_tensor(  # pylint: disable=protected-access
1717      builder,
1718      weight_collections=weight_collections,
1719      trainable=trainable)
1720  num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
1721  batch_size = array_ops.shape(tensor)[0]
1722  tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
1723  weight = variable_scope.get_variable(
1724      name='weights',
1725      shape=[num_elements, units],
1726      initializer=init_ops.zeros_initializer(),
1727      trainable=trainable,
1728      collections=weight_collections)
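  # [batch_size, num_elements] x [num_elements, units] -> [batch_size, units].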
1729  return math_ops.matmul(tensor, weight, name='weighted_sum')
1730
1731
1732class _CategoricalColumn(_FeatureColumn):
1733  """Represents a categorical feature.
1734
1735  WARNING: Do not subclass this layer unless you know what you are doing:
1736  the API is subject to future changes.
1737
  A categorical feature is typically handled with a ${tf.SparseTensor} of IDs.
1739  """
1740  __metaclass__ = abc.ABCMeta
1741
1742  IdWeightPair = collections.namedtuple(  # pylint: disable=invalid-name
1743      'IdWeightPair', ['id_tensor', 'weight_tensor'])
1744
1745  @abc.abstractproperty
1746  def _num_buckets(self):
1747    """Returns number of buckets in this sparse feature."""
1748    pass
1749
1750  @abc.abstractmethod
1751  def _get_sparse_tensors(self,
1752                          inputs,
1753                          weight_collections=None,
1754                          trainable=None):
1755    """Returns an IdWeightPair.
1756
1757    `IdWeightPair` is a pair of `SparseTensor`s which represents ids and
1758    weights.
1759
    `IdWeightPair.id_tensor` is typically a `batch_size` x `num_buckets`
    `SparseTensor` of `int64`. `IdWeightPair.weight_tensor` is either a
    `SparseTensor` of `float` or `None` to indicate all weights should be
    taken to be 1. If specified, `weight_tensor` must have exactly the same
    shape and indices as `id_tensor`. The expected `SparseTensor` is the same
    as the parsing output of a `VarLenFeature`, which is a ragged matrix.
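
    A hedged illustration: for a batch of two examples whose ids are [2] and
    [0, 1], `id_tensor` would be:

    ```python
    # SparseTensor(indices=[[0, 0], [1, 0], [1, 1]], values=[2, 0, 1],
    #              dense_shape=[2, 2])
    ```

    and `weight_tensor` would be `None`, meaning all weights are 1.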
1766
1767    Args:
      inputs: A `_LazyBuilder` as a cache to get input tensors required to
        create the `IdWeightPair`.
      weight_collections: List of graph collections to which variables (if any
        are created) are added.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see ${tf.get_variable}).
1774    """
1775    pass
1776
1777
1778def _create_categorical_column_weighted_sum(
1779    column, builder, units, sparse_combiner, weight_collections, trainable):
1780  """Create a weighted sum of a categorical column for linear_model."""
1781  sparse_tensors = column._get_sparse_tensors(  # pylint: disable=protected-access
1782      builder,
1783      weight_collections=weight_collections,
1784      trainable=trainable)
1785  id_tensor = sparse_ops.sparse_reshape(sparse_tensors.id_tensor, [
1786      array_ops.shape(sparse_tensors.id_tensor)[0], -1
1787  ])
1788  weight_tensor = sparse_tensors.weight_tensor
1789  if weight_tensor is not None:
1790    weight_tensor = sparse_ops.sparse_reshape(
1791        weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
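  # Both ids and weights have been reshaped to [batch_size, ?] so that each
  # example's entries form a single row for the embedding lookup below.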
1792
1793  weight = variable_scope.get_variable(
1794      name='weights',
1795      shape=(column._num_buckets, units),  # pylint: disable=protected-access
1796      initializer=init_ops.zeros_initializer(),
1797      trainable=trainable,
1798      collections=weight_collections)
1799  return _safe_embedding_lookup_sparse(
1800      weight,
1801      id_tensor,
1802      sparse_weights=weight_tensor,
1803      combiner=sparse_combiner,
1804      name='weighted_sum')
1805
1806
1807class _LazyBuilder(object):
1808  """Handles caching of transformations while building the model.
1809
1810  `_FeatureColumn` specifies how to digest an input column to the network. Some
1811  feature columns require data transformations. This class caches those
1812  transformations.
1813
  Some features may be used in more than one place. For example, one can use a
  bucketized feature by itself and also in a cross with it. In that case we
  should create only one bucketization op instead of creating ops for each
  feature column separately. To handle re-use of transformed columns,
  `_LazyBuilder` caches all previously transformed columns.
1819
1820  Example:
1821  We're trying to use the following `_FeatureColumn`s:
1822
1823  ```python
  bucketized_age = fc.bucketized_column(fc.numeric_column("age"), ...)
  keywords = fc.categorical_column_with_hash_bucket("keywords", ...)
  age_X_keywords = fc.crossed_column([bucketized_age, "keywords"], ...)
  ... = linear_model(features,
                     [bucketized_age, keywords, age_X_keywords])
1829  ```
1830
  If we transform each column independently, then we'll get duplication of
  bucketization (one for the cross, one for the bucketized column itself).
1833  The `_LazyBuilder` eliminates this duplication.
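
  Usage sketch (hedged; the variable names reuse the example above):

  ```python
  builder = _LazyBuilder(features)
  bucketized_age_tensor = builder.get(bucketized_age)  # transformed and cached
  cross_tensor = builder.get(age_X_keywords)  # reuses the cached bucketization
  ```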
1834  """
1835
1836  def __init__(self, features):
1837    """Creates a `_LazyBuilder`.
1838
1839    Args:
      features: A mapping from feature keys or `_FeatureColumn`s to objects
        that are `Tensor` or `SparseTensor`, or can be converted to same via
        `sparse_tensor.convert_to_tensor_or_sparse_tensor`. A `string` key
        signifies a base feature (not transformed). A `_FeatureColumn` key
        means that this `Tensor` is the output of an existing `_FeatureColumn`
        which can be reused.
1846    """
1847    self._features = features.copy()
1848    self._feature_tensors = {}
1849
1850  def get(self, key):
1851    """Returns a `Tensor` for the given key.
1852
    A `str` key is used to access a base feature (not transformed). When a
    `_FeatureColumn` is passed, the transformed feature is returned if it
    already exists; otherwise the given `_FeatureColumn` is asked to provide its
    transformed output, which is then cached.
1857
1858    Args:
1859      key: a `str` or a `_FeatureColumn`.
1860
1861    Returns:
1862      The transformed `Tensor` corresponding to the `key`.
1863
1864    Raises:
1865      ValueError: if key is not found or a transformed `Tensor` cannot be
1866        computed.
1867    """
1868    if key in self._feature_tensors:
1869      # FeatureColumn is already transformed or converted.
1870      return self._feature_tensors[key]
1871
1872    if key in self._features:
1873      feature_tensor = self._get_raw_feature_as_tensor(key)
1874      self._feature_tensors[key] = feature_tensor
1875      return feature_tensor
1876
1877    if not isinstance(key, (str, _FeatureColumn)):
1878      raise TypeError('"key" must be either a "str" or "_FeatureColumn". '
1879                      'Provided: {}'.format(key))
1880
1881    if not isinstance(key, _FeatureColumn):
1882      raise ValueError('Feature {} is not in features dictionary.'.format(key))
1883
1884    column = key
1885    logging.debug('Transforming feature_column %s.', column)
1886    transformed = column._transform_feature(self)  # pylint: disable=protected-access
1887    if transformed is None:
1888      raise ValueError('Column {} is not supported.'.format(column.name))
1889    self._feature_tensors[column] = transformed
1890    return transformed
1891
1892  def _get_raw_feature_as_tensor(self, key):
1893    """Gets the raw_feature (keyed by `key`) as `tensor`.
1894
    The raw feature is converted to a (sparse) tensor, and its rank may be
    expanded.

    For both `Tensor` and `SparseTensor`, the rank will be expanded to 2 if
    the rank is 1. Dynamic rank is also supported. A rank-0 raw feature is not
    supported and will raise an error.
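
    For example (a hedged sketch):

    ```python
    # A rank-1 dense feature [1, 2, 3] is expanded to rank 2:
    # [[1], [2], [3]]  (shape [3, 1])
    ```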
1900
1901    Args:
1902      key: A `str` key to access the raw feature.
1903
1904    Returns:
1905      A `Tensor` or `SparseTensor`.
1906
1907    Raises:
1908      ValueError: if the raw feature has rank 0.
1909    """
1910    raw_feature = self._features[key]
1911    feature_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
1912        raw_feature)
1913
1914    def expand_dims(input_tensor):
1915      # Input_tensor must have rank 1.
1916      if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
1917        return sparse_ops.sparse_reshape(
1918            input_tensor, [array_ops.shape(input_tensor)[0], -1])
1919      else:
1920        return array_ops.expand_dims(input_tensor, -1)
1921
1922    rank = feature_tensor.get_shape().ndims
1923    if rank is not None:
1924      if rank == 0:
        raise ValueError(
            'Feature (key: {}) cannot have rank 0. Given: {}'.format(
                key, feature_tensor))
1928      return feature_tensor if rank != 1 else expand_dims(feature_tensor)
1929
1930    # Handle dynamic rank.
1931    with ops.control_dependencies([
1932        check_ops.assert_positive(
1933            array_ops.rank(feature_tensor),
1934            message='Feature (key: {}) cannot have rank 0. Given: {}'.format(
1935                key, feature_tensor))]):
1936      return control_flow_ops.cond(
1937          math_ops.equal(1, array_ops.rank(feature_tensor)),
1938          lambda: expand_dims(feature_tensor),
1939          lambda: feature_tensor)
1940
1941
1942# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
1943def _shape_offsets(shape):
1944  """Returns moving offset for each dimension given shape."""
1945  offsets = []
1946  for dim in reversed(shape):
1947    if offsets:
1948      offsets.append(dim * offsets[-1])
1949    else:
1950      offsets.append(dim)
1951  offsets.reverse()
1952  return offsets
1953
1954
1955# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
1956def _to_sparse_input(input_tensor, ignore_value=None):
1957  """Converts a `Tensor` to a `SparseTensor`, dropping ignore_value cells.
1958
1959  If `input_tensor` is already a `SparseTensor`, just return it.
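
  For example (a hedged sketch with the default `ignore_value` for strings):

  ```python
  # Dense input [['a', ''], ['b', 'c']] becomes:
  # SparseTensor(indices=[[0, 0], [1, 0], [1, 1]],
  #              values=['a', 'b', 'c'], dense_shape=[2, 2])
  ```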
1960
1961  Args:
1962    input_tensor: A string or integer `Tensor`.
    ignore_value: Entries in `input_tensor` equal to this value will be
      absent from the resulting `SparseTensor`. If `None`, the default value of
      `input_tensor`'s dtype will be used ('' for `str`, -1 for `int`).
1966
1967  Returns:
1968    A `SparseTensor` with the same shape as `input_tensor`.
1969
1970  Raises:
1971    ValueError: when `input_tensor`'s rank is `None`.
1972  """
1973  input_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
1974      input_tensor)
1975  if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
1976    return input_tensor
1977  with ops.name_scope(None, 'to_sparse_input', (input_tensor, ignore_value,)):
1978    if ignore_value is None:
1979      if input_tensor.dtype == dtypes.string:
        # Strings are special-cased: `as_numpy_dtype` for a string dtype is
        # `object`, which would not yield a usable default, so use ''.
1981        ignore_value = ''
1982      elif input_tensor.dtype.is_integer:
1983        ignore_value = -1  # -1 has a special meaning of missing feature
1984      else:
1985        # NOTE: `as_numpy_dtype` is a property, so with the parentheses this is
1986        # constructing a new numpy object of the given type, which yields the
1987        # default value for that type.
1988        ignore_value = input_tensor.dtype.as_numpy_dtype()
1989    ignore_value = math_ops.cast(
1990        ignore_value, input_tensor.dtype, name='ignore_value')
1991    indices = array_ops.where(
1992        math_ops.not_equal(input_tensor, ignore_value), name='indices')
1993    return sparse_tensor_lib.SparseTensor(
1994        indices=indices,
1995        values=array_ops.gather_nd(input_tensor, indices, name='values'),
1996        dense_shape=array_ops.shape(
1997            input_tensor, out_type=dtypes.int64, name='dense_shape'))
1998
1999
2000def _clean_feature_columns(feature_columns):
2001  """Verifies and normalizes `feature_columns` input."""
2002  if isinstance(feature_columns, _FeatureColumn):
2003    feature_columns = [feature_columns]
2004
2005  if isinstance(feature_columns, collections.Iterator):
2006    feature_columns = list(feature_columns)
2007
2008  if isinstance(feature_columns, dict):
2009    raise ValueError('Expected feature_columns to be iterable, found dict.')
2010
2011  for column in feature_columns:
2012    if not isinstance(column, _FeatureColumn):
      raise ValueError('Each item of feature_columns must be a _FeatureColumn. '
                       'Given (type {}): {}.'.format(type(column), column))
2015  if not feature_columns:
2016    raise ValueError('feature_columns must not be empty.')
2017  name_to_column = dict()
2018  for column in feature_columns:
2019    if column.name in name_to_column:
      raise ValueError('Duplicate feature column name found for columns: {} '
                       'and {}. This usually means that these columns refer to '
                       'the same base feature. Either one must be discarded or '
                       'a duplicated but renamed item must be inserted in the '
                       'features dict.'.format(column,
                                               name_to_column[column.name]))
2026    name_to_column[column.name] = column
2027
2028  return feature_columns
2029
2030
2031class _NumericColumn(_DenseColumn,
2032                     collections.namedtuple('_NumericColumn', [
2033                         'key', 'shape', 'default_value', 'dtype',
2034                         'normalizer_fn'
2035                     ])):
2036  """see `numeric_column`."""
2037
2038  @property
2039  def name(self):
2040    return self.key
2041
2042  @property
2043  def _parse_example_spec(self):
2044    return {
2045        self.key:
2046            parsing_ops.FixedLenFeature(self.shape, self.dtype,
2047                                        self.default_value)
2048    }
2049
2050  def _transform_feature(self, inputs):
2051    input_tensor = inputs.get(self.key)
2052    if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
      raise ValueError(
          'The corresponding Tensor of a numeric column must be a dense '
          'Tensor. SparseTensor is not supported. key: {}'.format(self.key))
2056    if self.normalizer_fn is not None:
2057      input_tensor = self.normalizer_fn(input_tensor)
2058    return math_ops.to_float(input_tensor)
2059
2060  @property
2061  def _variable_shape(self):
2062    return tensor_shape.TensorShape(self.shape)
2063
2064  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2065    """Returns dense `Tensor` representing numeric feature.
2066
2067    Args:
2068      inputs: A `_LazyBuilder` object to access inputs.
2069      weight_collections: Unused `weight_collections` since no variables are
2070        created in this function.
2071      trainable: Unused `trainable` bool since no variables are created in
2072        this function.
2073
2074    Returns:
2075      Dense `Tensor` created within `_transform_feature`.
2076    """
2077    # Do nothing with weight_collections and trainable since no variables are
2078    # created in this function.
2079    del weight_collections
2080    del trainable
2081    # Feature has been already transformed. Return the intermediate
2082    # representation created by _transform_feature.
2083    return inputs.get(self)
2084
2085
2086class _BucketizedColumn(_DenseColumn, _CategoricalColumn,
2087                        collections.namedtuple('_BucketizedColumn', [
2088                            'source_column', 'boundaries'])):
2089  """See `bucketized_column`."""
2090
2091  @property
2092  def name(self):
2093    return '{}_bucketized'.format(self.source_column.name)
2094
2095  @property
2096  def _parse_example_spec(self):
2097    return self.source_column._parse_example_spec  # pylint: disable=protected-access
2098
2099  def _transform_feature(self, inputs):
2100    source_tensor = inputs.get(self.source_column)
2101    return math_ops._bucketize(  # pylint: disable=protected-access
2102        source_tensor,
2103        boundaries=self.boundaries)
2104
2105  @property
2106  def _variable_shape(self):
2107    return tensor_shape.TensorShape(
2108        tuple(self.source_column.shape) + (len(self.boundaries) + 1,))
2109
2110  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2111    del weight_collections
2112    del trainable
2113    input_tensor = inputs.get(self)
2114    return array_ops.one_hot(
2115        indices=math_ops.to_int64(input_tensor),
2116        depth=len(self.boundaries) + 1,
2117        on_value=1.,
2118        off_value=0.)
2119
2120  @property
2121  def _num_buckets(self):
2122    # By construction, source_column is always one-dimensional.
2123    return (len(self.boundaries) + 1) * self.source_column.shape[0]
2124
2125  def _get_sparse_tensors(self, inputs, weight_collections=None,
2126                          trainable=None):
2127    input_tensor = inputs.get(self)
2128    batch_size = array_ops.shape(input_tensor)[0]
2129    # By construction, source_column is always one-dimensional.
2130    source_dimension = self.source_column.shape[0]
2131
2132    i1 = array_ops.reshape(
2133        array_ops.tile(
2134            array_ops.expand_dims(math_ops.range(0, batch_size), 1),
2135            [1, source_dimension]),
2136        (-1,))
2137    i2 = array_ops.tile(math_ops.range(0, source_dimension), [batch_size])
    # Flatten the bucket indices and offset them so they are unique across
    # dimensions, e.g. with k buckets, indices of the 2nd dimension range
    # from k to 2*k-1.
2140    bucket_indices = (
2141        array_ops.reshape(input_tensor, (-1,)) +
2142        (len(self.boundaries) + 1) * i2)
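    # Example (illustrative): with batch_size=2, source_dimension=2 and
    # 3 boundaries (4 buckets), i1=[0, 0, 1, 1], i2=[0, 1, 0, 1], and a raw
    # bucket index b in the 2nd source dimension becomes b + 4.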
2143
2144    indices = math_ops.to_int64(array_ops.transpose(array_ops.stack((i1, i2))))
2145    dense_shape = math_ops.to_int64(array_ops.stack(
2146        [batch_size, source_dimension]))
2147    sparse_tensor = sparse_tensor_lib.SparseTensor(
2148        indices=indices,
2149        values=bucket_indices,
2150        dense_shape=dense_shape)
2151    return _CategoricalColumn.IdWeightPair(sparse_tensor, None)
2152
2153
2154class _EmbeddingColumn(
2155    _DenseColumn,
2156    collections.namedtuple('_EmbeddingColumn', (
2157        'categorical_column', 'dimension', 'combiner', 'initializer',
2158        'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'
2159    ))):
2160  """See `embedding_column`."""
2161
2162  @property
2163  def name(self):
2164    if not hasattr(self, '_name'):
2165      self._name = '{}_embedding'.format(self.categorical_column.name)
2166    return self._name
2167
2168  @property
2169  def _parse_example_spec(self):
2170    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
2171
2172  def _transform_feature(self, inputs):
2173    return inputs.get(self.categorical_column)
2174
2175  @property
2176  def _variable_shape(self):
2177    if not hasattr(self, '_shape'):
2178      self._shape = tensor_shape.vector(self.dimension)
2179    return self._shape
2180
2181  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2182    # Get sparse IDs and weights.
2183    sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
2184        inputs, weight_collections=weight_collections, trainable=trainable)
2185    sparse_ids = sparse_tensors.id_tensor
2186    sparse_weights = sparse_tensors.weight_tensor
2187
2188    embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
2189    embedding_weights = variable_scope.get_variable(
2190        name='embedding_weights',
2191        shape=embedding_shape,
2192        dtype=dtypes.float32,
2193        initializer=self.initializer,
2194        trainable=self.trainable and trainable,
2195        collections=weight_collections)
2196    if self.ckpt_to_load_from is not None:
2197      to_restore = embedding_weights
2198      if isinstance(to_restore, variables.PartitionedVariable):
2199        to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
2200      checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, {
2201          self.tensor_name_in_ckpt: to_restore
2202      })
2203
2204    # Return embedding lookup result.
2205    return _safe_embedding_lookup_sparse(
2206        embedding_weights=embedding_weights,
2207        sparse_ids=sparse_ids,
2208        sparse_weights=sparse_weights,
2209        combiner=self.combiner,
2210        name='%s_weights' % self.name,
2211        max_norm=self.max_norm)
2212
2213
2214class _SharedEmbeddingColumn(
2215    _DenseColumn,
2216    collections.namedtuple('_SharedEmbeddingColumn', (
2217        'categorical_column', 'dimension', 'combiner', 'initializer',
2218        'shared_embedding_collection_name', 'ckpt_to_load_from',
2219        'tensor_name_in_ckpt', 'max_norm', 'trainable'
2220    ))):
2221  """See `embedding_column`."""
2222
2223  @property
2224  def name(self):
2225    if not hasattr(self, '_name'):
2226      self._name = '{}_shared_embedding'.format(self.categorical_column.name)
2227    return self._name
2228
2229  @property
2230  def _var_scope_name(self):
2231    return self.shared_embedding_collection_name
2232
2233  @property
2234  def _parse_example_spec(self):
2235    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
2236
2237  def _transform_feature(self, inputs):
2238    return inputs.get(self.categorical_column)
2239
2240  @property
2241  def _variable_shape(self):
2242    if not hasattr(self, '_shape'):
2243      self._shape = tensor_shape.vector(self.dimension)
2244    return self._shape
2245
2246  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2247    # This method is called from a variable_scope with name _var_scope_name,
2248    # which is shared among all shared embeddings. Open a name_scope here, so
2249    # that the ops for different columns have distinct names.
2250    with ops.name_scope(None, default_name=self.name):
2251      # Get sparse IDs and weights.
2252      sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
2253          inputs, weight_collections=weight_collections, trainable=trainable)
2254      sparse_ids = sparse_tensors.id_tensor
2255      sparse_weights = sparse_tensors.weight_tensor
2256
2257      embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
2258      shared_embedding_collection = ops.get_collection(
2259          self.shared_embedding_collection_name)
2260      if shared_embedding_collection:
2261        if len(shared_embedding_collection) > 1:
2262          raise ValueError(
2263              'Collection {} can only contain one variable. '
2264              'Suggested fix A: Choose a unique name for this collection. '
2265              'Suggested fix B: Do not add any variables to this collection. '
2266              'The feature_column library already adds a variable under the '
2267              'hood.'.format(shared_embedding_collection))
2268        embedding_weights = shared_embedding_collection[0]
2269        if embedding_weights.get_shape() != embedding_shape:
2270          raise ValueError(
2271              'Shared embedding collection {} contains variable {} of '
2272              'unexpected shape {}. Expected shape is {}. '
2273              'Suggested fix A: Choose a unique name for this collection. '
2274              'Suggested fix B: Do not add any variables to this collection. '
2275              'The feature_column library already adds a variable under the '
2276              'hood.'.format(
2277                  self.shared_embedding_collection_name, embedding_weights.name,
2278                  embedding_weights.get_shape(), embedding_shape))
2279      else:
2280        embedding_weights = variable_scope.get_variable(
2281            name='embedding_weights',
2282            shape=embedding_shape,
2283            dtype=dtypes.float32,
2284            initializer=self.initializer,
2285            trainable=self.trainable and trainable,
2286            collections=weight_collections)
2287        ops.add_to_collection(
2288            self.shared_embedding_collection_name, embedding_weights)
2289      if self.ckpt_to_load_from is not None:
2290        to_restore = embedding_weights
2291        if isinstance(to_restore, variables.PartitionedVariable):
2292          to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
2293        checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, {
2294            self.tensor_name_in_ckpt: to_restore
2295        })
2296
2297      # Return embedding lookup result.
2298      return _safe_embedding_lookup_sparse(
2299          embedding_weights=embedding_weights,
2300          sparse_ids=sparse_ids,
2301          sparse_weights=sparse_weights,
2302          combiner=self.combiner,
2303          name='%s_weights' % self.name,
2304          max_norm=self.max_norm)
2305
2306
2307def _create_tuple(shape, value):
2308  """Returns a tuple with given shape and filled with value."""
2309  if shape:
2310    return tuple([_create_tuple(shape[1:], value) for _ in range(shape[0])])
2311  return value
2312
2313
2314def _as_tuple(value):
2315  if not nest.is_sequence(value):
2316    return value
2317  return tuple([_as_tuple(v) for v in value])
2318
2319
2320def _check_shape(shape, key):
2321  """Returns shape if it's valid, raises error otherwise."""
2322  assert shape is not None
2323  if not nest.is_sequence(shape):
2324    shape = [shape]
2325  shape = tuple(shape)
2326  for dimension in shape:
2327    if not isinstance(dimension, int):
2328      raise TypeError('shape dimensions must be integer. '
2329                      'shape: {}, key: {}'.format(shape, key))
2330    if dimension < 1:
2331      raise ValueError('shape dimensions must be greater than 0. '
2332                       'shape: {}, key: {}'.format(shape, key))
2333  return shape
2334
2335
2336def _is_shape_and_default_value_compatible(default_value, shape):
2337  """Verifies compatibility of shape and default_value."""
  # Invalid conditions:
  #  * default_value is a sequence but shape is empty
  #  * default_value is a scalar but shape is not empty
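  # E.g., default_value [[1, 2]] is compatible with shape (1, 2), while
  # [1, 2] is not (its nesting depth does not match the shape length).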
2341  if nest.is_sequence(default_value) != bool(shape):
2342    return False
2343  if not shape:
2344    return True
2345  if len(default_value) != shape[0]:
2346    return False
2347  for i in range(shape[0]):
2348    if not _is_shape_and_default_value_compatible(default_value[i], shape[1:]):
2349      return False
2350  return True
2351
2352
2353def _check_default_value(shape, default_value, dtype, key):
2354  """Returns default value as tuple if it's valid, otherwise raises errors.
2355
2356  This function verifies that `default_value` is compatible with both `shape`
2357  and `dtype`. If it is not compatible, it raises an error. If it is compatible,
2358  it casts default_value to a tuple and returns it. `key` is used only
2359  for error message.
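
  For example (a hedged sketch), `_check_default_value((2,), [1., 2.],
  dtypes.float32, 'price')` returns `(1.0, 2.0)`, while `default_value=[1.]`
  with the same shape raises a `ValueError`.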
2360
2361  Args:
    shape: An iterable of integers specifying the shape of the `Tensor`.
2363    default_value: If a single value is provided, the same value will be applied
2364      as the default value for every item. If an iterable of values is
2365      provided, the shape of the `default_value` should be equal to the given
2366      `shape`.
2367    dtype: defines the type of values. Default value is `tf.float32`. Must be a
2368      non-quantized, real integer or floating point type.
2369    key: Column name, used only for error messages.
2370
2371  Returns:
2372    A tuple which will be used as default value.
2373
2374  Raises:
    ValueError: if `default_value` is an iterable but not compatible with
      `shape`.
    TypeError: if `default_value` is not compatible with `dtype`.
2378  """
2379  if default_value is None:
2380    return None
2381
2382  if isinstance(default_value, int):
2383    return _create_tuple(shape, default_value)
2384
2385  if isinstance(default_value, float) and dtype.is_floating:
2386    return _create_tuple(shape, default_value)
2387
2388  if callable(getattr(default_value, 'tolist', None)):  # Handles numpy arrays
2389    default_value = default_value.tolist()
2390
2391  if nest.is_sequence(default_value):
2392    if not _is_shape_and_default_value_compatible(default_value, shape):
2393      raise ValueError(
2394          'The shape of default_value must be equal to given shape. '
2395          'default_value: {}, shape: {}, key: {}'.format(
2396              default_value, shape, key))
2397    # Check if the values in the list are all integers or are convertible to
2398    # floats.
2399    is_list_all_int = all(
2400        isinstance(v, int) for v in nest.flatten(default_value))
2401    is_list_has_float = any(
2402        isinstance(v, float) for v in nest.flatten(default_value))
2403    if is_list_all_int:
2404      return _as_tuple(default_value)
2405    if is_list_has_float and dtype.is_floating:
2406      return _as_tuple(default_value)
2407  raise TypeError('default_value must be compatible with dtype. '
2408                  'default_value: {}, dtype: {}, key: {}'.format(
2409                      default_value, dtype, key))
2410
2411
2412class _HashedCategoricalColumn(
2413    _CategoricalColumn,
2414    collections.namedtuple('_HashedCategoricalColumn',
2415                           ['key', 'hash_bucket_size', 'dtype'])):
2416  """see `categorical_column_with_hash_bucket`."""
2417
2418  @property
2419  def name(self):
2420    return self.key
2421
2422  @property
2423  def _parse_example_spec(self):
2424    return {self.key: parsing_ops.VarLenFeature(self.dtype)}
2425
2426  def _transform_feature(self, inputs):
2427    input_tensor = _to_sparse_input(inputs.get(self.key))
2428    if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
      raise ValueError(
          'categorical_column_with_hash_bucket input must be a SparseTensor.')
2430
2431    _assert_string_or_int(
2432        input_tensor.dtype,
2433        prefix='column_name: {} input_tensor'.format(self.key))
2434
2435    if self.dtype.is_integer != input_tensor.dtype.is_integer:
2436      raise ValueError(
2437          'Column dtype and SparseTensors dtype must be compatible. '
2438          'key: {}, column dtype: {}, tensor dtype: {}'.format(
2439              self.key, self.dtype, input_tensor.dtype))
2440
2441    if self.dtype == dtypes.string:
2442      sparse_values = input_tensor.values
2443    else:
2444      sparse_values = string_ops.as_string(input_tensor.values)
2445
2446    sparse_id_values = string_ops.string_to_hash_bucket_fast(
2447        sparse_values, self.hash_bucket_size, name='lookup')
2448    return sparse_tensor_lib.SparseTensor(
2449        input_tensor.indices, sparse_id_values, input_tensor.dense_shape)
2450
2451  @property
2452  def _num_buckets(self):
2453    """Returns number of buckets in this sparse feature."""
2454    return self.hash_bucket_size
2455
2456  def _get_sparse_tensors(self, inputs, weight_collections=None,
2457                          trainable=None):
2458    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
2459
2460
2461class _VocabularyFileCategoricalColumn(
2462    _CategoricalColumn,
2463    collections.namedtuple('_VocabularyFileCategoricalColumn', (
2464        'key', 'vocabulary_file', 'vocabulary_size', 'num_oov_buckets', 'dtype',
2465        'default_value'
2466    ))):
2467  """See `categorical_column_with_vocabulary_file`."""
2468
2469  @property
2470  def name(self):
2471    return self.key
2472
2473  @property
2474  def _parse_example_spec(self):
2475    return {self.key: parsing_ops.VarLenFeature(self.dtype)}
2476
2477  def _transform_feature(self, inputs):
2478    input_tensor = _to_sparse_input(inputs.get(self.key))
2479
2480    if self.dtype.is_integer != input_tensor.dtype.is_integer:
2481      raise ValueError(
2482          'Column dtype and SparseTensors dtype must be compatible. '
2483          'key: {}, column dtype: {}, tensor dtype: {}'.format(
2484              self.key, self.dtype, input_tensor.dtype))
2485
2486    _assert_string_or_int(
2487        input_tensor.dtype,
2488        prefix='column_name: {} input_tensor'.format(self.key))
2489
2490    key_dtype = self.dtype
2491    if input_tensor.dtype.is_integer:
2492      # `index_table_from_file` requires 64-bit integer keys.
2493      key_dtype = dtypes.int64
2494      input_tensor = math_ops.to_int64(input_tensor)
2495
2496    return lookup_ops.index_table_from_file(
2497        vocabulary_file=self.vocabulary_file,
2498        num_oov_buckets=self.num_oov_buckets,
2499        vocab_size=self.vocabulary_size,
2500        default_value=self.default_value,
2501        key_dtype=key_dtype,
2502        name='{}_lookup'.format(self.key)).lookup(input_tensor)
2503
2504  @property
2505  def _num_buckets(self):
2506    """Returns number of buckets in this sparse feature."""
2507    return self.vocabulary_size + self.num_oov_buckets
2508
2509  def _get_sparse_tensors(
2510      self, inputs, weight_collections=None, trainable=None):
2511    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
2512
2513
2514class _VocabularyListCategoricalColumn(
2515    _CategoricalColumn,
2516    collections.namedtuple('_VocabularyListCategoricalColumn', (
2517        'key', 'vocabulary_list', 'dtype', 'default_value', 'num_oov_buckets'
2518    ))):
2519  """See `categorical_column_with_vocabulary_list`."""
2520
2521  @property
2522  def name(self):
2523    return self.key
2524
2525  @property
2526  def _parse_example_spec(self):
2527    return {self.key: parsing_ops.VarLenFeature(self.dtype)}
2528
2529  def _transform_feature(self, inputs):
2530    input_tensor = _to_sparse_input(inputs.get(self.key))
2531
2532    if self.dtype.is_integer != input_tensor.dtype.is_integer:
2533      raise ValueError(
2534          'Column dtype and SparseTensors dtype must be compatible. '
2535          'key: {}, column dtype: {}, tensor dtype: {}'.format(
2536              self.key, self.dtype, input_tensor.dtype))
2537
2538    _assert_string_or_int(
2539        input_tensor.dtype,
2540        prefix='column_name: {} input_tensor'.format(self.key))
2541
2542    key_dtype = self.dtype
2543    if input_tensor.dtype.is_integer:
2544      # `index_table_from_tensor` requires 64-bit integer keys.
2545      key_dtype = dtypes.int64
2546      input_tensor = math_ops.to_int64(input_tensor)
2547
2548    return lookup_ops.index_table_from_tensor(
2549        vocabulary_list=tuple(self.vocabulary_list),
2550        default_value=self.default_value,
2551        num_oov_buckets=self.num_oov_buckets,
2552        dtype=key_dtype,
2553        name='{}_lookup'.format(self.key)).lookup(input_tensor)
2554
2555  @property
2556  def _num_buckets(self):
2557    """Returns number of buckets in this sparse feature."""
2558    return len(self.vocabulary_list) + self.num_oov_buckets
2559
2560  def _get_sparse_tensors(
2561      self, inputs, weight_collections=None, trainable=None):
2562    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
2563
2564
2565class _IdentityCategoricalColumn(
2566    _CategoricalColumn,
2567    collections.namedtuple('_IdentityCategoricalColumn', (
2568        'key', 'num_buckets', 'default_value'
2569    ))):
2571  """See `categorical_column_with_identity`."""
2572
2573  @property
2574  def name(self):
2575    return self.key
2576
2577  @property
2578  def _parse_example_spec(self):
2579    return {self.key: parsing_ops.VarLenFeature(dtypes.int64)}
2580
2581  def _transform_feature(self, inputs):
2582    input_tensor = _to_sparse_input(inputs.get(self.key))
2583
2584    if not input_tensor.dtype.is_integer:
2585      raise ValueError(
2586          'Invalid input, not integer. key: {} dtype: {}'.format(
2587              self.key, input_tensor.dtype))
2588
2589    values = math_ops.to_int64(input_tensor.values, name='values')
2590    num_buckets = math_ops.to_int64(self.num_buckets, name='num_buckets')
2591    zero = math_ops.to_int64(0, name='zero')
2592    if self.default_value is None:
2593      # Fail if values are out-of-range.
2594      assert_less = check_ops.assert_less(
2595          values, num_buckets, data=(values, num_buckets),
2596          name='assert_less_than_num_buckets')
2597      assert_greater = check_ops.assert_greater_equal(
2598          values, zero, data=(values,),
2599          name='assert_greater_or_equal_0')
2600      with ops.control_dependencies((assert_less, assert_greater)):
2601        values = array_ops.identity(values)
2602    else:
2603      # Assign default for out-of-range values.
2604      values = array_ops.where(
2605          math_ops.logical_or(
2606              values < zero, values >= num_buckets, name='out_of_range'),
2607          array_ops.fill(
2608              dims=array_ops.shape(values),
2609              value=math_ops.to_int64(self.default_value),
2610              name='default_values'),
2611          values)
2612
2613    return sparse_tensor_lib.SparseTensor(
2614        indices=input_tensor.indices,
2615        values=values,
2616        dense_shape=input_tensor.dense_shape)
2617
2618  @property
2619  def _num_buckets(self):
2620    """Returns number of buckets in this sparse feature."""
2621    return self.num_buckets
2622
2623  def _get_sparse_tensors(
2624      self, inputs, weight_collections=None, trainable=None):
2625    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
2626
2627
2628class _WeightedCategoricalColumn(
2629    _CategoricalColumn,
2630    collections.namedtuple('_WeightedCategoricalColumn', (
2631        'categorical_column', 'weight_feature_key', 'dtype'
2632    ))):
2633  """See `weighted_categorical_column`."""

  @property
  def name(self):
    return '{}_weighted_by_{}'.format(
        self.categorical_column.name, self.weight_feature_key)

  @property
  def _parse_example_spec(self):
    config = self.categorical_column._parse_example_spec  # pylint: disable=protected-access
    if self.weight_feature_key in config:
      raise ValueError('Parse config {} already exists for {}.'.format(
          config[self.weight_feature_key], self.weight_feature_key))
    config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype)
    return config

  @property
  def _num_buckets(self):
    return self.categorical_column._num_buckets  # pylint: disable=protected-access

  def _transform_feature(self, inputs):
    weight_tensor = inputs.get(self.weight_feature_key)
    if weight_tensor is None:
      raise ValueError('Missing weights {}.'.format(self.weight_feature_key))
    weight_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
        weight_tensor)
    if self.dtype != weight_tensor.dtype.base_dtype:
      raise ValueError('Bad dtype, expected {}, but got {}.'.format(
          self.dtype, weight_tensor.dtype))
    if not isinstance(weight_tensor, sparse_tensor_lib.SparseTensor):
      # The weight tensor can be a regular Tensor. In this case, sparsify it.
      weight_tensor = _to_sparse_input(weight_tensor, ignore_value=0.0)
    if not weight_tensor.dtype.is_floating:
      weight_tensor = math_ops.to_float(weight_tensor)
    return (inputs.get(self.categorical_column), weight_tensor)

  def _get_sparse_tensors(
      self, inputs, weight_collections=None, trainable=None):
    del weight_collections
    del trainable
    tensors = inputs.get(self)
    return _CategoricalColumn.IdWeightPair(tensors[0], tensors[1])


class _CrossedColumn(
    _CategoricalColumn,
    collections.namedtuple('_CrossedColumn',
                           ['keys', 'hash_bucket_size', 'hash_key'])):
  """See `crossed_column`."""
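  # Illustrative sketch via the public constructor ('dept' and 'zip' are
  # assumed feature keys): crossing the two string keys yields a column named
  # 'dept_X_zip', and each co-occurring (dept, zip) pair is hashed into one of
  # `hash_bucket_size` ids:
  #
  #   cross = crossed_column(['dept', 'zip'], hash_bucket_size=1000)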

  @property
  def name(self):
    feature_names = []
    for key in _collect_leaf_level_keys(self):
      if isinstance(key, _FeatureColumn):
        feature_names.append(key.name)
      else:  # key must be a string
        feature_names.append(key)
    return '_X_'.join(sorted(feature_names))

  @property
  def _parse_example_spec(self):
    config = {}
    for key in self.keys:
      if isinstance(key, _FeatureColumn):
        config.update(key._parse_example_spec)  # pylint: disable=protected-access
      else:  # key must be a string
        config.update({key: parsing_ops.VarLenFeature(dtypes.string)})
    return config

  def _transform_feature(self, inputs):
    feature_tensors = []
    for key in _collect_leaf_level_keys(self):
      if isinstance(key, six.string_types):
        feature_tensors.append(inputs.get(key))
      elif isinstance(key, _CategoricalColumn):
        ids_and_weights = key._get_sparse_tensors(inputs)  # pylint: disable=protected-access
        if ids_and_weights.weight_tensor is not None:
          raise ValueError(
              'crossed_column does not support weight_tensor, but the given '
              'column populates weight_tensor. '
              'Given column: {}'.format(key.name))
        feature_tensors.append(ids_and_weights.id_tensor)
      else:
        raise ValueError('Unsupported column type. Given: {}'.format(key))
    return sparse_ops._sparse_cross_hashed(  # pylint: disable=protected-access
        inputs=feature_tensors,
        num_buckets=self.hash_bucket_size,
        hash_key=self.hash_key)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return self.hash_bucket_size

  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)


def _collect_leaf_level_keys(cross):
  """Collects base keys by expanding all nested crosses.

  Args:
    cross: A `_CrossedColumn`.

  Returns:
    A list of strings or `_CategoricalColumn` instances.
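
  For example (illustrative), given a nested cross
  `crossed_column([crossed_column(['a', 'b'], 10), 'c'], 100)`, the collected
  leaf keys are `['a', 'b', 'c']`.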
  """
  leaf_level_keys = []
  for k in cross.keys:
    if isinstance(k, _CrossedColumn):
      leaf_level_keys.extend(_collect_leaf_level_keys(k))
    else:
      leaf_level_keys.append(k)
  return leaf_level_keys


# TODO(zakaria): Move this to embedding_ops and make it public.
def _safe_embedding_lookup_sparse(embedding_weights,
                                  sparse_ids,
                                  sparse_weights=None,
                                  combiner='mean',
                                  default_id=None,
                                  name=None,
                                  partition_strategy='div',
                                  max_norm=None):
  """Lookup embedding results, accounting for invalid IDs and empty features.

  The partitioned embedding tensors in `embedding_weights` must all have the
  same shape except for the first dimension. The first dimension is allowed to
  vary because the vocabulary size is not necessarily a multiple of `P`.
  `embedding_weights` may also be a `PartitionedVariable` as returned by
  `tf.get_variable()` with a partitioner.

  Invalid IDs (< 0) are pruned from input IDs and weights, as well as any IDs
  with non-positive weight. For an entry with no features, the embedding vector
  for `default_id` is returned, or the 0-vector if `default_id` is not supplied.

  The ids and weights may be multi-dimensional. Embeddings are always aggregated
  along the last dimension.

  Args:
    embedding_weights:  A list of `P` float `Tensor`s or values representing
        partitioned embedding `Tensor`s.  Alternatively, a `PartitionedVariable`
        created by partitioning along dimension 0.  The total unpartitioned
        shape should be `[e_0, e_1, ..., e_m]`, where `e_0` represents the
        vocab size and `e_1, ..., e_m` are the embedding dimensions.
    sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the
        ids. `d_0` is typically batch size.
    sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing
        float weights corresponding to `sparse_ids`, or `None` if all weights
        are assumed to be 1.0.
    combiner: A string specifying how to combine embedding results for each
        entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean"
        the default.
    default_id: The id to use for an entry with no features.
    name: A name for this operation (optional).
    partition_strategy: A string specifying the partitioning strategy.
        Currently `"div"` and `"mod"` are supported. Default is `"div"`.
    max_norm: If not `None`, all embeddings are l2-normalized to `max_norm`
        before combining.

  Returns:
    Dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`.

  Raises:
    ValueError: if `embedding_weights` is empty.
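
  Example (illustrative; assumes a vocabulary of size 10 and 4-d embeddings):

    params = tf.get_variable('embedding', shape=[10, 4])
    sparse_ids = tf.SparseTensor(
        indices=[[0, 0], [1, 0], [1, 1], [2, 0]],
        values=[2, 0, -1, 5], dense_shape=[4, 2])
    # The invalid id -1 is pruned; the empty row 3 maps to the 0-vector.
    embedded = _safe_embedding_lookup_sparse(params, sparse_ids)  # [4, 4]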
  """
  if embedding_weights is None:
    raise ValueError('Missing embedding_weights %s.' % embedding_weights)
  if isinstance(embedding_weights, variables.PartitionedVariable):
    embedding_weights = list(embedding_weights)  # get underlying Variables.
  if not isinstance(embedding_weights, list):
    embedding_weights = [embedding_weights]
  if len(embedding_weights) < 1:
    raise ValueError('Missing embedding_weights %s.' % embedding_weights)

  dtype = sparse_weights.dtype if sparse_weights is not None else None
  embedding_weights = [
      ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights
  ]

  with ops.name_scope(name, 'embedding_lookup',
                      embedding_weights + [sparse_ids,
                                           sparse_weights]) as scope:
    # Reshape higher-rank sparse ids and weights to linear segment ids.
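    # For example (illustrative): ids of dense_shape [d_0, d_1, d_2] are
    # reshaped to [d_0 * d_1, d_2], so that each row forms a single segment
    # to aggregate over.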
    original_shape = sparse_ids.dense_shape
    original_rank_dim = sparse_ids.dense_shape.get_shape()[0]
    original_rank = (
        array_ops.size(original_shape)
        if original_rank_dim.value is None
        else original_rank_dim.value)
    sparse_ids = sparse_ops.sparse_reshape(sparse_ids, [
        math_ops.reduce_prod(
            array_ops.slice(original_shape, [0], [original_rank - 1])),
        array_ops.gather(original_shape, original_rank - 1)])
    if sparse_weights is not None:
      sparse_weights = sparse_tensor_lib.SparseTensor(
          sparse_ids.indices,
          sparse_weights.values, sparse_ids.dense_shape)

    # Prune invalid ids and weights.
    sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights)

    # Fill in dummy values for empty features, if necessary.
    sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(
        sparse_ids, default_id or 0)
    if sparse_weights is not None:
      sparse_weights, _ = sparse_ops.sparse_fill_empty_rows(sparse_weights, 1.0)

    result = embedding_ops.embedding_lookup_sparse(
        embedding_weights,
        sparse_ids,
        sparse_weights,
        combiner=combiner,
        partition_strategy=partition_strategy,
        name=None if default_id is None else scope,
        max_norm=max_norm)

    if default_id is None:
      # Broadcast is_row_empty to the same shape as the lookup result, for use
      # in the `where` op below.
      is_row_empty = array_ops.tile(
          array_ops.reshape(is_row_empty, [-1, 1]),
          array_ops.stack([1, array_ops.shape(result)[1]]))

      result = array_ops.where(is_row_empty,
                               array_ops.zeros_like(result),
                               result,
                               name=scope)

    # Reshape from linear ids back into the higher-dimensional dense result.
    final_result = array_ops.reshape(
        result,
        array_ops.concat([
            array_ops.slice(
                math_ops.cast(original_shape, dtypes.int32), [0],
                [original_rank - 1]),
            array_ops.slice(array_ops.shape(result), [1], [-1])
        ], 0))
    final_result.set_shape(tensor_shape.unknown_shape(
        (original_rank_dim - 1).value).concatenate(result.get_shape()[1:]))
    return final_result


def _prune_invalid_ids(sparse_ids, sparse_weights):
  """Prunes invalid ids (< 0) from `sparse_ids` and `sparse_weights`.

  If `sparse_weights` is given, ids with non-positive weights are also pruned.
  """
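  # For example (illustrative): ids [-1, 2, 3] with weights [0.5, 0.0, 1.0]
  # retain only id 3, since id -1 is negative and id 2 has zero weight.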
  is_id_valid = math_ops.greater_equal(sparse_ids.values, 0)
  if sparse_weights is not None:
    is_id_valid = math_ops.logical_and(
        is_id_valid, math_ops.greater(sparse_weights.values, 0))
  sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_id_valid)
  if sparse_weights is not None:
    sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_id_valid)
  return sparse_ids, sparse_weights


class _IndicatorColumn(_DenseColumn,
                       collections.namedtuple('_IndicatorColumn',
                                              ['categorical_column'])):
  """Represents a one-hot column for use in deep networks.

  Args:
    categorical_column: A `_CategoricalColumn` created by a
      `categorical_column_with_*` function.
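
  Example (illustrative):

    ind = indicator_column(
        categorical_column_with_vocabulary_list('colors', ['R', 'G', 'B']))
    # A multivalent input such as ['R', 'B'] is represented as the
    # multi-hot vector [1., 0., 1.].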
  """

  @property
  def name(self):
    return '{}_indicator'.format(self.categorical_column.name)

  def _transform_feature(self, inputs):
    """Returns dense `Tensor` representing feature.

    Args:
      inputs: A `_LazyBuilder` object to access inputs.

    Returns:
      Transformed feature `Tensor`.

    Raises:
      ValueError: if input rank is not known at graph building time.
    """
    id_weight_pair = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    id_tensor = id_weight_pair.id_tensor
    weight_tensor = id_weight_pair.weight_tensor

    # If the underlying column is weighted, return the input as a dense tensor.
    if weight_tensor is not None:
      weighted_column = sparse_ops.sparse_merge(
          sp_ids=id_tensor,
          sp_values=weight_tensor,
          vocab_size=int(self._variable_shape[-1]))
      # Remove the (?, -1) index.
      weighted_column = sparse_ops.sparse_slice(weighted_column, [0, 0],
                                                weighted_column.dense_shape)
      return sparse_ops.sparse_tensor_to_dense(weighted_column)

    dense_id_tensor = sparse_ops.sparse_tensor_to_dense(
        id_tensor, default_value=-1)

    # The one-hot tensor must be float, since all other inputs to input_layer
    # are float32 and will be concatenated with it via tf.concat.
    one_hot_id_tensor = array_ops.one_hot(
        dense_id_tensor,
        depth=self._variable_shape[-1],
        on_value=1.0,
        off_value=0.0)

    # Reduce to get a multi-hot per example.
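    # For example (illustrative): ids [[0, 2]] with depth 3 become one-hots
    # [[[1., 0., 0.], [0., 0., 1.]]], which reduce to multi-hot [[1., 0., 1.]].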
    return math_ops.reduce_sum(one_hot_id_tensor, axis=[-2])

  @property
  def _parse_example_spec(self):
    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access

  @property
  def _variable_shape(self):
    """Returns a `TensorShape` representing the shape of the dense `Tensor`."""
    return tensor_shape.TensorShape([1, self.categorical_column._num_buckets])  # pylint: disable=protected-access

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    """Returns dense `Tensor` representing feature.

    Args:
      inputs: A `_LazyBuilder` object to access inputs.
      weight_collections: Unused `weight_collections` since no variables are
        created in this function.
      trainable: Unused `trainable` bool since no variables are created in
        this function.

    Returns:
      Dense `Tensor` created within `_transform_feature`.
    """
    # Do nothing with weight_collections and trainable since no variables are
    # created in this function.
    del weight_collections
    del trainable
    # The feature has already been transformed. Return the intermediate
    # representation created by _transform_feature.
    return inputs.get(self)


def _verify_static_batch_size_equality(tensors, columns):
  """Checks that all tensors have the same static (dim 0) batch size."""
  # batch_size is a tf.Dimension object.
  expected_batch_size = None
  for i in range(0, len(tensors)):
    if tensors[i].shape[0].value is not None:
      if expected_batch_size is None:
        batch_size_column_index = i
        expected_batch_size = tensors[i].shape[0]
      elif not expected_batch_size.is_compatible_with(tensors[i].shape[0]):
        raise ValueError(
            'Batch size (first dimension) of each feature must be the same. '
            'Batch size of columns ({}, {}): ({}, {})'.format(
                columns[batch_size_column_index].name, columns[i].name,
                expected_batch_size, tensors[i].shape[0]))