1# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""This API defines FeatureColumn abstraction.
16
17FeatureColumns provide a high level abstraction for ingesting and representing
18features in `Estimator` models.
19
20FeatureColumns are the primary way of encoding features for pre-canned
21`Estimator` models.
22
23When using FeatureColumns with `Estimator` models, the type of feature column
24you should choose depends on (1) the feature type and (2) the model type.
25
26(1) Feature type:
27
28 * Continuous features can be represented by `real_valued_column`.
29 * Categorical features can be represented by any `sparse_column_with_*`
30 column (`sparse_column_with_keys`, `sparse_column_with_vocabulary_file`,
31 `sparse_column_with_hash_bucket`, `sparse_column_with_integerized_feature`).
32
33(2) Model type:
34
35 * Deep neural network models (`DNNClassifier`, `DNNRegressor`).
36
37   Continuous features can be directly fed into deep neural network models.
38
39     age_column = real_valued_column("age")
40
41   To feed sparse features into DNN models, wrap the column with
42   `embedding_column` or `one_hot_column`. `one_hot_column` will create a dense
43   boolean tensor with an entry for each possible value, and thus the
44   computation cost is linear in the number of possible values versus the number
   of values that occur in the sparse tensor. Thus using a `one_hot_column` is
46   only recommended for features with only a few possible values. For features
47   with many possible values or for very sparse features, `embedding_column` is
48   recommended.
49
50     embedded_dept_column = embedding_column(
51       sparse_column_with_keys("department", ["math", "philosophy", ...]),
52       dimension=10)
53
 * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).
55
   Sparse features can be fed directly into linear models. When doing so,
   embedding lookups are used to efficiently perform the sparse matrix
   multiplication.
59
60     dept_column = sparse_column_with_keys("department",
61       ["math", "philosophy", "english"])
62
63   It is recommended that continuous features be bucketized before being
64   fed into linear models.
65
66     bucketized_age_column = bucketized_column(
67      source_column=age_column,
68      boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
69
70   Sparse features can be crossed (also known as conjuncted or combined) in
71   order to form non-linearities, and then fed into linear models.
72
73    cross_dept_age_column = crossed_column(
      columns=[dept_column, bucketized_age_column],
75      hash_bucket_size=1000)
76
77Example of building an `Estimator` model using FeatureColumns:
78
79  # Define features and transformations
80  deep_feature_columns = [age_column, embedded_dept_column]
81  wide_feature_columns = [dept_column, bucketized_age_column,
82      cross_dept_age_column]
83
84  # Build deep model
85  estimator = DNNClassifier(
86      feature_columns=deep_feature_columns,
87      hidden_units=[500, 250, 50])
88  estimator.train(...)
89
90  # Or build a wide model
91  estimator = LinearClassifier(
92      feature_columns=wide_feature_columns)
93  estimator.train(...)
94
95  # Or build a wide and deep model!
96  estimator = DNNLinearCombinedClassifier(
97      linear_feature_columns=wide_feature_columns,
98      dnn_feature_columns=deep_feature_columns,
99      dnn_hidden_units=[500, 250, 50])
100  estimator.train(...)
101
102
103FeatureColumns can also be transformed into a generic input layer for
104custom models using `input_from_feature_columns` within
105`feature_column_ops.py`.
106
107Example of building a non-`Estimator` model using FeatureColumns:
108
109  # Building model via layers
110
111  deep_feature_columns = [age_column, embedded_dept_column]
112  columns_to_tensor = parse_feature_columns_from_examples(
113      serialized=my_data,
114      feature_columns=deep_feature_columns)
115  first_layer = input_from_feature_columns(
116      columns_to_tensors=columns_to_tensor,
117      feature_columns=deep_feature_columns)
118  second_layer = fully_connected(first_layer, ...)
119
120See feature_column_ops_test for more examples.
121"""
122
123from __future__ import absolute_import
124from __future__ import division
125from __future__ import print_function
126
127import abc
128import collections
129import math
130
131import six
132
133from tensorflow.contrib import lookup
134from tensorflow.contrib.framework.python.framework import checkpoint_utils
135from tensorflow.contrib.framework.python.framework import experimental
136from tensorflow.contrib.framework.python.ops import variables as contrib_variables
137from tensorflow.contrib.layers.python.layers import embedding_ops
138from tensorflow.contrib.layers.python.layers import layers
139from tensorflow.contrib.layers.python.ops import bucketization_op
140from tensorflow.contrib.layers.python.ops import sparse_feature_cross_op
141from tensorflow.contrib.layers.python.ops import sparse_ops as contrib_sparse_ops
142from tensorflow.python.feature_column import feature_column as fc_core
143from tensorflow.python.framework import dtypes
144from tensorflow.python.framework import ops
145from tensorflow.python.framework import sparse_tensor as sparse_tensor_py
146from tensorflow.python.framework import tensor_shape
147from tensorflow.python.ops import array_ops
148from tensorflow.python.ops import init_ops
149from tensorflow.python.ops import math_ops
150from tensorflow.python.ops import parsing_ops
151from tensorflow.python.ops import resource_variable_ops
152from tensorflow.python.ops import sparse_ops
153from tensorflow.python.ops import string_ops
154from tensorflow.python.ops import variables
155from tensorflow.python.platform import tf_logging as logging
156from tensorflow.python.util import deprecation
157from tensorflow.python.util import nest
158
159
# Re-exports the core `InputLayer` symbol (from
# `tensorflow.python.feature_column.feature_column`, imported above as
# `fc_core`) under this contrib module during development.
InputLayer = fc_core.InputLayer  # pylint: disable=invalid-name
162
163
class _LinearEmbeddingLookupArguments(
    collections.namedtuple(
        "_LinearEmbeddingLookupArguments",
        ("input_tensor",
         "weight_tensor",
         "vocab_size",
         "initializer",
         "combiner"))):
  """Arguments a column supplies for an embedding lookup in a linear model.

  Columns return this tuple from `_wide_embedding_lookup_arguments` so that the
  weighted sum of a wide (linear) model can be computed.

  Fields:
    input_tensor: The id tensor to look up.
    weight_tensor: The per-id weight tensor, or None when ids are unweighted.
    vocab_size: Number of distinct ids (vocabulary or hash bucket size).
    initializer: Initializer for the looked-up weights.
    combiner: How multiple ids of one example are reduced ("sum", "mean" or
      "sqrtn").
  """
176
177
class _DeepEmbeddingLookupArguments(
    collections.namedtuple(
        "_DeepEmbeddingLookupArguments",
        ("input_tensor",
         "weight_tensor",
         "vocab_size",
         "initializer",
         "combiner",
         "dimension",
         "shared_embedding_name",
         "hash_key",
         "max_norm",
         "trainable"))):
  """Arguments a column supplies for an embedding lookup in a deep model.

  Columns return this tuple from `_deep_embedding_lookup_arguments` so that an
  input layer can be built from the embedding lookup. It carries the same
  leading fields as `_LinearEmbeddingLookupArguments` (`input_tensor`,
  `weight_tensor`, `vocab_size`, `initializer`, `combiner`) followed by the
  embedding-specific fields `dimension`, `shared_embedding_name`, `hash_key`,
  `max_norm` and `trainable`.
  """
195
196
@six.add_metaclass(abc.ABCMeta)
class _FeatureColumn(object):
  """Abstract base class shared by the feature columns in this module.

  A "feature column" names a whole feature family such as "country", while a
  feature is one concrete value inside that family: "country:US" is a feature
  that lives in the "country" feature column and has the feature value "US".

  This class is abstract and must not be instantiated by users. The concrete
  columns (_SparseColumn, _RealValuedColumn, ...) derive from it.
  """

  @abc.abstractproperty
  @deprecation.deprecated("2016-09-25", "Should be private.")
  def name(self):
    """Returns the name of column or transformed column."""

  @abc.abstractproperty
  @deprecation.deprecated("2016-09-25", "Should be private.")
  def config(self):
    """Returns configuration of the base feature for `tf.parse_example`."""

  @abc.abstractproperty
  @deprecation.deprecated("2016-09-25", "Should be private.")
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""

  @abc.abstractmethod
  @deprecation.deprecated("2016-09-25", "Should be private.")
  def insert_transformed_feature(self, columns_to_tensors):
    """Applies the column's transformation and stores the result in place.

    Args:
      columns_to_tensors: A mapping from feature columns to tensors. A 'string'
        key denotes a base (not yet transformed) feature. A _FeatureColumn key
        denotes a column that has already been transformed.

    Raises:
      NotImplementedError: Always, unless overridden by a subclass.
    """
    message = "Transform is not implemented for {}.".format(self)
    raise NotImplementedError(message)

  # The defaults below raise instead of being abstract, so a subclass only
  # needs to override the hooks that make sense for it.
  # pylint: disable=unused-argument
  def _to_dnn_input_layer(self,
                          input_tensor,
                          weight_collection=None,
                          trainable=True,
                          output_rank=2):
    """Returns a Tensor as an input to the first layer of neural network."""
    raise ValueError("Calling an abstract method.")

  def _deep_embedding_lookup_arguments(self, input_tensor):
    """Returns arguments to embedding lookup to build an input layer."""
    message = "No deep embedding lookup arguments for column {}.".format(self)
    raise NotImplementedError(message)

  # To be usable in linear models, a class is expected to implement either
  # _wide_embedding_lookup_arguments or _to_dense_tensor.
  def _wide_embedding_lookup_arguments(self, input_tensor):
    """Returns arguments to look up embeddings for this column."""
    message = "No wide embedding lookup arguments for column {}.".format(self)
    raise NotImplementedError(message)

  def _to_dense_tensor(self, input_tensor):
    """Returns a dense tensor representing this column's values."""
    message = "No dense tensor representation for column {}.".format(self)
    raise NotImplementedError(message)

  def _checkpoint_path(self):
    """Returns None, or a (path,tensor_name) to load a checkpoint from."""
    return None

  def _key_without_properties(self, properties):
    """Builds a `key`-style string that leaves out the given fields.

    The result has the same shape as `str(self)` for a namedtuple, i.e.
    `TypeName(field=value, ...)`, minus every field listed in `properties`.

    Fields are excluded when their value must not take part in the key. For
    instance `initializer` is dropped from the key of EmbeddingColumn because
    different initializers for the same embedding column are not supported
    (ditto for `normalizer` and RealValuedColumn). The default string form of
    a function also contains its address, which would make sorting by key
    non-deterministic.

    Args:
      properties: Container of field names to omit.

    Returns:
      The key string.
    """
    # pylint: disable=protected-access
    kept_fields = [
        "{}={}".format(field, self[position])
        for position, field in enumerate(self._fields)
        if field not in properties
    ]
    # pylint: enable=protected-access
    return "{}({})".format(type(self).__name__, ", ".join(kept_fields))
302
303
304# TODO(b/30410315): Support warm starting in all feature columns.
class _SparseColumn(
    _FeatureColumn,
    fc_core._CategoricalColumn,  # pylint: disable=protected-access
    collections.namedtuple(
        "_SparseColumn",
        ("column_name", "is_integerized", "bucket_size", "lookup_config",
         "combiner", "dtype"))):
  """Immutable description of a sparse (categorical) feature column.

  A sparse column means the features are sparse and the dictionary returned by
  InputBuilder contains a ("column_name", SparseTensor) pair. Dense inputs are
  converted to sparse tensors by `_get_input_sparse_tensor`.

  Exactly one of `bucket_size` and `lookup_config` has to be set, and
  `bucket_size` is mandatory when `is_integerized` is True.

  Attributes:
    column_name: A string defining sparse column name.
    is_integerized: A bool; True means the feature is an integer that can be
      used directly as the id.
    bucket_size: An int that is > 0. The number of buckets.
    lookup_config: A _SparseIdLookupConfig defining feature-to-id lookup
      configuration.
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with "sum"
      the default. "sqrtn" often achieves good accuracy, in particular with
      bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    dtype: Type of features, either `tf.string` or `tf.int64`.

  Raises:
    TypeError: if lookup_config is not a _SparseIdLookupConfig.
    ValueError: if above expectations about input fails.
  """

  def __new__(cls,
              column_name,
              is_integerized=False,
              bucket_size=None,
              lookup_config=None,
              combiner="sum",
              dtype=dtypes.string):
    """Validates the arguments and creates the column tuple.

    The checks run in a fixed order, so the first violated expectation decides
    which error is raised.
    """
    # Integerized columns need a bucket count and an integer dtype.
    if is_integerized:
      if bucket_size is None:
        raise ValueError("bucket_size must be set if is_integerized is True. "
                         "column_name: {}".format(column_name))
      if not dtype.is_integer:
        raise ValueError(
            "dtype must be an integer if is_integerized is True. "
            "dtype: {}, column_name: {}.".format(dtype, column_name))

    if dtype != dtypes.string and not dtype.is_integer:
      raise ValueError("dtype must be string or integer. "
                       "dtype: {}, column_name: {}".format(dtype, column_name))

    # bucket_size and lookup_config are mutually exclusive, and one is needed.
    if bucket_size is None:
      if lookup_config is None:
        raise ValueError("one of bucket_size or lookup_config must be set. "
                         "column_name: {}".format(column_name))
    else:
      if lookup_config:
        raise ValueError("one and only one of bucket_size or lookup_config "
                         "must be set. column_name: {}".format(column_name))
      if bucket_size < 1:
        raise ValueError(
            "bucket_size must be at least 1. "
            "bucket_size: {}, column_name: {}".format(bucket_size,
                                                      column_name))

    if lookup_config:
      if not isinstance(lookup_config, _SparseIdLookupConfig):
        raise TypeError(
            "lookup_config must be an instance of _SparseIdLookupConfig. "
            "Given one is in type {} for column_name {}".format(
                type(lookup_config), column_name))
      # A vocabulary file is only usable when its size is known.
      if lookup_config.vocabulary_file and lookup_config.vocab_size is None:
        raise ValueError("vocab_size must be defined. "
                         "column_name: {}".format(column_name))

    return super(_SparseColumn, cls).__new__(
        cls,
        column_name,
        is_integerized=is_integerized,
        bucket_size=bucket_size,
        lookup_config=lookup_config,
        combiner=combiner,
        dtype=dtype)

  @property
  def name(self):
    """Returns the column name."""
    return self.column_name

  @property
  def length(self):
    """Returns vocabulary or hash_bucket size."""
    if self.bucket_size is None:
      lookup_cfg = self.lookup_config
      return lookup_cfg.vocab_size + lookup_cfg.num_oov_buckets
    return self.bucket_size

  @property
  def config(self):
    """Returns the parsing spec: a VarLenFeature of `dtype` per column name."""
    feature_spec = parsing_ops.VarLenFeature(self.dtype)
    return {self.column_name: feature_spec}

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return "{}".format(self)

  def id_tensor(self, input_tensor):
    """Returns the id tensor from the given transformed input_tensor."""
    return input_tensor

  # pylint: disable=unused-argument
  def weight_tensor(self, input_tensor):
    """Returns the weight tensor from the given transformed input_tensor."""
    return None

  # pylint: disable=unused-argument
  def _to_dnn_input_layer(self,
                          input_tensor,
                          weight_collections=None,
                          trainable=True,
                          output_rank=2):
    """Always raises: a raw sparse column cannot feed a DNN directly."""
    message = (
        "SparseColumn is not supported in DNN. "
        "Please use embedding_column or one_hot_column. column: {}".format(
            self))
    raise ValueError(message)

  def _wide_embedding_lookup_arguments(self, input_tensor):
    """Returns the lookup arguments used by linear models for this column."""
    ids = self.id_tensor(input_tensor)
    weights = self.weight_tensor(input_tensor)
    return _LinearEmbeddingLookupArguments(
        input_tensor=ids,
        weight_tensor=weights,
        vocab_size=self.length,
        initializer=init_ops.zeros_initializer(),
        combiner=self.combiner)

  def _get_input_sparse_tensor(self, input_tensor):
    """Returns `input_tensor` as a SparseTensor, converting it if it is dense."""
    if isinstance(input_tensor, sparse_tensor_py.SparseTensor):
      return input_tensor

    # To avoid making any assumptions about which values are to be ignored,
    # numeric tensors use -1 as the ignore value so that valid indices are not
    # excluded. String tensors ignore the empty string.
    ignore_value = "" if input_tensor.dtype == dtypes.string else -1
    dense_tensor = _reshape_real_valued_tensor(input_tensor, 2, self.name)
    return contrib_sparse_ops.dense_to_sparse_tensor(
        dense_tensor, ignore_value=ignore_value)

  def is_compatible(self, other_column):
    """Check compatibility of two sparse columns.

    If both columns carry a lookup_config, they are compatible exactly when
    the configs are equal. Otherwise they are compatible when their lengths
    match and their dtypes are equal or both integer. In that case a warning
    is logged, since the vocabularies themselves cannot be compared.
    """
    if self.lookup_config and other_column.lookup_config:
      return self.lookup_config == other_column.lookup_config

    compatible = (
        self.length == other_column.length and
        (self.dtype == other_column.dtype or
         (self.dtype.is_integer and other_column.dtype.is_integer)))
    if not compatible:
      return compatible

    logging.warn("Column {} and {} may not have the same vocabulary.".
                 format(self.name, other_column.name))
    return compatible

  @abc.abstractmethod
  def _do_transform(self, input_tensor):
    """Maps the sparse input tensor to ids; implemented by subclasses."""

  def insert_transformed_feature(self, columns_to_tensors):
    """Handles sparse column to id conversion."""
    raw_tensor = columns_to_tensors[self.name]
    sparse_input = self._get_input_sparse_tensor(raw_tensor)
    columns_to_tensors[self] = self._do_transform(sparse_input)

  def _transform_feature(self, inputs):
    """Returns the transformed tensor for the raw feature found in `inputs`."""
    raw_tensor = inputs.get(self.name)
    sparse_input = self._get_input_sparse_tensor(raw_tensor)
    return self._do_transform(sparse_input)

  @property
  def _parse_example_spec(self):
    """Returns the same parsing spec as `config`."""
    return self.config

  @property
  def _num_buckets(self):
    """Returns the same size as `length`."""
    return self.length

  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    """Returns the (id tensor, weight tensor) pair for this column."""
    # Accepted for signature compatibility only.
    del weight_collections, trainable
    transformed = inputs.get(self)
    ids = self.id_tensor(transformed)
    weights = self.weight_tensor(transformed)
    return fc_core._CategoricalColumn.IdWeightPair(  # pylint: disable=protected-access
        ids, weights)
498
499
class _SparseColumnIntegerized(_SparseColumn):
  """See `sparse_column_with_integerized_feature`."""

  def _do_transform(self, input_tensor):
    """Returns a SparseTensor whose values are taken modulo `bucket_size`.

    Indices and dense shape are carried over from `input_tensor` unchanged.
    """
    bucketed_values = math_ops.mod(
        input_tensor.values, self.bucket_size, name="mod")
    return sparse_tensor_py.SparseTensor(
        input_tensor.indices,
        bucketed_values,
        input_tensor.dense_shape)
508
509
def sparse_column_with_integerized_feature(column_name,
                                           bucket_size,
                                           combiner="sum",
                                           dtype=dtypes.int64):
  """Creates an integerized _SparseColumn.

  Use this when your features are already pre-integerized into integer IDs, so
  that the feature value itself can serve as the id. The column's transform
  takes each value modulo `bucket_size` (see `_SparseColumnIntegerized`), so
  values inside [0, bucket_size) are passed through as they are.

  Typically this is used for reading contiguous ranges of integer indexes, but
  it doesn't have to be. Just be aware that large gaps of unused integers
  affect whatever consumes the ids: for instance, a one-hot tensor built from
  them contains entries for the unused integers that are always zero.

  Args:
    column_name: A string defining sparse column name.
    bucket_size: An int that is >= 1. The number of buckets. It should be bigger
      than maximum feature. In other words features in this column should be an
      int64 in range [0, bucket_size)
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with "sum"
      the default. "sqrtn" often achieves good accuracy, in particular with
      bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    dtype: Type of features. It should be an integer type. Default value is
      dtypes.int64.

  Returns:
    An integerized _SparseColumn definition.

  Raises:
    ValueError: bucket_size is None or less than 1.
    ValueError: dtype is not integer.
  """
  integerized_column = _SparseColumnIntegerized(
      column_name,
      is_integerized=True,
      bucket_size=bucket_size,
      combiner=combiner,
      dtype=dtype)
  return integerized_column
553
554
class _SparseColumnHashed(_SparseColumn):
  """See `sparse_column_with_hash_bucket`."""

  def __new__(cls,
              column_name,
              is_integerized=False,
              bucket_size=None,
              lookup_config=None,
              combiner="sum",
              dtype=dtypes.string,
              hash_keys=None):
    """Validates `hash_keys` and creates the column.

    Args:
      column_name: A string defining sparse column name.
      is_integerized: Forwarded to `_SparseColumn`.
      bucket_size: Forwarded to `_SparseColumn`; the number of hash buckets.
      lookup_config: Forwarded to `_SparseColumn`.
      combiner: Forwarded to `_SparseColumn`.
      dtype: Forwarded to `_SparseColumn`.
      hash_keys: None, or a non-empty list of two-element lists of integers.

    Returns:
      The new column, with `hash_keys` stored as a plain instance attribute.
      It is not one of the namedtuple fields.

    Raises:
      ValueError: if `hash_keys` is given but is not a non-empty list whose
        elements are pairs (lists of length two) of integers, or if the
        `_SparseColumn` validation fails.
    """
    if hash_keys is not None:
      if not isinstance(hash_keys, list) or not hash_keys:
        raise ValueError("hash_keys must be a non-empty list.")
      # six.integer_types also covers Python 2 `long`, which a plain
      # isinstance(key, int) check would reject.
      if (any(not isinstance(key_pair, list) for key_pair in hash_keys) or
          any(len(key_pair) != 2 for key_pair in hash_keys) or
          any(not isinstance(key, six.integer_types)
              for key in nest.flatten(hash_keys))):
        raise ValueError(
            "Each element of hash_keys must be a pair of integers.")
    obj = super(_SparseColumnHashed, cls).__new__(
        cls,
        column_name,
        is_integerized=is_integerized,
        bucket_size=bucket_size,
        lookup_config=lookup_config,
        combiner=combiner,
        dtype=dtype)
    obj.hash_keys = hash_keys
    return obj

  def _do_transform(self, input_tensor):
    """Hashes the sparse values into `bucket_size` buckets.

    Integer values are converted to strings first. Without `hash_keys` the
    fast string hash is used and the result keeps the input's indices and
    dense shape. With `hash_keys`, one strongly hashed SparseTensor is built
    per key pair and the results are concatenated along axis 1.

    Args:
      input_tensor: The sparse input; its `values`, `indices` and
        `dense_shape` are read.

    Returns:
      A SparseTensor of hashed ids.
    """
    if self.dtype.is_integer:
      sparse_values = string_ops.as_string(input_tensor.values)
    else:
      sparse_values = input_tensor.values

    if self.hash_keys:
      result = []
      for key in self.hash_keys:
        sparse_id_values = string_ops.string_to_hash_bucket_strong(
            sparse_values, self.bucket_size, key)
        result.append(
            sparse_tensor_py.SparseTensor(input_tensor.indices,
                                          sparse_id_values,
                                          input_tensor.dense_shape))
      return sparse_ops.sparse_concat(axis=1, sp_inputs=result, name="lookup")
    else:
      sparse_id_values = string_ops.string_to_hash_bucket_fast(
          sparse_values, self.bucket_size, name="lookup")
      return sparse_tensor_py.SparseTensor(
          input_tensor.indices, sparse_id_values, input_tensor.dense_shape)
606
607
def sparse_column_with_hash_bucket(column_name,
                                   hash_bucket_size,
                                   combiner="sum",
                                   dtype=dtypes.string,
                                   hash_keys=None):
  """Creates a _SparseColumn with hashed bucket configuration.

  Use this when your sparse features are in string or integer format, but you
  don't have a vocab file that maps each value to an integer ID.

    output_id = Hash(input_feature_string) % bucket_size

  When hash_keys is set, one integer ID is created for every key pair in
  `hash_keys`. This is useful to reduce the collision of hashed ids.

  Args:
    column_name: A string defining sparse column name.
    hash_bucket_size: An int that is at least 1. The number of buckets.
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with "sum"
      the default. "sqrtn" often achieves good accuracy, in particular with
      bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    dtype: The type of features. Only string and integer types are supported.
    hash_keys: The hash keys to use. It is a list of lists of two uint64s. If
      None, simple and fast hashing algorithm is used. Otherwise, multiple
      strong hash ids would be produced with each two uint64s in this argument.

  Returns:
    A _SparseColumn with hashed bucket configuration

  Raises:
    ValueError: hash_bucket_size is None or less than 1.
    ValueError: dtype is neither string nor integer.
    ValueError: hash_keys is given but is not a non-empty list of pairs of
      integers.
  """
  hashed_column = _SparseColumnHashed(
      column_name,
      bucket_size=hash_bucket_size,
      combiner=combiner,
      dtype=dtype,
      hash_keys=hash_keys)
  return hashed_column
651
652
class _SparseColumnKeys(_SparseColumn):
  """See `sparse_column_with_keys`."""

  def _do_transform(self, input_tensor):
    """Looks `input_tensor` up in an index table built from the configured keys.

    The table maps each key of `lookup_config.keys` to its position and uses
    `lookup_config.default_value` for anything else.
    """
    key_config = self.lookup_config
    index_table = lookup.index_table_from_tensor(
        mapping=tuple(key_config.keys),
        default_value=key_config.default_value,
        dtype=self.dtype,
        name="lookup")
    return index_table.lookup(input_tensor)
663
664
def sparse_column_with_keys(
    column_name, keys, default_value=-1, combiner="sum", dtype=dtypes.string):
  """Creates a _SparseColumn with keys.

  Look up logic is as follows:

    lookup_id = index_of_feature_in_keys if feature in keys else default_value

  Args:
    column_name: A string defining sparse column name.
    keys: A list or tuple defining vocabulary. Must be castable to `dtype`.
      It is converted to a tuple, and its length becomes the vocabulary size.
    default_value: The value to use for out-of-vocabulary feature values.
      Default is -1.
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with "sum"
      the default. "sqrtn" often achieves good accuracy, in particular with
      bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    dtype: Type of features. Only integer and string are supported.

  Returns:
    A _SparseColumnKeys with keys configuration.
  """
  vocabulary = tuple(keys)
  keys_config = _SparseIdLookupConfig(
      keys=vocabulary,
      vocab_size=len(vocabulary),
      default_value=default_value)
  return _SparseColumnKeys(
      column_name,
      lookup_config=keys_config,
      combiner=combiner,
      dtype=dtype)
697
698
class _SparseColumnVocabulary(_SparseColumn):
  """See `sparse_column_with_vocabulary_file`."""

  def _do_transform(self, st):
    """Looks the sparse tensor `st` up in a table built from the vocab file.

    For integer columns the values are converted to strings before the lookup,
    while indices and dense shape are kept. The table is configured from
    `lookup_config` and named after the column.
    """
    sparse_string_tensor = st
    if self.dtype.is_integer:
      values_as_strings = string_ops.as_string(st.values)
      sparse_string_tensor = sparse_tensor_py.SparseTensor(
          st.indices, values_as_strings, st.dense_shape)

    vocab_config = self.lookup_config
    vocab_table = lookup.index_table_from_file(
        vocabulary_file=vocab_config.vocabulary_file,
        num_oov_buckets=vocab_config.num_oov_buckets,
        vocab_size=vocab_config.vocab_size,
        default_value=vocab_config.default_value,
        name=self.name + "_lookup")
    return vocab_table.lookup(sparse_string_tensor)
718
719
def sparse_column_with_vocabulary_file(column_name,
                                       vocabulary_file,
                                       num_oov_buckets=0,
                                       vocab_size=None,
                                       default_value=-1,
                                       combiner="sum",
                                       dtype=dtypes.string):
  """Creates a _SparseColumn with vocabulary file configuration.

  Use this when your sparse features are in string or integer format, and you
  have a vocab file that maps each value to an integer ID.

    output_id = LookupIdFromVocab(input_feature_string)

  Args:
    column_name: A string defining sparse column name.
    vocabulary_file: The vocabulary filename.
    num_oov_buckets: The number of out-of-vocabulary buckets. If zero all out of
      vocabulary features will be ignored.
    vocab_size: Number of the elements in the vocabulary. Required, even
      though the parameter has a default of None.
    default_value: The value to use for out-of-vocabulary feature values.
      Defaults to -1.
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with "sum"
      the default. "sqrtn" often achieves good accuracy, in particular with
      bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A _SparseColumn with vocabulary file configuration.

  Raises:
    ValueError: vocab_size is not defined.
    ValueError: dtype is neither string nor integer.
  """
  # Fail early, before any lookup configuration is built.
  if vocab_size is None:
    raise ValueError("vocab_size should be defined. "
                     "column_name: {}".format(column_name))

  vocab_config = _SparseIdLookupConfig(
      vocabulary_file=vocabulary_file,
      num_oov_buckets=num_oov_buckets,
      vocab_size=vocab_size,
      default_value=default_value)
  return _SparseColumnVocabulary(
      column_name,
      lookup_config=vocab_config,
      combiner=combiner,
      dtype=dtype)
771
772
class _WeightedSparseColumn(
    _FeatureColumn,
    fc_core._CategoricalColumn,  # pylint: disable=protected-access
    collections.namedtuple("_WeightedSparseColumn",
                           ["sparse_id_column", "weight_column_name",
                            "dtype"])):
  """A sparse id column paired with a weight feature.

  See `weighted_sparse_column`. The transformed value of this column is a
  2-tuple `(id_tensor, weight_tensor)`.
  """

  def __new__(cls, sparse_id_column, weight_column_name, dtype):
    parent = super(_WeightedSparseColumn, cls)
    return parent.__new__(cls, sparse_id_column, weight_column_name, dtype)

  @property
  def name(self):
    """Name derived from the id column name and the weight column name."""
    id_column_name = self.sparse_id_column.name
    return "{}_weighted_by_{}".format(id_column_name, self.weight_column_name)

  @property
  def length(self):
    """Returns id size."""
    return self.sparse_id_column.length

  @property
  def config(self):
    """Parsing config of the id column plus a VarLenFeature for the weights."""
    parsing_spec = _get_feature_config(self.sparse_id_column)
    weight_spec = {
        self.weight_column_name: parsing_ops.VarLenFeature(self.dtype)
    }
    parsing_spec.update(weight_spec)
    return parsing_spec

  @property
  def lookup_config(self):
    """Lookup config of the wrapped id column."""
    return self.sparse_id_column.lookup_config

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return "{}".format(self)

  def id_tensor(self, input_tensor):
    """Returns the id tensor from the given transformed input_tensor."""
    ids = input_tensor[0]
    return ids

  def weight_tensor(self, input_tensor):
    """Returns the weight tensor from the given transformed input_tensor."""
    weights = input_tensor[1]
    return weights

  # pylint: disable=unused-argument
  def _to_dnn_input_layer(self,
                          input_tensor,
                          weight_collections=None,
                          trainable=True,
                          output_rank=2):
    """Always raises: this column cannot be fed to a DNN directly."""
    raise ValueError(
        "WeightedSparseColumn is not supported in DNN. "
        "Please use embedding_column or one_hot_column. column: {}".format(
            self))

  def _wide_embedding_lookup_arguments(self, input_tensor):
    """Builds the lookup arguments used by linear (wide) models."""
    ids = self.id_tensor(input_tensor)
    weights = self.weight_tensor(input_tensor)
    return _LinearEmbeddingLookupArguments(
        input_tensor=ids,
        weight_tensor=weights,
        vocab_size=self.length,
        initializer=init_ops.zeros_initializer(),
        combiner=self.sparse_id_column.combiner)

  def _do_transform(self, id_tensor, weight_tensor):
    """Returns `(id_tensor, weights)` with weights sparse and cast if needed."""
    sparse_weights = weight_tensor
    already_sparse = isinstance(sparse_weights, sparse_tensor_py.SparseTensor)
    if not already_sparse:
      # A regular Tensor is accepted as weights; convert it to sparse form.
      sparse_weights = contrib_sparse_ops.dense_to_sparse_tensor(
          sparse_weights)
    if not self.dtype.is_floating:
      sparse_weights = math_ops.cast(sparse_weights, dtypes.float32)
    return (id_tensor, sparse_weights)

  def insert_transformed_feature(self, columns_to_tensors):
    """Inserts a tuple with the id and weight tensors."""
    id_column = self.sparse_id_column
    if id_column not in columns_to_tensors:
      id_column.insert_transformed_feature(columns_to_tensors)

    raw_weights = columns_to_tensors[self.weight_column_name]
    transformed_ids = columns_to_tensors[id_column]
    columns_to_tensors[self] = self._do_transform(transformed_ids, raw_weights)

  def _transform_feature(self, inputs):
    """Fetches ids and weights from `inputs` and transforms them."""
    ids = inputs.get(self.sparse_id_column)
    weights = inputs.get(self.weight_column_name)
    return self._do_transform(ids, weights)

  @property
  def _parse_example_spec(self):
    return self.config

  @property
  def _num_buckets(self):
    return self.length

  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    """Returns the transformed ids and weights as an `IdWeightPair`."""
    del weight_collections, trainable  # Unused.
    id_weight_tuple = inputs.get(self)
    pair_cls = fc_core._CategoricalColumn.IdWeightPair  # pylint: disable=protected-access
    return pair_cls(self.id_tensor(id_weight_tuple),
                    self.weight_tensor(id_weight_tuple))

  def is_compatible(self, other_column):
    """Check compatibility with other sparse column."""
    # Weighted columns are compared through the id columns they wrap.
    if isinstance(other_column, _WeightedSparseColumn):
      other_column = other_column.sparse_id_column
    return self.sparse_id_column.is_compatible(other_column)
880
881
def weighted_sparse_column(sparse_id_column,
                           weight_column_name,
                           dtype=dtypes.float32):
  """Creates a weighted column from a sparse id column and a weight feature.

  Example:

    ```python
    sparse_feature = sparse_column_with_hash_bucket(column_name="sparse_col",
                                                    hash_bucket_size=1000)
    weighted_feature = weighted_sparse_column(sparse_id_column=sparse_feature,
                                              weight_column_name="weights_col")
    ```

    With this configuration the model's input dictionary is assumed to hold
    these two items:
      * (key="sparse_col", value=sparse_tensor) where sparse_tensor is
        a SparseTensor.
      * (key="weights_col", value=weights_tensor) where weights_tensor
        is a SparseTensor.
     The following are assumed to hold:
       * sparse_tensor.indices = weights_tensor.indices
       * sparse_tensor.dense_shape = weights_tensor.dense_shape

  Args:
    sparse_id_column: A `_SparseColumn` produced by one of the
      `sparse_column_with_*` functions.
    weight_column_name: A string naming the sparse column that holds the
      weight or value of the corresponding sparse id feature.
    dtype: Type of the weights, such as `tf.float32`. Only floating and
      integer weights are supported.

  Returns:
    A _WeightedSparseColumn composed of two sparse features: one holds the ids,
    the other holds the weight (value) of the id feature in that example.

  Raises:
    ValueError: if dtype is not convertible to float.
  """
  convertible_to_float = dtype.is_integer or dtype.is_floating
  if not convertible_to_float:
    raise ValueError("dtype is not convertible to float. Given {}".format(
        dtype))

  return _WeightedSparseColumn(
      sparse_id_column=sparse_id_column,
      weight_column_name=weight_column_name,
      dtype=dtype)
926
927
class _OneHotColumn(
    _FeatureColumn,
    fc_core._DenseColumn,  # pylint: disable=protected-access
    collections.namedtuple("_OneHotColumn", ["sparse_id_column"])):
  """Represents a one-hot column for use in deep networks.

  Wraps a sparse id column and exposes it as a dense tensor whose last
  dimension has `length` entries (see `_variable_shape`).

  Args:
    sparse_id_column: A _SparseColumn which is created by `sparse_column_with_*`
      function.
  """

  @property
  def name(self):
    """Returns the wrapped column's name with a `_one_hot` suffix."""
    return "{}_one_hot".format(self.sparse_id_column.name)

  @property
  def length(self):
    """Returns vocabulary or hash_bucket size."""
    return self.sparse_id_column.length

  @property
  def config(self):
    """Returns the parsing config of the origin column."""
    return _get_feature_config(self.sparse_id_column)

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return "{}".format(self)

  def insert_transformed_feature(self, columns_to_tensors):
    """Used by the Transformer to prevent double transformations.

    Makes sure the wrapped column has been transformed, then registers that
    same transformed tensor under this column. The dense encoding itself is
    produced later, in `_to_dnn_input_layer`.

    Args:
      columns_to_tensors: A mapping from feature columns (or feature keys) to
        tensors; it is updated in place.
    """
    if self.sparse_id_column not in columns_to_tensors:
      self.sparse_id_column.insert_transformed_feature(columns_to_tensors)
    columns_to_tensors[self] = columns_to_tensors[self.sparse_id_column]

  def _to_dnn_input_layer(self,
                          transformed_input_tensor,
                          unused_weight_collections=None,
                          unused_trainable=False,
                          output_rank=2):
    """Returns a Tensor as an input to the first layer of neural network.

    When the wrapped column supplies a weight tensor, the result holds the
    weights scattered into the id positions; otherwise it is a multi-hot
    encoding of the ids.

    Args:
      transformed_input_tensor: A tensor that has undergone the transformations
        in `insert_transformed_feature`. Rank should be >= `output_rank`.
      unused_weight_collections: Unused. One hot encodings are not variable.
      unused_trainable: Unused. One hot encodings are not trainable.
      output_rank: the desired rank of the output `Tensor`.

    Returns:
      A multi-hot Tensor to be fed into the first layer of neural network.

    Note:
      No exception is raised explicitly here. Columns that provide weights
      (e.g. `weighted_sparse_column`) are handled by the
      `weight_tensor is not None` branch below.
    """

    # Reshape ID column to `output_rank`.
    sparse_id_column = self.sparse_id_column.id_tensor(transformed_input_tensor)
    # pylint: disable=protected-access
    sparse_id_column = layers._inner_flatten(sparse_id_column, output_rank)

    # The wrapped column decides whether weights exist; `None` selects the
    # unweighted multi-hot path further down.
    weight_tensor = self.sparse_id_column.weight_tensor(
        transformed_input_tensor)
    if weight_tensor is not None:
      # Combine ids and weights into one sparse tensor over `self.length` ids.
      weighted_column = sparse_ops.sparse_merge(sp_ids=sparse_id_column,
                                                sp_values=weight_tensor,
                                                vocab_size=self.length)
      # Remove (?, -1) index
      # (done by slicing from an all-zero start over the full dense shape).
      weighted_column = sparse_ops.sparse_slice(
          weighted_column,
          array_ops.zeros_like(weighted_column.dense_shape),
          weighted_column.dense_shape)
      dense_tensor = sparse_ops.sparse_tensor_to_dense(weighted_column)
      # Keep the leading dimensions and pin the last one to `self.length`.
      batch_shape = array_ops.shape(dense_tensor)[:-1]
      dense_tensor_shape = array_ops.concat(
          [batch_shape, [self.length]], axis=0)
      dense_tensor = array_ops.reshape(dense_tensor, dense_tensor_shape)
      return dense_tensor

    # Unweighted path: missing entries are filled with id -1.
    # NOTE(review): presumably -1 yields an all-`off_value` row in `one_hot`
    # so padding adds nothing to the sum below -- confirm against tf.one_hot.
    dense_id_tensor = sparse_ops.sparse_tensor_to_dense(sparse_id_column,
                                                        default_value=-1)

    # One hot must be float for tf.concat reasons since all other inputs to
    # input_layer are float32.
    one_hot_id_tensor = array_ops.one_hot(
        dense_id_tensor, depth=self.length, on_value=1.0, off_value=0.0)

    # Reduce to get a multi-hot per example.
    return math_ops.reduce_sum(one_hot_id_tensor, axis=[output_rank - 1])

  @property
  def _variable_shape(self):
    """Returns a `TensorShape` with a single dimension of size `length`."""
    return tensor_shape.TensorShape([self.length])

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    """Returns `inputs.get(self)`; the other arguments are ignored."""
    del weight_collections
    del trainable
    return inputs.get(self)

  def _transform_feature(self, inputs):
    """Densifies the wrapped column's tensor using the default output rank."""
    return self._to_dnn_input_layer(inputs.get(self.sparse_id_column))

  @property
  def _parse_example_spec(self):
    """Returns the same parsing spec as `config`."""
    return self.config
1035
1036
class _EmbeddingColumn(
    _FeatureColumn,
    fc_core._DenseColumn,  # pylint: disable=protected-access
    collections.namedtuple("_EmbeddingColumn", [
        "sparse_id_column", "dimension", "combiner", "initializer",
        "ckpt_to_load_from", "tensor_name_in_ckpt", "shared_embedding_name",
        "shared_vocab_size", "max_norm", "trainable"
    ])):
  """Represents an embedding column.

  Args:
    sparse_id_column: A `_SparseColumn` produced by the `sparse_column_with_*`
      or `weighted_sparse_column` functions.
    dimension: An integer giving the dimension of the embedding.
    combiner: A string selecting the reduction applied when a single row has
      multiple entries. "mean", "sqrtn" and "sum" are currently supported, and
      "mean" is the default. "sqrtn" often achieves good accuracy, in
      particular with bag-of-words columns. Each can be viewed as an
      example-level normalization of the column:
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function used to initialize the
      embedding variable. When omitted, defaults to
      `tf.truncated_normal_initializer` with mean 0.0 and standard deviation
      1/sqrt(sparse_id_column.length).
    ckpt_to_load_from: (Optional). String giving the checkpoint name/pattern
      from which the column weights are restored. Required if
      `tensor_name_in_ckpt` is not None.
    tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided
      checkpoint from which the column weights are restored. Required if
      `ckpt_to_load_from` is not None.
    shared_embedding_name: (Optional). The common name for shared embedding.
    shared_vocab_size: (Optional). The common vocab_size used for the shared
      embedding space.
    max_norm: (Optional). If not None, embedding values are l2-normalized to
      the value of max_norm.
    trainable: (Optional). Should the embedding be trainable. Default is True.

  Raises:
    ValueError: if `initializer` is specified and is not callable. Also,
      if only one of `ckpt_to_load_from` and `tensor_name_in_ckpt` is specified.
  """

  def __new__(cls,
              sparse_id_column,
              dimension,
              combiner="mean",
              initializer=None,
              ckpt_to_load_from=None,
              tensor_name_in_ckpt=None,
              shared_embedding_name=None,
              shared_vocab_size=None,
              max_norm=None,
              trainable=True):
    if not (initializer is None or callable(initializer)):
      raise ValueError("initializer must be callable if specified. "
                       "Embedding of column_name: {}".format(
                           sparse_id_column.name))

    # The checkpoint path and the tensor name only make sense together.
    checkpoint_given = ckpt_to_load_from is not None
    tensor_name_given = tensor_name_in_ckpt is not None
    if checkpoint_given != tensor_name_given:
      raise ValueError("Must specify both `ckpt_to_load_from` and "
                       "`tensor_name_in_ckpt` or none of them.")

    if initializer is None:
      logging.warn("The default stddev value of initializer was changed from "
                   "\"1/sqrt(vocab_size)\" to \"1/sqrt(dimension)\" in core "
                   "implementation (tf.feature_column.embedding_column).")
      initializer = init_ops.truncated_normal_initializer(
          mean=0.0, stddev=1 / math.sqrt(sparse_id_column.length))

    field_values = (sparse_id_column, dimension, combiner, initializer,
                    ckpt_to_load_from, tensor_name_in_ckpt,
                    shared_embedding_name, shared_vocab_size, max_norm,
                    trainable)
    return super(_EmbeddingColumn, cls).__new__(cls, *field_values)

  @property
  def name(self):
    """Name of the wrapped column plus an embedding-specific suffix."""
    template = ("{}_embedding" if self.shared_embedding_name is None
                else "{}_shared_embedding")
    return template.format(self.sparse_id_column.name)

  @property
  def length(self):
    """Returns id size."""
    if self.shared_vocab_size is not None:
      return self.shared_vocab_size
    return self.sparse_id_column.length

  @property
  def config(self):
    """Parsing config of the wrapped id column."""
    return _get_feature_config(self.sparse_id_column)

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    excluded_properties = ["initializer"]
    return self._key_without_properties(excluded_properties)

  def insert_transformed_feature(self, columns_to_tensors):
    """Registers the wrapped column's transformed tensor under this column."""
    id_column = self.sparse_id_column
    if id_column not in columns_to_tensors:
      id_column.insert_transformed_feature(columns_to_tensors)
    columns_to_tensors[self] = columns_to_tensors[id_column]

  def _deep_embedding_lookup_arguments(self, input_tensor):
    """Collects everything needed to build and look up the embedding."""
    id_column = self.sparse_id_column
    return _DeepEmbeddingLookupArguments(
        input_tensor=id_column.id_tensor(input_tensor),
        weight_tensor=id_column.weight_tensor(input_tensor),
        vocab_size=self.length,
        dimension=self.dimension,
        initializer=self.initializer,
        combiner=self.combiner,
        shared_embedding_name=self.shared_embedding_name,
        hash_key=None,
        max_norm=self.max_norm,
        trainable=self.trainable)

  def _checkpoint_path(self):
    """Returns `(ckpt_to_load_from, tensor_name_in_ckpt)`, or None if unset."""
    if self.ckpt_to_load_from is None:
      return None
    return self.ckpt_to_load_from, self.tensor_name_in_ckpt

  # pylint: disable=unused-argument
  def _wide_embedding_lookup_arguments(self, input_tensor):
    """Always raises: embedding columns cannot be used in linear models."""
    raise ValueError("Column {} is not supported in linear models. "
                     "Please use sparse_column.".format(self))

  @property
  def _variable_shape(self):
    return tensor_shape.TensorShape([self.dimension])

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    """Returns the embeddings looked up for this column's transformed input."""
    lookup_arguments = self._deep_embedding_lookup_arguments(inputs.get(self))
    return _embeddings_from_arguments(
        self, lookup_arguments, weight_collections, trainable)

  def _transform_feature(self, inputs):
    return inputs.get(self.sparse_id_column)

  @property
  def _parse_example_spec(self):
    return self.config
1185
1186
def _is_variable(v):
  """Returns whether `v` is a `Variable` or a `ResourceVariable` instance."""
  variable_types = (variables.Variable, resource_variable_ops.ResourceVariable)
  return isinstance(v, variable_types)
1191
1192
def _embeddings_from_arguments(column,
                               args,
                               weight_collections,
                               trainable,
                               output_rank=2):
  """Returns embeddings for a column based on the computed arguments.

  Creates (or, for shared embeddings, reuses) the embedding variable described
  by `args`, optionally initializes it from the column's checkpoint, and
  performs the sparse lookup.

  Args:
   column: the feature column object (not merely its name). Its `name` is used
     to name the lookup op and its `_checkpoint_path()` selects the checkpoint
     to restore from.
   args: the _DeepEmbeddingLookupArguments for this column.
   weight_collections: collections to store weights in.
   trainable: whether these embeddings should be trainable. The variable is
     trainable only if both this flag and `args.trainable` are truthy.
   output_rank: the desired rank of the returned `Tensor`. Inner dimensions will
     be combined to produce the desired rank.

  Returns:
   the embeddings.

  Raises:
   ValueError: if not possible to create; in this function, when the shared
     embedding collection holds more than one entry, or when the existing
     shared variable's shape differs from `[args.vocab_size, args.dimension]`.
  """
  # pylint: disable=protected-access
  input_tensor = layers._inner_flatten(args.input_tensor, output_rank)
  weight_tensor = None
  if args.weight_tensor is not None:
    weight_tensor = layers._inner_flatten(args.weight_tensor, output_rank)
  # pylint: enable=protected-access

  # This option is only enabled for scattered_embedding_column.
  # The variable here is 1-D with `args.vocab_size` entries, and
  # `weight_tensor` is not passed to the scattered lookup.
  if args.hash_key:
    embeddings = contrib_variables.model_variable(
        name="weights",
        shape=[args.vocab_size],
        dtype=dtypes.float32,
        initializer=args.initializer,
        trainable=(trainable and args.trainable),
        collections=weight_collections)

    return embedding_ops.scattered_embedding_lookup_sparse(
        embeddings,
        input_tensor,
        args.dimension,
        hash_key=args.hash_key,
        combiner=args.combiner,
        name="lookup")

  if args.shared_embedding_name is not None:
    # Shared embeddings are tracked in a per-name graph collection so that
    # later columns with the same name reuse the variable created first.
    shared_embedding_collection_name = (
        "SHARED_EMBEDDING_COLLECTION_" + args.shared_embedding_name.upper())
    graph = ops.get_default_graph()
    shared_embedding_collection = (
        graph.get_collection_ref(shared_embedding_collection_name))
    shape = [args.vocab_size, args.dimension]
    if shared_embedding_collection:
      if len(shared_embedding_collection) > 1:
        raise ValueError(
            "Collection %s can only contain one "
            "(partitioned) variable." % shared_embedding_collection_name)
      else:
        # Reuse the existing variable, but only if its shape matches.
        embeddings = shared_embedding_collection[0]
        if embeddings.get_shape() != shape:
          raise ValueError(
              "The embedding variable with name {} already "
              "exists, but its shape does not match required "
              "embedding shape here. Please make sure to use "
              "different shared_embedding_name for different "
              "shared embeddings.".format(args.shared_embedding_name))
    else:
      # First use of this shared name: create the variable and register it.
      embeddings = contrib_variables.model_variable(
          name=args.shared_embedding_name,
          shape=shape,
          dtype=dtypes.float32,
          initializer=args.initializer,
          trainable=(trainable and args.trainable),
          collections=weight_collections)
      graph.add_to_collection(shared_embedding_collection_name, embeddings)
  else:
    # Non-shared embedding: a fresh `[vocab_size, dimension]` variable.
    embeddings = contrib_variables.model_variable(
        name="weights",
        shape=[args.vocab_size, args.dimension],
        dtype=dtypes.float32,
        initializer=args.initializer,
        trainable=(trainable and args.trainable),
        collections=weight_collections)

  # Normalize to a list of variables for the restore and lookup calls below.
  # NOTE(review): the non-variable case is presumably a partitioned variable
  # (see the "(partitioned) variable" error message above) -- confirm.
  if _is_variable(embeddings):
    embeddings = [embeddings]
  else:
    embeddings = embeddings._get_variable_list()  # pylint: disable=protected-access
  # pylint: disable=protected-access
  _maybe_restore_from_checkpoint(column._checkpoint_path(), embeddings)
  return embedding_ops.safe_embedding_lookup_sparse(
      embeddings,
      input_tensor,
      sparse_weights=weight_tensor,
      combiner=args.combiner,
      name=column.name + "weights",
      max_norm=args.max_norm)
1291
1292
1293def _maybe_restore_from_checkpoint(checkpoint_path, variable):
1294  if checkpoint_path is not None:
1295    path, tensor_name = checkpoint_path
1296    weights_to_restore = variable
1297    if len(variable) == 1:
1298      weights_to_restore = variable[0]
1299    checkpoint_utils.init_from_checkpoint(path,
1300                                          {tensor_name: weights_to_restore})
1301
1302
def one_hot_column(sparse_id_column):
  """Wraps a sparse column as an `_OneHotColumn` for use in a DNN.

  The resulting column yields a one-hot or multi-hot representation.

  Args:
      sparse_id_column: A _SparseColumn produced by the
        `sparse_column_with_*` or crossed_column functions. Note that the
        `combiner` defined in `sparse_id_column` is ignored.

  Returns:
    An _OneHotColumn.
  """
  one_hot = _OneHotColumn(sparse_id_column)
  return one_hot
1316
1317
def embedding_column(sparse_id_column,
                     dimension,
                     combiner="mean",
                     initializer=None,
                     ckpt_to_load_from=None,
                     tensor_name_in_ckpt=None,
                     max_norm=None,
                     trainable=True):
  """Creates an `_EmbeddingColumn` for feeding sparse data into a DNN.

  Args:
    sparse_id_column: A `_SparseColumn` produced by, for example, the
      `sparse_column_with_*` or crossed_column functions. Note that the
      `combiner` defined in `sparse_id_column` is ignored.
    dimension: An integer giving the dimension of the embedding.
    combiner: A string selecting the reduction applied when a single row has
      multiple entries. "mean", "sqrtn" and "sum" are currently supported, and
      "mean" is the default. "sqrtn" often achieves good accuracy, in
      particular with bag-of-words columns. Each can be viewed as an
      example-level normalization of the column:
        * "sum": do not normalize
        * "mean": do l1 normalization
        * "sqrtn": do l2 normalization
      For more information: `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function used to initialize the
      embedding variable. When omitted, defaults to
      `tf.truncated_normal_initializer` with mean 0.0 and standard deviation
      1/sqrt(sparse_id_column.length).
    ckpt_to_load_from: (Optional). String giving the checkpoint name/pattern
      from which the column weights are restored. Required if
      `tensor_name_in_ckpt` is not None.
    tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided
      checkpoint from which the column weights are restored. Required if
      `ckpt_to_load_from` is not None.
    max_norm: (Optional). If not None, embedding values are l2-normalized to
      the value of max_norm.
    trainable: (Optional). Should the embedding be trainable. Default is True

  Returns:
    An `_EmbeddingColumn`.
  """
  # The shared-embedding fields are left at the constructor's defaults.
  return _EmbeddingColumn(
      sparse_id_column=sparse_id_column,
      dimension=dimension,
      combiner=combiner,
      initializer=initializer,
      ckpt_to_load_from=ckpt_to_load_from,
      tensor_name_in_ckpt=tensor_name_in_ckpt,
      max_norm=max_norm,
      trainable=trainable)
1362
1363
def shared_embedding_columns(sparse_id_columns,
                             dimension,
                             combiner="mean",
                             shared_embedding_name=None,
                             initializer=None,
                             ckpt_to_load_from=None,
                             tensor_name_in_ckpt=None,
                             max_norm=None,
                             trainable=True):
  """Creates a list of `_EmbeddingColumn` sharing the same embedding.

  Args:
    sparse_id_columns: An iterable of `_SparseColumn`, such as those created by
      `sparse_column_with_*` or crossed_column functions. Note that `combiner`
      defined in each sparse_id_column is ignored.
    dimension: An integer specifying dimension of the embedding.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently "mean", "sqrtn" and "sum" are supported, with
      "mean" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns. Each of this can be thought as example level
      normalizations on the column:
        * "sum": do not normalize
        * "mean": do l1 normalization
        * "sqrtn": do l2 normalization
      For more information: `tf.embedding_lookup_sparse`.
    shared_embedding_name: (Optional). A string specifying the name of shared
      embedding weights. This will be needed if you want to reference the shared
      embedding separately from the generated `_EmbeddingColumn`. When empty
      and more than one column is given, a deterministic name is derived from
      the names of (up to) the first three sorted columns.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.truncated_normal_initializer` with mean 0.0 and standard deviation
      1/sqrt(sparse_id_columns[0].length).
    ckpt_to_load_from: (Optional). String representing checkpoint name/pattern
      to restore the column weights. Required if `tensor_name_in_ckpt` is not
      None.
    tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided
      checkpoint from which to restore the column weights. Required if
      `ckpt_to_load_from` is not None.
    max_norm: (Optional). If not None, embedding values are l2-normalized to
      the value of max_norm.
    trainable: (Optional). Should the embedding be trainable. Default is True

  Returns:
    A tuple of `_EmbeddingColumn` with shared embedding space. When exactly
    one column is given, a single-element list is returned instead (kept for
    backward compatibility).

  Raises:
    ValueError: if sparse_id_columns is empty, or its elements are not
      compatible with each other.
    TypeError: if `sparse_id_columns` is not a sequence or is a string. If at
      least one element of `sparse_id_columns` is not a `SparseColumn` or a
      `WeightedSparseColumn`.
  """
  # `collections.Sequence` was a deprecated alias that Python 3.10 removed, so
  # using it makes this check raise AttributeError instead of TypeError. The
  # ABC lives in `collections.abc` on Python 3 and in `collections` itself on
  # Python 2, hence the fallback.
  try:
    from collections import abc as collections_abc  # pylint: disable=g-import-not-at-top
  except ImportError:
    collections_abc = collections

  if (not isinstance(sparse_id_columns, collections_abc.Sequence) or
      isinstance(sparse_id_columns, six.string_types)):
    raise TypeError(
        "sparse_id_columns must be a non-string sequence (ex: list or tuple) "
        "instead of type {}.".format(type(sparse_id_columns)))
  if len(sparse_id_columns) < 1:
    raise ValueError("The input sparse_id_columns should have at least one "
                     "element.")
  for sparse_id_column in sparse_id_columns:
    if not isinstance(sparse_id_column,
                      (_SparseColumn, _WeightedSparseColumn)):
      raise TypeError("Elements of sparse_id_columns must be _SparseColumn or "
                      "_WeightedSparseColumn, but {} is not."
                      .format(sparse_id_column))

  if len(sparse_id_columns) == 1:
    # A single column needs no compatibility check and no shared vocab size.
    return [
        _EmbeddingColumn(sparse_id_columns[0], dimension, combiner, initializer,
                         ckpt_to_load_from, tensor_name_in_ckpt,
                         shared_embedding_name, max_norm=max_norm,
                         trainable=trainable)]

  # Check compatibility of sparse_id_columns: every other column is compared
  # against the first one. A weighted first column drives the comparison
  # itself; otherwise each other column is asked.
  first_column = sparse_id_columns[0]
  first_is_weighted = isinstance(first_column, _WeightedSparseColumn)
  for column in sparse_id_columns[1:]:
    if first_is_weighted:
      is_compatible = first_column.is_compatible(column)
    else:
      is_compatible = column.is_compatible(first_column)
    if not is_compatible:
      raise ValueError("The input sparse id columns are not compatible.")

  # Construct the shared name and size for shared embedding space.
  if not shared_embedding_name:
    # Sort the columns so that shared_embedding_name will be deterministic
    # even if users pass in unsorted columns from a dict or something.
    # Since they are different classes, ordering is SparseColumns first,
    # then WeightedSparseColumns.
    sparse_columns = []
    weighted_sparse_columns = []
    for column in sparse_id_columns:
      if isinstance(column, _SparseColumn):
        sparse_columns.append(column)
      else:
        weighted_sparse_columns.append(column)
    sorted_columns = sorted(sparse_columns) + sorted(
        weighted_sparse_columns, key=lambda x: x.name)
    # Only the first three names are spelled out; the rest are counted.
    shared_embedding_name = "_".join(
        [column.name for column in sorted_columns[:3]])
    num_unnamed_columns = len(sorted_columns) - 3
    if num_unnamed_columns > 0:
      shared_embedding_name += "_plus_{}_others".format(num_unnamed_columns)
    shared_embedding_name += "_shared_embedding"
  shared_vocab_size = first_column.length

  return tuple(
      _EmbeddingColumn(column, dimension, combiner, initializer,
                       ckpt_to_load_from, tensor_name_in_ckpt,
                       shared_embedding_name, shared_vocab_size,
                       max_norm=max_norm, trainable=trainable)
      for column in sparse_id_columns)
1481
1482
class _ScatteredEmbeddingColumn(
    _FeatureColumn,
    fc_core._DenseColumn,  # pylint: disable=protected-access
    collections.namedtuple(
        "_ScatteredEmbeddingColumn",
        ["column_name", "size", "dimension", "hash_key", "combiner",
         "initializer"])):
  """Feature column built by `scattered_embedding_column`.

  A namedtuple-backed column holding the settings (`size`, `dimension`,
  `hash_key`, `combiner`, `initializer`) that are forwarded to the embedding
  lookup for the sparse string feature named `column_name`.
  """

  def __new__(cls,
              column_name,
              size,
              dimension,
              hash_key,
              combiner="sqrtn",
              initializer=None):
    """Creates the column, supplying a default initializer when none is given.

    Raises:
      ValueError: if `initializer` is provided but is not callable.
    """
    if initializer is None:
      # Default: truncated normal with mean 0 and standard deviation 0.1.
      initializer = init_ops.truncated_normal_initializer(
          mean=0.0, stddev=0.1)
    elif not callable(initializer):
      raise ValueError("initializer must be callable if specified. "
                       "column_name: {}".format(column_name))
    return super(_ScatteredEmbeddingColumn, cls).__new__(
        cls, column_name, size, dimension, hash_key, combiner, initializer)

  @property
  def name(self):
    """Column name: the source feature name plus a fixed suffix."""
    template = "{}_scattered_embedding"
    return template.format(self.column_name)

  @property
  def config(self):
    """Parsing spec: the source feature is read as a var-len string feature."""
    feature_spec = parsing_ops.VarLenFeature(dtypes.string)
    return {self.column_name: feature_spec}

  @property
  def key(self):
    """Returns a string used as the sort key; `initializer` is left out."""
    excluded = ["initializer"]
    return self._key_without_properties(excluded)

  def insert_transformed_feature(self, columns_to_tensors):
    """Registers the raw feature tensor under this column, unchanged."""
    raw_tensor = columns_to_tensors[self.column_name]
    columns_to_tensors[self] = raw_tensor

  def _deep_embedding_lookup_arguments(self, input_tensor):
    """Packs this column's settings into `_DeepEmbeddingLookupArguments`."""
    # No weights, no shared embedding, no max-norm clipping; always trainable.
    return _DeepEmbeddingLookupArguments(
        input_tensor=input_tensor,
        weight_tensor=None,
        vocab_size=self.size,
        dimension=self.dimension,
        combiner=self.combiner,
        initializer=self.initializer,
        hash_key=self.hash_key,
        shared_embedding_name=None,
        max_norm=None,
        trainable=True)

  @property
  def _variable_shape(self):
    """Shape of the dense output for one example: `[dimension]`."""
    return tensor_shape.TensorShape([self.dimension])

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    """Builds the embedding tensor from the transformed input for this column."""
    lookup_arguments = self._deep_embedding_lookup_arguments(inputs.get(self))
    return _embeddings_from_arguments(
        self, lookup_arguments, weight_collections, trainable)

  def _transform_feature(self, inputs):
    """Returns the raw feature tensor; no transformation is applied."""
    return inputs.get(self.column_name)

  @property
  def _parse_example_spec(self):
    """Same parsing spec as `config`."""
    return self.config
1556
1557
def scattered_embedding_column(column_name,
                               size,
                               dimension,
                               hash_key,
                               combiner="mean",
                               initializer=None):
  """Creates an embedding column of a sparse feature using parameter hashing.

  Use this when you want an embedding for a sparse feature but would like each
  embedding dimension of a value to be hashed to its own variable slot.

  Concretely, the i-th embedding component of a value v is read from the
  embedding weight whose index is a fingerprint of the pair (v, i).

  An embedding column over a hashed sparse column such as

      embedding_column(
        sparse_column_with_hash_bucket(column_name, bucket_size),
        dimension)

  can be replaced, with the same number of embedding parameters, by

      scattered_embedding_column(
        column_name,
        size=bucket_size * dimension,
        dimension=dimension,
        hash_key=tf.contrib.layers.SPARSE_FEATURE_CROSS_DEFAULT_HASH_KEY)

  This should hopefully reduce the impact of collisions, at the cost of slower
  training.

  Args:
    column_name: A string defining sparse column name.
    size: An integer specifying the number of parameters in the embedding layer.
    dimension: An integer specifying dimension of the embedding.
    hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
      function to combine the crosses fingerprints on SparseFeatureCrossOp.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently "mean", "sqrtn" and "sum" are supported, with
      "mean" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an
      example-level normalization on the column:
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.truncated_normal_initializer` with mean 0 and standard deviation 0.1.

  Returns:
    A _ScatteredEmbeddingColumn.

  Raises:
    ValueError: if `dimension` or `size` is less than 1; or if `combiner` is
      not supported.

  """
  # `dimension` is checked first, then `size`; either being < 1 is an error.
  if any(value < 1 for value in (dimension, size)):
    size_message = ("Dimension and size must be greater than 0. "
                    "dimension: {}, size: {}, column_name: {}")
    raise ValueError(size_message.format(dimension, size, column_name))

  supported_combiners = ("mean", "sqrtn", "sum")
  if combiner not in supported_combiners:
    combiner_message = ("Combiner must be one of 'mean', 'sqrtn' or 'sum'. "
                        "combiner: {}, column_name: {}")
    raise ValueError(combiner_message.format(combiner, column_name))

  return _ScatteredEmbeddingColumn(
      column_name=column_name,
      size=size,
      dimension=dimension,
      hash_key=hash_key,
      combiner=combiner,
      initializer=initializer)
1629
1630
def _reshape_real_valued_tensor(input_tensor, output_rank, column_name=None):
  """Reshaping logic for dense, numeric `Tensors`.

  Follows the following rules:
    1. If `output_rank > input_rank + 1` raise a `ValueError`.
    2. If `output_rank == input_rank + 1`, expand `input_tensor` by one
       dimension (a warning is logged) and return it.
    3. If `output_rank == input_rank`, return `input_tensor` unchanged.
    4. If `output_rank < input_rank`, or the input rank is statically unknown,
       flatten the inner dimensions of `input_tensor` and return a `Tensor`
       with `output_rank`.

  Args:
    input_tensor: a dense `Tensor` to be reshaped.
    output_rank: the desired rank of the reshaped `Tensor`.
    column_name: (optional) the name of the associated column. Used for error
      messages.
  Returns:
    A `Tensor` with the same entries as `input_tensor` and rank `output_rank`.
  Raises:
    ValueError: if `output_rank > input_rank + 1`.
  """
  input_rank = input_tensor.get_shape().ndims
  if input_rank is not None:
    if output_rank > input_rank + 1:
      error_string = ("Rank of input Tensor ({}) should be the same as "
                      "output_rank ({}). For example, sequence data should "
                      "typically be 3 dimensional (rank 3) while non-sequence "
                      "data is typically 2 dimensional (rank 2).".format(
                          input_rank, output_rank))
      if column_name is not None:
        # The trailing space keeps the column prefix and the rank message from
        # running together into "...column foo.Rank of input...".
        error_string = (
            "Error while processing column {}. ".format(column_name)
            + error_string)
      raise ValueError(error_string)
    if output_rank == input_rank + 1:
      logging.warning(
          "Rank of input Tensor ({}) should be the same as output_rank ({}) "
          "for column. Will attempt to expand dims. It is highly recommended "
          "that you resize your input, as this behavior may change.".format(
              input_rank, output_rank))
      return array_ops.expand_dims(input_tensor, -1, name="expand_dims")
    if output_rank == input_rank:
      return input_tensor
  # Here, either `input_rank` is unknown or it is greater than `output_rank`.
  return layers._inner_flatten(input_tensor, output_rank)  # pylint: disable=protected-access
1675
1676
class _RealValuedVarLenColumn(_FeatureColumn, collections.namedtuple(
    "_RealValuedVarLenColumn",
    ["column_name", "default_value", "dtype", "normalizer", "is_sparse"])):
  """Represents a real valued feature column for variable length Features.

  Instances of this class are immutable.
  With `is_sparse=True` the feature is parsed as a `VarLenFeature` (a
  `SparseTensor`); with `is_sparse=False` it is parsed as a
  `FixedLenSequenceFeature` that allows missing values and fills them with
  `default_value`.
  """

  @property
  def name(self):
    """The column name, identical to the source feature name."""
    return self.column_name

  @property
  def config(self):
    """Parsing spec for the feature, chosen by `is_sparse`."""
    if self.is_sparse:
      feature_spec = parsing_ops.VarLenFeature(self.dtype)
    else:
      feature_spec = parsing_ops.FixedLenSequenceFeature(
          [], self.dtype, allow_missing=True,
          default_value=self.default_value)
    return {self.column_name: feature_spec}

  @property
  def key(self):
    """Returns a string used as the sort key; `normalizer` is left out."""
    excluded = ["normalizer"]
    return self._key_without_properties(excluded)

  @property
  def normalizer_fn(self):
    """Returns the function used to normalize the column."""
    return self.normalizer

  def _normalized_input_tensor(self, input_tensor):
    """Returns the input tensor after custom normalization is applied."""
    normalize = self.normalizer
    if normalize is None:
      return input_tensor
    if not self.is_sparse:
      return normalize(input_tensor)
    # Sparse input: normalize only the values; indices and shape are reused.
    normalized_values = normalize(input_tensor.values)
    return sparse_tensor_py.SparseTensor(
        input_tensor.indices, normalized_values, input_tensor.dense_shape)

  def insert_transformed_feature(self, columns_to_tensors):
    """Normalizes the raw feature, casts it to float32 and stores it.

    Args:
      columns_to_tensors: A mapping from feature columns to tensors. 'string'
        key means a base feature (not-transformed). It can have _FeatureColumn
        as a key too. That means that _FeatureColumn is already transformed.
    """
    raw_tensor = columns_to_tensors[self.name]
    normalized = self._normalized_input_tensor(raw_tensor)
    columns_to_tensors[self] = math_ops.cast(normalized, dtypes.float32)

  # pylint: disable=unused-argument
  def _to_dnn_input_layer(self,
                          input_tensor,
                          weight_collections=None,
                          trainable=True,
                          output_rank=2):
    """Returns the dense tensor reshaped to `output_rank`."""
    dense_tensor = self._to_dense_tensor(input_tensor)
    return _reshape_real_valued_tensor(dense_tensor, output_rank, self.name)

  def _to_dense_tensor(self, input_tensor):
    """Returns `input_tensor` as is; raises `ValueError` for sparse columns."""
    if self.is_sparse:
      raise ValueError("Set is_sparse to False if you want a dense Tensor for "
                       "column_name: {}".format(self.name))
    return input_tensor
1750
1751
@experimental
def _real_valued_var_len_column(column_name,
                                default_value=None,
                                dtype=dtypes.float32,
                                normalizer=None,
                                is_sparse=False):
  """Creates a `_RealValuedVarLenColumn` for variable-length numeric data.

  Note, this is not integrated with any of the DNNEstimators, except the RNN
  ones DynamicRNNEstimator and the StateSavingRNNEstimator.

  The column produces a parsing config either for a SparseTensor (with
  is_sparse=True) or for a padded Tensor (with is_sparse=False).
  The (dense_)shape of the result will be [batch_size, None], which can be used
  with is_sparse=False as input into an RNN (see DynamicRNNEstimator or
  StateSavingRNNEstimator) or with is_sparse=True as input into a tree (see
  gtflow).

  Use real_valued_column if the Feature has a fixed length. Use some
  SparseColumn for columns to be embedded / one-hot-encoded.

  Args:
    column_name: A string defining real valued column name.
    default_value: A scalar value compatible with dtype. Needs to be specified
      if is_sparse=False. It is converted with `int()` for integer dtypes and
      with `float()` for floating dtypes.
    dtype: Defines the type of values. Default value is tf.float32. Needs to be
      convertible to tf.float32.
    normalizer: If not None, a function that can be used to normalize the value
      of the real valued column after default_value is applied for parsing.
      Normalizer function takes the input tensor as its argument, and returns
      the output tensor. (e.g. lambda x: (x - 3.0) / 4.2). Note that for
      is_sparse=True, the normalizer will be run on the values of the
      `SparseTensor`.
    is_sparse: A boolean defining whether to create a SparseTensor or a Tensor.
  Returns:
    A _RealValuedVarLenColumn.
  Raises:
    TypeError: if dtype is neither an integer nor a floating point type.
    ValueError: if default_value is None and is_sparse is False.
    ValueError: if default_value is a list.
  """
  if not dtype.is_integer and not dtype.is_floating:
    raise TypeError("dtype must be convertible to float. "
                    "dtype: {}, column_name: {}".format(dtype, column_name))

  # A padded (dense) Tensor cannot be parsed without a fill value.
  missing_required_default = default_value is None and not is_sparse
  if missing_required_default:
    raise ValueError("default_value must be provided when is_sparse=False to "
                     "parse a padded Tensor. "
                     "column_name: {}".format(column_name))
  if isinstance(default_value, list):
    raise ValueError(
        "Only scalar default value. default_value: {}, column_name: {}".format(
            default_value, column_name))

  # Coerce the scalar default to the Python type matching `dtype`.
  if default_value is None:
    scalar_default = None
  elif dtype.is_integer:
    scalar_default = int(default_value)
  elif dtype.is_floating:
    scalar_default = float(default_value)
  else:
    scalar_default = default_value

  return _RealValuedVarLenColumn(column_name, scalar_default, dtype,
                                 normalizer, is_sparse)
1813
1814
class _RealValuedColumn(
    _FeatureColumn,
    fc_core._DenseColumn,  # pylint: disable=protected-access
    collections.namedtuple(
        "_RealValuedColumn",
        ["column_name", "dimension", "default_value", "dtype", "normalizer"])):
  """Represents a real valued feature column also known as continuous features.

  Instances of this class are immutable. The feature is parsed as a
  `FixedLenFeature` of shape `[dimension]`, so the dictionary returned by
  InputBuilder contains a ("column_name", Tensor) pair with a Tensor shape of
  (batch_size, dimension).
  """

  def __new__(cls, column_name, dimension, default_value,
              dtype, normalizer):
    """Creates the column, storing any `default_value` as a tuple."""
    stored_default = None if default_value is None else tuple(default_value)
    return super(_RealValuedColumn, cls).__new__(
        cls, column_name, dimension, stored_default, dtype, normalizer)

  @property
  def name(self):
    """The column name, identical to the source feature name."""
    return self.column_name

  @property
  def config(self):
    """Parsing spec: a `FixedLenFeature` of shape `[dimension]`."""
    stored_default = self.default_value
    # The default is kept as a tuple; the parsing spec is given a list.
    parse_default = None if stored_default is None else list(stored_default)
    feature_spec = parsing_ops.FixedLenFeature(
        [self.dimension], self.dtype, parse_default)
    return {self.column_name: feature_spec}

  @property
  def key(self):
    """Returns a string used as the sort key; `normalizer` is left out."""
    excluded = ["normalizer"]
    return self._key_without_properties(excluded)

  @property
  def normalizer_fn(self):
    """Returns the function used to normalize the column."""
    return self.normalizer

  def _normalized_input_tensor(self, input_tensor):
    """Returns the input tensor after custom normalization is applied."""
    if self.normalizer is not None:
      return self.normalizer(input_tensor)
    return input_tensor

  def insert_transformed_feature(self, columns_to_tensors):
    """Normalizes the raw feature, casts it to float32 and stores it.

    Args:
      columns_to_tensors: A mapping from feature columns to tensors. 'string'
        key means a base feature (not-transformed). It can have _FeatureColumn
        as a key too. That means that _FeatureColumn is already transformed.
    """
    raw_tensor = columns_to_tensors[self.name]
    normalized = self._normalized_input_tensor(raw_tensor)
    columns_to_tensors[self] = math_ops.cast(normalized, dtypes.float32)

  # pylint: disable=unused-argument
  def _to_dnn_input_layer(self,
                          input_tensor,
                          weight_collections=None,
                          trainable=True,
                          output_rank=2):
    """Returns the tensor as float32, reshaped to `output_rank`."""
    dense_tensor = self._to_dense_tensor(input_tensor)
    if dense_tensor.dtype != dtypes.float32:
      dense_tensor = math_ops.cast(dense_tensor, dtypes.float32)
    return _reshape_real_valued_tensor(dense_tensor, output_rank, self.name)

  def _to_dense_tensor(self, input_tensor):
    """Returns `input_tensor` unchanged."""
    return input_tensor

  @property
  def _variable_shape(self):
    """Shape of the dense output for one example: `[dimension]`."""
    return tensor_shape.TensorShape([self.dimension])

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    """Returns the transformed tensor for this column; creates no variables."""
    del weight_collections, trainable  # Unused.
    return inputs.get(self)

  def _transform_feature(self, inputs):
    """Normalizes the raw feature and casts the result to float32."""
    raw_tensor = inputs.get(self.name)
    normalized = self._normalized_input_tensor(raw_tensor)
    return math_ops.cast(normalized, dtypes.float32)

  @property
  def _parse_example_spec(self):
    """Same parsing spec as `config`."""
    return self.config
1906
1907
def real_valued_column(column_name,
                       dimension=1,
                       default_value=None,
                       dtype=dtypes.float32,
                       normalizer=None):
  """Creates a `_RealValuedColumn` for dense numeric data.

  Args:
    column_name: A string defining real valued column name.
    dimension: An integer specifying dimension of the real valued column.
      The default is 1. `None` is rejected; use `_real_valued_var_len_column`
      for variable length features.
    default_value: A single value compatible with dtype or a list of values
      compatible with dtype which the column takes on during tf.Example parsing
      if data is missing. A default value of None will cause tf.parse_example
      to fail if an example does not contain this column. If a single value is
      provided, the same value will be applied as the default value for every
      dimension. If a list of values is provided, the length of the list should
      be equal to the value of `dimension`. An int (or list of ints) is
      accepted for floating dtypes and converted to float; a float is not
      accepted for integer dtypes.
    dtype: defines the type of values. Default value is tf.float32. Must be a
      non-quantized, real integer or floating point type.
    normalizer: If not None, a function that can be used to normalize the value
      of the real valued column after default_value is applied for parsing.
      Normalizer function takes the input tensor as its argument, and returns
      the output tensor. (e.g. lambda x: (x - 3.0) / 4.2).
  Returns:
    A _RealValuedColumn.
  Raises:
    TypeError: if dimension is not an int
    ValueError: if dimension is not a positive integer
    ValueError: if default_value is a list but its length is not equal to the
      value of `dimension`.
    TypeError: if default_value is not compatible with dtype.
    ValueError: if dtype is not convertible to tf.float32.
  """

  if dimension is None:
    raise TypeError("dimension must be an integer. Use the "
                    "_real_valued_var_len_column for variable length features."
                    " dimension: {}, column_name: {}".format(dimension,
                                                             column_name))
  if not isinstance(dimension, int):
    raise TypeError("dimension must be an integer. "
                    "dimension: {}, column_name: {}".format(dimension,
                                                            column_name))
  if dimension < 1:
    raise ValueError("dimension must be greater than 0. "
                     "dimension: {}, column_name: {}".format(dimension,
                                                             column_name))

  if not (dtype.is_integer or dtype.is_floating):
    raise ValueError("dtype must be convertible to float. "
                     "dtype: {}, column_name: {}".format(dtype, column_name))

  if default_value is None:
    return _RealValuedColumn(column_name, dimension, default_value, dtype,
                             normalizer)

  # From here on `dimension >= 1`, so a scalar default is simply repeated once
  # per dimension.
  if isinstance(default_value, int):
    if dtype.is_integer:
      return _RealValuedColumn(column_name, dimension,
                               [default_value] * dimension, dtype, normalizer)
    if dtype.is_floating:
      return _RealValuedColumn(column_name, dimension,
                               [float(default_value)] * dimension, dtype,
                               normalizer)

  if isinstance(default_value, float):
    # A float default is only valid for floating dtypes; for integer dtypes
    # control falls through to the TypeError at the end.
    if dtype.is_floating and (not dtype.is_integer):
      return _RealValuedColumn(column_name, dimension,
                               [default_value] * dimension, dtype, normalizer)

  if isinstance(default_value, list):
    if len(default_value) != dimension:
      raise ValueError(
          "The length of default_value must be equal to dimension. "
          "default_value: {}, dimension: {}, column_name: {}".format(
              default_value, dimension, column_name))
    # Check if the values in the list are all integers or are convertible to
    # floats.
    is_list_all_int = all(isinstance(v, int) for v in default_value)
    is_list_all_float = all(
        isinstance(v, (float, int)) for v in default_value)
    if is_list_all_int:
      if dtype.is_integer:
        return _RealValuedColumn(column_name, dimension, default_value, dtype,
                                 normalizer)
      elif dtype.is_floating:
        return _RealValuedColumn(column_name, dimension,
                                 [float(v) for v in default_value], dtype,
                                 normalizer)
    if is_list_all_float:
      if dtype.is_floating and (not dtype.is_integer):
        return _RealValuedColumn(column_name, dimension,
                                 [float(v) for v in default_value], dtype,
                                 normalizer)

  # Every accepted (default_value, dtype) combination has returned above.
  raise TypeError("default_value must be compatible with dtype. "
                  "default_value: {}, dtype: {}, column_name: {}".format(
                      default_value, dtype, column_name))
2020
2021
class _BucketizedColumn(
    _FeatureColumn,
    fc_core._CategoricalColumn,  # pylint: disable=protected-access
    fc_core._DenseColumn,  # pylint: disable=protected-access
    collections.namedtuple("_BucketizedColumn",
                           ["source_column", "boundaries"])):
  """Represents a bucketization transformation also known as binning.

  Instances of this class are immutable. Each value of `source_column` is
  replaced by the index of the bucket, defined by `boundaries`, that it falls
  into.
  For example, if the inputs are:
      boundaries = [0, 10, 100]
      source_column = [[-5], [150], [10], [0], [4], [19]]

  then the bucketized feature will be:
      output = [[0], [3], [2], [1], [1], [2]]

  Attributes:
    source_column: A _RealValuedColumn defining dense column.
    boundaries: A list or tuple of floats specifying the boundaries. It has to
      be sorted. [a, b, c] defines following buckets: (-inf., a), [a, b),
      [b, c), [c, inf.)
  Raises:
    TypeError: if `source_column` is not a _RealValuedColumn.
    ValueError: if `source_column` has no dimension, or if 'boundaries' is
      empty, not a list/tuple, or not sorted.
  """

  def __new__(cls, source_column, boundaries):
    """Validates the arguments and stores de-duplicated boundaries."""
    if not isinstance(source_column, _RealValuedColumn):
      raise TypeError("source_column must be an instance of _RealValuedColumn. "
                      "source_column: {}".format(source_column))

    if source_column.dimension is None:
      raise ValueError("source_column must have a defined dimension. "
                       "source_column: {}".format(source_column))

    if not isinstance(boundaries, (list, tuple)) or not boundaries:
      raise ValueError("boundaries must be a non-empty list or tuple. "
                       "boundaries: {}".format(boundaries))

    # Boundaries may be non-decreasing (a[i+1] >= a[i]); runs of equal values
    # are collapsed to a single boundary. Walk adjacent pairs, keeping the
    # left element whenever it is strictly smaller than its right neighbour.
    unique_boundaries = []
    for current, following in zip(boundaries, boundaries[1:]):
      if current == following:
        continue
      if not current < following:
        raise ValueError("boundaries must be a sorted list. "
                         "boundaries: {}".format(boundaries))
      unique_boundaries.append(current)
    # The last boundary has no right neighbour and is always kept.
    unique_boundaries.append(boundaries[-1])

    return super(_BucketizedColumn, cls).__new__(
        cls, source_column, tuple(unique_boundaries))

  @property
  def name(self):
    """Column name: the source column name plus a fixed suffix."""
    template = "{}_bucketized"
    return template.format(self.source_column.name)

  @property
  def length(self):
    """Returns total number of buckets."""
    boundary_count = len(self.boundaries)
    return boundary_count + 1

  @property
  def config(self):
    """Parsing spec, taken from the source column."""
    return self.source_column.config

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return "{}".format(self)

  # pylint: disable=unused-argument
  def _to_dnn_input_layer(self,
                          input_tensor,
                          weight_collections=None,
                          trainable=True,
                          output_rank=2):
    """One-hot encodes the bucket ids and flattens them per example.

    Raises:
      ValueError: if `output_rank` is not 2.
    """
    if output_rank != 2:
      raise ValueError("BucketizedColumn currently only supports output_rank=2")
    bucket_ids = math_ops.cast(input_tensor, dtypes.int64)
    one_hot = array_ops.one_hot(
        bucket_ids, self.length, 1., 0., name="one_hot")
    # One block of `length` one-hot entries per source dimension.
    flat_width = self.length * self.source_column.dimension
    return array_ops.reshape(one_hot, [-1, flat_width], name="reshape")

  def to_sparse_tensor(self, input_tensor):
    """Creates a SparseTensor from the bucketized Tensor."""
    dimension = self.source_column.dimension
    batch_size = array_ops.shape(input_tensor, name="shape")[0]

    if dimension > 1:
      # Row index of every entry: each batch position repeated `dimension`
      # times, then flattened.
      batch_range = math_ops.range(0, batch_size)
      batch_column = array_ops.expand_dims(
          batch_range, 1, name="expand_dims")
      tiled_rows = array_ops.tile(batch_column, [1, dimension], name="tile")
      i1 = array_ops.reshape(tiled_rows, [-1], name="reshape")
      # Column index of every entry: 0..dimension-1 repeated per example.
      dimension_range = math_ops.range(0, dimension)
      i2 = array_ops.tile(dimension_range, [batch_size], name="tile")
      # Flatten the bucket ids and offset them per dimension so that ids are
      # unique across dimensions, e.g. with k buckets the 2nd dimension uses
      # ids k to 2*k-1.
      flat_buckets = array_ops.reshape(input_tensor, [-1], name="reshape")
      bucket_indices = flat_buckets + self.length * i2
    else:
      # With a single dimension every entry sits in column 0.
      i1 = math_ops.range(0, batch_size)
      i2 = array_ops.zeros([batch_size], dtype=dtypes.int32, name="zeros")
      bucket_indices = array_ops.reshape(input_tensor, [-1], name="reshape")

    stacked_indices = array_ops.stack((i1, i2))
    indices = math_ops.cast(
        array_ops.transpose(stacked_indices), dtypes.int64)
    dense_shape = math_ops.cast(
        array_ops.stack([batch_size, dimension]), dtypes.int64)
    return sparse_tensor_py.SparseTensor(indices, bucket_indices, dense_shape)

  def _wide_embedding_lookup_arguments(self, input_tensor):
    """Packs the sparse bucket ids into `_LinearEmbeddingLookupArguments`."""
    sparse_ids = self.to_sparse_tensor(input_tensor)
    total_buckets = self.length * self.source_column.dimension
    return _LinearEmbeddingLookupArguments(
        input_tensor=sparse_ids,
        weight_tensor=None,
        vocab_size=total_buckets,
        initializer=init_ops.zeros_initializer(),
        combiner="sum")

  def _transform_feature(self, inputs):
    """Bucketizes the source column's tensor using `boundaries`."""
    source_tensor = inputs.get(self.source_column)
    return bucketization_op.bucketize(
        source_tensor, boundaries=list(self.boundaries), name="bucketize")

  def insert_transformed_feature(self, columns_to_tensors):
    """Stores the bucketized tensor for this column in `columns_to_tensors`."""
    lazy_inputs = _LazyBuilderByColumnsToTensor(columns_to_tensors)
    columns_to_tensors[self] = self._transform_feature(lazy_inputs)

  @property
  def _parse_example_spec(self):
    """Same parsing spec as `config`."""
    return self.config

  @property
  def _num_buckets(self):
    """Number of distinct ids: buckets per dimension times `dimension`."""
    return self.length * self.source_column.dimension

  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    """Returns the sparse bucket ids paired with no weights."""
    del weight_collections, trainable  # Unused.
    sparse_ids = self.to_sparse_tensor(inputs.get(self))
    return fc_core._CategoricalColumn.IdWeightPair(  # pylint: disable=protected-access
        sparse_ids, None)

  @property
  def _variable_shape(self):
    """Shape of the dense output for one example."""
    flat_width = self.length * self.source_column.dimension
    return tensor_shape.TensorShape([flat_width])

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    """Returns the one-hot dense encoding of the transformed tensor."""
    bucketized = inputs.get(self)
    return self._to_dnn_input_layer(bucketized, weight_collections, trainable)
2192
2193
def bucketized_column(source_column, boundaries):
  """Builds a `_BucketizedColumn` that discretizes a dense column.

  Args:
    source_column: A _RealValuedColumn defining the dense column to bucketize.
    boundaries: A sorted list or tuple of floats giving the bucket boundaries.

  Returns:
    A _BucketizedColumn wrapping `source_column`.

  Raises:
    ValueError: if 'boundaries' is empty or not sorted.
  """
  column = _BucketizedColumn(source_column, boundaries)
  return column
2209
2210
class _CrossedColumn(
    _FeatureColumn,
    fc_core._CategoricalColumn,  # pylint: disable=protected-access
    collections.namedtuple("_CrossedColumn", [
        "columns", "hash_bucket_size", "hash_key", "combiner",
        "ckpt_to_load_from", "tensor_name_in_ckpt"
    ])):
  """Represents a cross transformation also known as conjunction or combination.

  Instances of this class are immutable. It crosses given `columns`. Crossed
  column output will be hashed to hash_bucket_size.
  Conceptually, transformation can be thought as:
    Hash(cartesian product of features in columns) % `hash_bucket_size`

  For example, if the columns are

      SparseTensor referred by first column: shape = [2, 2]
      [0, 0]: "a"
      [1, 0]: "b"
      [1, 1]: "c"

      SparseTensor referred by second column: shape = [2, 1]
      [0, 0]: "d"
      [1, 0]: "e"

  then crossed feature will look like:

      shape = [2, 2]
      [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size
      [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size
      [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size

  Attributes:
    columns: An iterable of _FeatureColumn. Items can be an instance of
      _SparseColumn, _CrossedColumn, or _BucketizedColumn. Stored as a tuple
      sorted by column name.
    hash_bucket_size: An int that is > 1. The number of buckets.
    hash_key: Value forwarded as `hash_key` to `sparse_feature_cross` when the
      cross is computed.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently "mean", "sqrtn" and "sum" are supported, with
      "sum" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an example
      level normalization on the column:
        * "sum": do not normalize
        * "mean": do l1 normalization
        * "sqrtn": do l2 normalization
      For more information: `tf.embedding_lookup_sparse`.
    ckpt_to_load_from: (Optional). String representing checkpoint name/pattern
      to restore the column weights. Required if `tensor_name_in_ckpt` is not
      None.
    tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided
      checkpoint from which to restore the column weights. Required if
      `ckpt_to_load_from` is not None.

  Raises:
    TypeError: if any item in columns is not an instance of _SparseColumn,
      _CrossedColumn, or _BucketizedColumn.
    ValueError: if hash_bucket_size is not > 1 or len(columns) is not > 1. Also,
      if only one of `ckpt_to_load_from` and `tensor_name_in_ckpt` is specified.
  """

  @staticmethod
  def _assert_is_crossable(column):
    """Raises TypeError unless `column` has a type that can be crossed."""
    if isinstance(column, (_SparseColumn, _CrossedColumn, _BucketizedColumn)):
      return
    raise TypeError("columns must be a set of _SparseColumn, "
                    "_CrossedColumn, or _BucketizedColumn instances. "
                    "(column {} is a {})".format(column,
                                                 column.__class__.__name__))

  def __new__(cls,
              columns,
              hash_bucket_size,
              hash_key,
              combiner="sum",
              ckpt_to_load_from=None,
              tensor_name_in_ckpt=None):
    """Validates the arguments and builds the immutable column.

    Args:
      columns: An iterable of crossable columns (see class docstring).
      hash_bucket_size: An int that is > 1.
      hash_key: Stored as-is; forwarded to `sparse_feature_cross` later.
      combiner: Stored as-is; used by `_wide_embedding_lookup_arguments`.
      ckpt_to_load_from: Optional checkpoint name/pattern.
      tensor_name_in_ckpt: Optional tensor name inside that checkpoint.

    Returns:
      A new `_CrossedColumn` whose `columns` are sorted by name.

    Raises:
      TypeError: if any column is not crossable.
      ValueError: if fewer than 2 columns are given, if hash_bucket_size < 2,
        or if only one of the two checkpoint arguments is provided.
    """
    if not hasattr(columns, "__len__"):
      # One-shot iterables (e.g. generators) would be exhausted by the
      # validation loop below and do not support len(); materialize them so
      # that any iterable is accepted, as documented. Sized containers are
      # left untouched so their behavior and error messages are unchanged.
      columns = tuple(columns)

    for column in columns:
      _CrossedColumn._assert_is_crossable(column)

    if len(columns) < 2:
      raise ValueError("columns must contain at least 2 elements. "
                       "columns: {}".format(columns))

    if hash_bucket_size < 2:
      raise ValueError("hash_bucket_size must be at least 2. "
                       "hash_bucket_size: {}".format(hash_bucket_size))

    # The two checkpoint arguments only make sense together.
    if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
      raise ValueError("Must specify both `ckpt_to_load_from` and "
                       "`tensor_name_in_ckpt` or none of them.")

    # Sorting by name makes the stored tuple independent of the input order.
    sorted_columns = sorted(columns, key=lambda column: column.name)
    return super(_CrossedColumn, cls).__new__(cls, tuple(sorted_columns),
                                              hash_bucket_size, hash_key,
                                              combiner,
                                              ckpt_to_load_from,
                                              tensor_name_in_ckpt)

  @property
  def name(self):
    """Returns the sorted names of the crossed columns joined by "_X_"."""
    sorted_names = sorted(column.name for column in self.columns)
    return "_X_".join(sorted_names)

  @property
  def config(self):
    """Returns the merged feature configs of all crossed columns."""
    config = {}
    for column in self.columns:
      config.update(_get_feature_config(column))
    return config

  @property
  def length(self):
    """Returns total number of buckets."""
    return self.hash_bucket_size

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return "{}".format(self)

  def id_tensor(self, input_tensor):
    """Returns the id tensor from the given transformed input_tensor."""
    return input_tensor

  def weight_tensor(self, input_tensor):
    """Returns the weight tensor from the given transformed input_tensor."""
    del input_tensor
    return None

  def _to_dnn_input_layer(self,
                          input_tensor,
                          weight_collections=None,
                          trainable=True,
                          output_rank=2):
    """Always raises: a crossed column cannot be fed to a DNN directly.

    Raises:
      ValueError: unconditionally; all arguments are ignored.
    """
    del input_tensor
    del weight_collections
    del trainable
    del output_rank
    raise ValueError("CrossedColumn is not supported in DNN. "
                     "Please use embedding_column. column: {}".format(self))

  def _checkpoint_path(self):
    """Returns (ckpt_to_load_from, tensor_name_in_ckpt), or None if unset."""
    if self.ckpt_to_load_from is not None:
      return self.ckpt_to_load_from, self.tensor_name_in_ckpt
    return None

  def _wide_embedding_lookup_arguments(self, input_tensor):
    """Builds the linear-model lookup arguments for the crossed ids.

    Args:
      input_tensor: The transformed (crossed) tensor, used as-is.

    Returns:
      A `_LinearEmbeddingLookupArguments` with no weight tensor, a zeros
      initializer, `vocab_size` equal to `self.length` and this column's
      combiner.
    """
    return _LinearEmbeddingLookupArguments(
        input_tensor=input_tensor,
        weight_tensor=None,
        vocab_size=self.length,
        initializer=init_ops.zeros_initializer(),
        combiner=self.combiner)

  def _transform_feature(self, inputs):
    """Handles cross transformation.

    Args:
      inputs: An object whose `get` method returns the tensor for a column or
        for a feature name.

    Returns:
      The output of `sparse_feature_cross` over the tensors of every leaf
      column, hashed into `self.hash_bucket_size` buckets.
    """

    def _collect_leaf_level_columns(cross):
      """Collects base columns contained in the cross."""
      leaf_level_columns = []
      for c in cross.columns:
        if isinstance(c, _CrossedColumn):
          # Nested crosses are flattened into their own leaf columns.
          leaf_level_columns.extend(_collect_leaf_level_columns(c))
        else:
          leaf_level_columns.append(c)
      return leaf_level_columns

    feature_tensors = []
    for c in _collect_leaf_level_columns(self):
      if isinstance(c, _SparseColumn):
        # Sparse columns are looked up by feature name, not by column object.
        feature_tensors.append(inputs.get(c.name))
      elif isinstance(c, _BucketizedColumn):
        feature_tensors.append(c.to_sparse_tensor(inputs.get(c)))
      else:
        feature_tensors.append(inputs.get(c))
    return sparse_feature_cross_op.sparse_feature_cross(
        feature_tensors,
        hashed_output=True,
        num_buckets=self.hash_bucket_size,
        hash_key=self.hash_key,
        name="cross")

  def insert_transformed_feature(self, columns_to_tensors):
    """Computes the crossed tensor and stores it in `columns_to_tensors`.

    Args:
      columns_to_tensors: A mapping that is both the source of input tensors
        (through a `_LazyBuilderByColumnsToTensor` wrapper) and the place the
        result is written, keyed by this column.
    """
    columns_to_tensors[self] = self._transform_feature(
        _LazyBuilderByColumnsToTensor(columns_to_tensors))

  @property
  def _parse_example_spec(self):
    """Returns the parsing spec, which is this column's `config`."""
    return self.config

  @property
  def _num_buckets(self):
    """Returns the number of buckets, i.e. `self.length`."""
    return self.length

  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    """Returns an `IdWeightPair` of this column's tensor and no weights.

    Args:
      inputs: An object whose `get` method returns the tensor for a column.
      weight_collections: Unused.
      trainable: Unused.
    """
    del weight_collections
    del trainable
    return fc_core._CategoricalColumn.IdWeightPair(inputs.get(self), None)  # pylint: disable=protected-access
2412
2413
class _LazyBuilderByColumnsToTensor(object):
  """Looks up tensors in a dict, building column outputs on first request."""

  def __init__(self, columns_to_tensors):
    """Wraps `columns_to_tensors`; the dict is shared, not copied."""
    self._columns_to_tensors = columns_to_tensors

  def get(self, key):
    """Gets the transformed feature column.

    Args:
      key: Either a feature name already present in the wrapped dict, or a
        `_FeatureColumn` whose output is built on demand.

    Returns:
      The value stored in the wrapped dict under `key`.

    Raises:
      ValueError: if `key` is a `str` that is missing from the dict.
      TypeError: if `key` is missing and is not a `_FeatureColumn`.
    """
    cache = self._columns_to_tensors
    if key not in cache:
      if isinstance(key, str):
        # Raw features cannot be derived; they must be supplied up front.
        raise ValueError(
            "features dictionary doesn't contain key ({})".format(key))
      if not isinstance(key, _FeatureColumn):
        raise TypeError('"key" must be either a "str" or "_FeatureColumn". '
                        "Provided: {}".format(key))
      # The column writes its own output into the shared dict.
      key.insert_transformed_feature(cache)
    return cache[key]
2432
2433
def crossed_column(columns, hash_bucket_size, combiner="sum",
                   ckpt_to_load_from=None,
                   tensor_name_in_ckpt=None,
                   hash_key=None):
  """Builds a _CrossedColumn that crosses the given feature columns.

  Args:
    columns: An iterable of _FeatureColumn. Items can be an instance of
      _SparseColumn, _CrossedColumn, or _BucketizedColumn.
    hash_bucket_size: An int that is > 1. The number of buckets.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently "mean", "sqrtn" and "sum" are supported, with
      "sum" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an example
      level normalization on the column:
        * "sum": do not normalize
        * "mean": do l1 normalization
        * "sqrtn": do l2 normalization
      For more information: `tf.embedding_lookup_sparse`.
    ckpt_to_load_from: (Optional). String representing checkpoint name/pattern
      to restore the column weights. Required if `tensor_name_in_ckpt` is not
      None.
    tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided
      checkpoint from which to restore the column weights. Required if
      `ckpt_to_load_from` is not None.
    hash_key: (Optional). The hash_key that will be used by the
      `FingerprintCat64` function to combine the crosses fingerprints on
      SparseFeatureCrossOp.

  Returns:
    A _CrossedColumn.

  Raises:
    TypeError: if any item in columns is not an instance of _SparseColumn,
      _CrossedColumn, or _BucketizedColumn, or
      hash_bucket_size is not an int.
    ValueError: if hash_bucket_size is not > 1 or
      len(columns) is not > 1.
  """
  cross = _CrossedColumn(
      columns=columns,
      hash_bucket_size=hash_bucket_size,
      hash_key=hash_key,
      combiner=combiner,
      ckpt_to_load_from=ckpt_to_load_from,
      tensor_name_in_ckpt=tensor_name_in_ckpt)
  return cross
2480
2481
class DataFrameColumn(_FeatureColumn,
                      collections.namedtuple("DataFrameColumn",
                                             ["column_name", "series"])):
  """Represents a feature column produced from a `DataFrame`.

  Instances of this class are immutable.  A `DataFrame` column may be dense or
  sparse, and may have any shape, with the constraint that dimension 0 is
  batch_size.

  Two instances compare equal when their `column_name` and `series` fields are
  equal, and equal instances hash alike, so instances can be used as dict keys.

  Args:
    column_name: a name for this column
    series: a `Series` to be wrapped, which has already had its base features
      substituted with `PredefinedSeries`.
  """

  def __new__(cls, column_name, series):
    return super(DataFrameColumn, cls).__new__(cls, column_name, series)

  @property
  def name(self):
    """Returns the column name given at construction."""
    return self.column_name

  @property
  def config(self):
    """Returns the base features required by the wrapped series."""
    return self.series.required_base_features()

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return self.name

  def insert_transformed_feature(self, columns_to_tensors):
    """Builds the wrapped series and stores the result under this column.

    Args:
      columns_to_tensors: A dict passed to `series.build` as its cache; the
        built output is additionally stored under the key `self`.
    """
    # The cache must already contain mappings from the expected base feature
    # names to Tensors.

    # Passing columns_to_tensors as the cache here means that multiple outputs
    # of the transform will be cached, keyed by the repr of their associated
    # TransformedSeries.
    # The specific requested output ends up in columns_to_tensors twice: once
    # keyed by the TransformedSeries repr, and once keyed by this
    # DataFrameColumn instance.
    columns_to_tensors[self] = self.series.build(columns_to_tensors)

  # pylint: disable=unused-argument
  def _to_dnn_input_layer(self,
                          input_tensor,
                          weight_collections=None,
                          trainable=True,
                          output_rank=2):
    """Casts the tensor to float32 if needed and reshapes it.

    Args:
      input_tensor: The tensor built for this column.
      weight_collections: Unused.
      trainable: Unused.
      output_rank: Forwarded to `_reshape_real_valued_tensor`.

    Returns:
      The result of `_reshape_real_valued_tensor` on the float32 tensor.
    """
    if input_tensor.dtype != dtypes.float32:
      input_tensor = math_ops.cast(input_tensor, dtypes.float32)
    return _reshape_real_valued_tensor(input_tensor, output_rank, self.name)
  # pylint: enable=unused-argument

  def _to_dense_tensor(self, input_tensor):
    """Returns `_to_dnn_input_layer(input_tensor)` with default arguments."""
    return self._to_dnn_input_layer(input_tensor)

  def __eq__(self, other):
    """Returns True iff `other` is of this class and has equal fields."""
    if isinstance(other, self.__class__):
      # The namedtuple fields live in the tuple itself, not in the instance
      # `__dict__`, so the fields have to be compared as tuples. Converting to
      # plain tuples also avoids re-entering this method.
      return tuple(self) == tuple(other)
    return False

  def __ne__(self, other):
    return not self.__eq__(other)

  def __hash__(self):
    """Hashes the fields so that equal columns hash alike.

    Overriding `__eq__` without `__hash__` makes a class unhashable on
    Python 3, which would break `insert_transformed_feature` (it uses `self`
    as a dict key).
    """
    return hash(tuple(self))
2546
2547
def _get_feature_config(feature_column):
  """Returns configuration for the base feature defined in feature_column.

  Args:
    feature_column: The column whose `config` is requested.

  Returns:
    `feature_column.config`.

  Raises:
    TypeError: if `feature_column` is not a `_FeatureColumn`, or is a
      `_FeatureColumn` of a type that is not supported here.
  """
  if not isinstance(feature_column, _FeatureColumn):
    raise TypeError(
        "feature_columns should only contain instances of _FeatureColumn. "
        "Given column is {}".format(feature_column))

  supported_types = (
      _SparseColumn, _WeightedSparseColumn, _EmbeddingColumn,
      _RealValuedColumn, _RealValuedVarLenColumn, _BucketizedColumn,
      _CrossedColumn, _OneHotColumn, _ScatteredEmbeddingColumn)
  if not isinstance(feature_column, supported_types):
    raise TypeError("Not supported _FeatureColumn type. "
                    "Given column is {}".format(feature_column))

  return feature_column.config
2563
2564
def create_feature_spec_for_parsing(feature_columns):
  """Builds the parsing feature spec for the given feature columns.

  The returned feature config can be used as arg 'features' in tf.parse_example.

  Typical usage example:

  ```python
  # Define features and transformations
  feature_a = sparse_column_with_vocabulary_file(...)
  feature_b = real_valued_column(...)
  feature_c_bucketized = bucketized_column(real_valued_column("feature_c"), ...)
  feature_a_x_feature_c = crossed_column(
    columns=[feature_a, feature_c_bucketized], ...)

  feature_columns = set(
    [feature_b, feature_c_bucketized, feature_a_x_feature_c])
  batch_examples = tf.parse_example(
      serialized=serialized_examples,
      features=create_feature_spec_for_parsing(feature_columns))
  ```

  For the above example, create_feature_spec_for_parsing would return the dict:
  {
    "feature_a": parsing_ops.VarLenFeature(tf.string),
    "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
    "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
  }

  Args:
    feature_columns: An iterable containing all the feature columns. All items
      should be instances of classes derived from _FeatureColumn, unless
      feature_columns is a dict -- in which case, this should be true of all
      values in the dict.
  Returns:
    A dict mapping feature keys to FixedLenFeature or VarLenFeature values.
  """
  # For a dict only the values are columns; the keys are ignored.
  columns = (feature_columns.values()
             if isinstance(feature_columns, dict) else feature_columns)

  merged_spec = {}
  for feature_column in columns:
    # Later columns overwrite earlier entries that share a feature key.
    merged_spec.update(_get_feature_config(feature_column))
  return merged_spec
2609
2610
def _create_sequence_feature_spec_for_parsing(sequence_feature_columns,
                                              allow_missing_by_default=False):
  """Prepares a feature spec for parsing `tf.SequenceExample`s.

  `VarLenFeature` entries are kept unchanged. `FixedLenFeature` and
  `FixedLenSequenceFeature` entries are rebuilt as `FixedLenSequenceFeature`
  with the same shape and dtype; a default value, if present, is dropped with
  a warning and replaced by `allow_missing=True`.

  Args:
    sequence_feature_columns: an iterable containing all the feature columns.
      All items should be instances of classes derived from `_FeatureColumn`.
    allow_missing_by_default: whether to set `allow_missing=True` by default for
      `FixedLenSequenceFeature`s.
  Returns:
    A dict mapping feature keys to `FixedLenSequenceFeature` or `VarLenFeature`.
  Raises:
    TypeError: if the parsing spec contains any other kind of feature.
  """
  parsing_spec = create_feature_spec_for_parsing(sequence_feature_columns)
  result = {}
  for key, feature in parsing_spec.items():
    if isinstance(feature, parsing_ops.VarLenFeature):
      result[key] = feature
      continue

    fixed_len_types = (parsing_ops.FixedLenFeature,
                       parsing_ops.FixedLenSequenceFeature)
    if not isinstance(feature, fixed_len_types):
      raise TypeError(
          "Unsupported feature type: {}".format(type(feature).__name__))

    has_default = feature.default_value is not None
    if has_default:
      logging.warning(
          'Found default value {} for feature "{}". Ignoring this value and '
          'setting `allow_missing=True` instead.'.
          format(feature.default_value, key))
    result[key] = parsing_ops.FixedLenSequenceFeature(
        shape=feature.shape,
        dtype=feature.dtype,
        allow_missing=(allow_missing_by_default or has_default))
  return result
2645
2646
def make_place_holder_tensors_for_base_features(feature_columns):
  """Creates one placeholder per base feature of the given columns.

  Args:
    feature_columns: An iterable containing all the feature columns. All items
      should be instances of classes derived from _FeatureColumn.
  Returns:
    A dict mapping feature keys to SparseTensors (sparse columns) or
    placeholder Tensors (dense columns).
  """
  # Maps each base feature key to its FixedLenFeature or VarLenFeature.
  parsing_spec = create_feature_spec_for_parsing(feature_columns)

  result = {}
  for feature_key, feature in parsing_spec.items():
    if isinstance(feature, parsing_ops.VarLenFeature):
      # Sparse features get a sparse placeholder.
      tensor = array_ops.sparse_placeholder(
          feature.dtype, name="Placeholder_{}".format(feature_key))
    else:
      # Dense features get a 2-D placeholder: an unknown leading dimension
      # followed by the first entry of the feature's shape.
      tensor = array_ops.placeholder(
          feature.dtype,
          shape=(None, feature.shape[0]),
          name="Placeholder_{}".format(feature_key))
    result[feature_key] = tensor
  return result
2672
2673
class _SparseIdLookupConfig(
    collections.namedtuple("_SparseIdLookupConfig",
                           ["vocabulary_file", "keys", "num_oov_buckets",
                            "vocab_size", "default_value"])):
  """Immutable lookup-table configuration for a sparse feature.

  The configuration is used by tf.feature_to_id_v2.

  Attributes:
    vocabulary_file: The vocabulary filename. vocabulary_file cannot be combined
      with keys.
    keys: A 1-D string iterable that specifies the mapping of strings to
      indices. A feature found in keys maps to its index in keys.
    num_oov_buckets: The number of out-of-vocabulary buckets. If zero all out of
      vocabulary features will be ignored.
    vocab_size: Number of the elements in the vocabulary.
    default_value: The value to use for out-of-vocabulary feature values.
      Defaults to -1.
  """

  def __new__(cls,
              vocabulary_file=None,
              keys=None,
              num_oov_buckets=0,
              vocab_size=None,
              default_value=-1):
    """Creates the config; every field is optional and has a default."""
    return super(_SparseIdLookupConfig, cls).__new__(
        cls,
        vocabulary_file=vocabulary_file,
        keys=keys,
        num_oov_buckets=num_oov_buckets,
        vocab_size=vocab_size,
        default_value=default_value)
2705