1# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Utilities related to FeatureColumn."""
16
17from __future__ import absolute_import
18from __future__ import division
19from __future__ import print_function
20
21import functools
22
23from tensorflow.contrib.framework.python.framework import experimental
24from tensorflow.contrib.framework.python.ops import variables as contrib_variables
25from tensorflow.contrib.layers.python.layers import embedding_ops
26from tensorflow.contrib.layers.python.layers import feature_column as fc
27from tensorflow.contrib.layers.python.layers import layers
28from tensorflow.python.framework import dtypes
29from tensorflow.python.framework import ops
30from tensorflow.python.framework import sparse_tensor as sparse_tensor_py
31from tensorflow.python.ops import array_ops
32from tensorflow.python.ops import init_ops
33from tensorflow.python.ops import math_ops
34from tensorflow.python.ops import nn_ops
35from tensorflow.python.ops import parsing_ops
36from tensorflow.python.ops import sparse_ops
37from tensorflow.python.ops import variable_scope
38from tensorflow.python.platform import tf_logging as logging
39from tensorflow.python.util import nest
40
41
42def _maybe_reshape_input_tensor(tensor, column_name, output_rank):
43  """Reshape the input tensor by the following rule.
44
45  1. If `output_rank > input_rank + 1`, raise a `ValueError`.
46  2. If `output_rank == input_rank + 1`, expand the tensor by one dimension.
47  3. If `output_rank == input_rank`, do nothing.
48  4. If `output_rank < input_rank`, flatten the inner dimensions of the tensor.
49
50  Args:
51    tensor: A Tensor or SparseTensor to be reshaped.
52    column_name: A string name of the feature column for the tensor.
53    output_rank: the desired rank of the tensor.
54  Returns:
55    A reshaped Tensor or SparseTensor.
56  Raises:
57    ValueError: if `output_rank > input_rank + 1` for the input tensor.
58  """
59  input_rank = tensor.get_shape().ndims
60
61  if input_rank is None and isinstance(tensor, sparse_tensor_py.SparseTensor):
62    # Try to get the rank of a sparse tensor by its dense_shape's shape.
63    input_rank = tensor.dense_shape.get_shape().as_list()[0]
64
65  if input_rank is None:
66    raise ValueError('Error while processing column {}. Rank of input Tensor '
67                     'can not be None.'.format(column_name))
68
69  if output_rank > input_rank + 1:
70    raise ValueError('Error while processing column {}. Rank of input Tensor '
71                     '({}) should be the same as output_rank ({}). For '
72                     'example, sequence data should typically be 3 '
73                     'dimensional (rank 3) while non-sequence data is '
74                     'typically 2 dimensional (rank 2).'.format(
75                         column_name, input_rank, output_rank))
76  elif output_rank == input_rank + 1:
77    # Expand the tensor's shape by 1 dimension.
78    if isinstance(tensor, sparse_tensor_py.SparseTensor):
79      output_shape = array_ops.concat([tensor.dense_shape, [1]], 0)
80      return sparse_ops.sparse_reshape(tensor, output_shape)
81    else:
82      reshaped = array_ops.expand_dims(tensor, -1)
83      # Try to calculate the new shape.
84      static_shape = tensor.get_shape()
85      if static_shape is not None and static_shape.dims is not None:
86        reshaped.set_shape(static_shape.as_list() + [1])
87      return reshaped
88  elif output_rank < input_rank:
89    return layers._inner_flatten(tensor, output_rank)  # pylint: disable=protected-access
90  else:
91    return tensor
92
93
def _input_from_feature_columns(columns_to_tensors,
                                feature_columns,
                                weight_collections,
                                trainable,
                                scope,
                                output_rank,
                                default_name,
                                cols_to_outs=None):
  """Shared implementation of `input_from(_sequence)_feature_columns`.

  Every distinct column in `feature_columns` is visited in order of its `key`.
  The column is transformed, turned into a layer input (via its deep embedding
  lookup arguments, or via `_to_dnn_input_layer` when the column raises
  `NotImplementedError` for those), and all resulting tensors are concatenated
  along axis `output_rank - 1`.

  Args:
    columns_to_tensors: A mapping whose values are tensors; a shallow copy is
      taken so the caller's mapping is not the one handed to the transformer.
    feature_columns: An iterable of feature columns (a dict is rejected by
      `check_feature_columns`).
    weight_collections: Collections for the created weights. When non-empty,
      `GraphKeys.GLOBAL_VARIABLES` is added to it.
    trainable: Forwarded to the per-column layer builders.
    scope: Optional scope for variable_scope.
    output_rank: Rank of the produced tensor. For rank 3 the transformed
      tensors are first passed through `_maybe_reshape_input_tensor`.
    default_name: Scope name used when `scope` is `None`.
    cols_to_outs: Optional dict that is filled with column -> output tensor.

  Returns:
    The concatenation of all per-column output tensors.

  Raises:
    ValueError: if `cols_to_outs` is neither `None` nor a dict, or if a column
      can not be converted into an input layer.
  """
  columns_to_tensors = columns_to_tensors.copy()
  check_feature_columns(feature_columns)
  if not (cols_to_outs is None or isinstance(cols_to_outs, dict)):
    raise ValueError('cols_to_outs must be a dict unless None')

  with variable_scope.variable_scope(
      scope, default_name=default_name, values=columns_to_tensors.values()):
    per_column_outputs = []
    transformer = _Transformer(columns_to_tensors)
    if weight_collections:
      weight_collections = list(
          set(list(weight_collections) + [ops.GraphKeys.GLOBAL_VARIABLES]))

    for column in sorted(set(feature_columns), key=lambda col: col.key):
      with variable_scope.variable_scope(
          None, default_name=column.name, values=columns_to_tensors.values()):
        transformed = transformer.transform(column)
        if output_rank == 3:
          reshape_fn = functools.partial(
              _maybe_reshape_input_tensor,
              column_name=column.name,
              output_rank=output_rank)
          transformed = nest.map_structure(reshape_fn, transformed)

        # pylint: disable=protected-access
        try:
          lookup_args = column._deep_embedding_lookup_arguments(transformed)
          column_output = fc._embeddings_from_arguments(
              column,
              lookup_args,
              weight_collections,
              trainable,
              output_rank=output_rank)
        except NotImplementedError as lookup_error:
          # No embedding lookup available; let the column build its own input.
          try:
            column_output = column._to_dnn_input_layer(
                transformed,
                weight_collections,
                trainable,
                output_rank=output_rank)
          except ValueError as layer_error:
            raise ValueError('Error creating input layer for column: {}.\n'
                             '{}, {}'.format(column.name, layer_error,
                                             lookup_error))
        # pylint: enable=protected-access

        per_column_outputs.append(column_output)
        if cols_to_outs is not None:
          cols_to_outs[column] = column_output
    return array_ops.concat(per_column_outputs, output_rank - 1)
153
154
def input_from_feature_columns(columns_to_tensors,
                               feature_columns,
                               weight_collections=None,
                               trainable=True,
                               scope=None,
                               cols_to_outs=None):
  """Builds a rank-2 input layer out of FeatureColumns.

  Training examples are usually described column by column. Before the first
  hidden layer of a model can consume them, that column oriented data has to be
  merged into one tensor, and every kind of column calls for its own
  conversion (sparse features are handled very differently from continuous
  ones, for instance). This function performs that conversion.

  Example:

  ```python
    # Building model for training
    columns_to_tensor = tf.parse_example(...)
    first_layer = input_from_feature_columns(
        columns_to_tensors=columns_to_tensor,
        feature_columns=feature_columns)
    second_layer = fully_connected(inputs=first_layer, ...)
    ...
  ```

  with `feature_columns` defined along these lines:

  ```python
    sparse_feature = sparse_column_with_hash_bucket(
        column_name="sparse_col", ...)
    sparse_feature_emb = embedding_column(sparse_id_column=sparse_feature, ...)
    real_valued_feature = real_valued_column(...)
    real_valued_buckets = bucketized_column(
        source_column=real_valued_feature, ...)

    feature_columns=[sparse_feature_emb, real_valued_buckets]
  ```

  Args:
    columns_to_tensors: A mapping from feature column to tensors. A 'string'
      key denotes a base (not yet transformed) feature. A FeatureColumn key
      denotes a feature that the input pipeline has already transformed.
    feature_columns: A set containing all the feature columns. All items in the
      set should be instances of classes derived by FeatureColumn.
    weight_collections: List of graph collections to which weights are added.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for variable_scope.
    cols_to_outs: Optional dict which, when given, is populated with a mapping
      from each feature column to the output tensor that is concatenated into
      the returned tensor.

  Returns:
    A Tensor which can be consumed by hidden layers in the neural network.

  Raises:
    ValueError: if FeatureColumn cannot be consumed by a neural network, or if
      `cols_to_outs` is neither `None` nor a dict.
  """
  return _input_from_feature_columns(
      columns_to_tensors=columns_to_tensors,
      feature_columns=feature_columns,
      weight_collections=weight_collections,
      trainable=trainable,
      scope=scope,
      output_rank=2,
      default_name='input_from_feature_columns',
      cols_to_outs=cols_to_outs)
222
223
@experimental
def sequence_input_from_feature_columns(columns_to_tensors,
                                        feature_columns,
                                        weight_collections=None,
                                        trainable=True,
                                        scope=None):
  """Builds a rank-3 input for sequence models from `FeatureColumn`s.

  This is the sequence counterpart of `input_from_feature_columns`; see its
  documentation for the general idea. Before anything is built, the columns
  are validated by `_check_supported_sequence_columns` and
  `_check_forbidden_sequence_columns`.

  The following types of `FeatureColumn` are permitted in `feature_columns`:
  `_OneHotColumn`, `_EmbeddingColumn`, `_ScatteredEmbeddingColumn`,
  `_RealValuedColumn`, `_DataFrameColumn`. In addition, columns in
  `feature_columns` may not be constructed using any of the following:
  `ScatteredEmbeddingColumn`, `BucketizedColumn`, `CrossedColumn`.

  Args:
    columns_to_tensors: A mapping from feature column to tensors. A 'string'
      key denotes a base (not yet transformed) feature. A FeatureColumn key
      denotes a feature that the input pipeline has already transformed.
    feature_columns: A set containing all the feature columns. All items in the
      set should be instances of classes derived by FeatureColumn.
    weight_collections: List of graph collections to which weights are added.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for variable_scope.

  Returns:
    A Tensor which can be consumed by hidden layers in the neural network.

  Raises:
    ValueError: if FeatureColumn cannot be consumed by a neural network.
  """
  # Validate first so that nothing is built for an unusable column set.
  _check_supported_sequence_columns(feature_columns)
  _check_forbidden_sequence_columns(feature_columns)

  return _input_from_feature_columns(
      columns_to_tensors=columns_to_tensors,
      feature_columns=feature_columns,
      weight_collections=weight_collections,
      trainable=trainable,
      scope=scope,
      output_rank=3,
      default_name='sequence_input_from_feature_columns')
268
269
def _create_embedding_lookup(column,
                             columns_to_tensors,
                             embedding_lookup_arguments,
                             num_outputs,
                             trainable,
                             weight_collections):
  """Creates the linear weights of one column and looks up its predictions.

  A `weights` model variable of shape `[vocab_size, num_outputs]` is created
  inside a variable scope named after the column, and is then used for a sparse
  embedding lookup.

  Args:
   column: the column we're working on.
   columns_to_tensors: a map from column name to tensors; its values are passed
     to the variable scope.
   embedding_lookup_arguments: arguments for embedding lookup (`vocab_size`,
     `initializer`, `input_tensor`, `weight_tensor` and `combiner` are read).
   num_outputs: how many outputs.
   trainable: whether the variable we create is trainable.
   weight_collections: weights will be placed here.

  Returns:
  variables: the created embeddings, always as a list.
  predictions: the computed predictions.
  """
  lookup = embedding_lookup_arguments
  with variable_scope.variable_scope(
      None, default_name=column.name, values=columns_to_tensors.values()):
    weights = contrib_variables.model_variable(
        name='weights',
        shape=[lookup.vocab_size, num_outputs],
        dtype=dtypes.float32,
        initializer=lookup.initializer,
        trainable=trainable,
        collections=weight_collections)
    # pylint: disable=protected-access
    # Normalize to a list: a plain variable is wrapped, anything else is asked
    # for its underlying variable list.
    weight_list = ([weights] if fc._is_variable(weights) else
                   weights._get_variable_list())
    # pylint: enable=protected-access
    predictions = embedding_ops.safe_embedding_lookup_sparse(
        weight_list,
        lookup.input_tensor,
        sparse_weights=lookup.weight_tensor,
        combiner=lookup.combiner,
        name=column.name + '_weights')
    return weight_list, predictions
310
311
312def _create_joint_embedding_lookup(columns_to_tensors,
313                                   embedding_lookup_arguments,
314                                   num_outputs,
315                                   trainable,
316                                   weight_collections):
317  """Creates an embedding lookup for all columns sharing a single weight."""
318  for arg in embedding_lookup_arguments:
319    assert arg.weight_tensor is None, (
320        'Joint sums for weighted sparse columns are not supported. '
321        'Please use weighted_sum_from_feature_columns instead.')
322    assert arg.combiner == 'sum', (
323        'Combiners other than sum are not supported for joint sums. '
324        'Please use weighted_sum_from_feature_columns instead.')
325  assert len(embedding_lookup_arguments) >= 1, (
326      'At least one column must be in the model.')
327  prev_size = 0
328  sparse_tensors = []
329  for a in embedding_lookup_arguments:
330    t = a.input_tensor
331    values = t.values + prev_size
332    prev_size += a.vocab_size
333    sparse_tensors.append(
334        sparse_tensor_py.SparseTensor(t.indices,
335                                      values,
336                                      t.dense_shape))
337  sparse_tensor = sparse_ops.sparse_concat(1, sparse_tensors)
338  with variable_scope.variable_scope(
339      None, default_name='linear_weights', values=columns_to_tensors.values()):
340    variable = contrib_variables.model_variable(
341        name='weights',
342        shape=[prev_size, num_outputs],
343        dtype=dtypes.float32,
344        initializer=init_ops.zeros_initializer(),
345        trainable=trainable,
346        collections=weight_collections)
347    if fc._is_variable(variable):  # pylint: disable=protected-access
348      variable = [variable]
349    else:
350      variable = variable._get_variable_list()  # pylint: disable=protected-access
351    predictions = embedding_ops.safe_embedding_lookup_sparse(
352        variable,
353        sparse_tensor,
354        sparse_weights=None,
355        combiner='sum',
356        name='_weights')
357    return variable, predictions
358
359
def joint_weighted_sum_from_feature_columns(columns_to_tensors,
                                            feature_columns,
                                            num_outputs,
                                            weight_collections=None,
                                            trainable=True,
                                            scope=None):
  """A restricted linear prediction builder based on FeatureColumns.

  Provided that every feature column is an unweighted sparse column, this
  computes the prediction of a linear model whose weights all live in one
  shared variable.

  Args:
    columns_to_tensors: A mapping from feature column to tensors. A 'string'
      key denotes a base (not yet transformed) feature. A FeatureColumn key
      denotes a feature that the input pipeline has already transformed. For
      example, `inflow` may have handled transformations.
    feature_columns: A set containing all the feature columns. All items in the
      set should be instances of classes derived from FeatureColumn.
    num_outputs: An integer specifying number of outputs (required).
    weight_collections: List of graph collections to which weights are added.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for variable_scope.

  Returns:
    A tuple containing:

    * A Tensor which represents predictions of a linear model.
    * A list of Variables storing the weights.
    * A Variable which is used for bias.

  Raises:
    ValueError: if FeatureColumn cannot be used for linear predictions.
    NotImplementedError: if a column provides no wide embedding lookup
      arguments (reported as an unsupported real-valued column).

  """
  columns_to_tensors = columns_to_tensors.copy()
  check_feature_columns(feature_columns)
  with variable_scope.variable_scope(
      scope,
      default_name='joint_weighted_sum_from_feature_columns',
      values=columns_to_tensors.values()):
    transformer = _Transformer(columns_to_tensors)
    lookup_args_per_column = []
    for column in sorted(set(feature_columns), key=lambda col: col.key):
      transformed = transformer.transform(column)
      try:
        lookup_args = column._wide_embedding_lookup_arguments(transformed)  # pylint: disable=protected-access
      except NotImplementedError:
        raise NotImplementedError('Real-valued columns are not supported. '
                                  'Use weighted_sum_from_feature_columns '
                                  'instead, or bucketize these columns.')
      lookup_args_per_column.append(lookup_args)

    weights, predictions_no_bias = _create_joint_embedding_lookup(
        columns_to_tensors,
        lookup_args_per_column,
        num_outputs,
        trainable,
        weight_collections)
    bias = contrib_variables.model_variable(
        'bias_weight',
        shape=[num_outputs],
        initializer=init_ops.zeros_initializer(),
        trainable=trainable,
        collections=_add_variable_collection(weight_collections))
    _log_variable(bias)

    return nn_ops.bias_add(predictions_no_bias, bias), weights, bias
429
430
def weighted_sum_from_feature_columns(columns_to_tensors,
                                      feature_columns,
                                      num_outputs,
                                      weight_collections=None,
                                      trainable=True,
                                      scope=None):
  """A tf.contrib.layers style linear prediction builder based on FeatureColumn.

  Training examples are usually described column by column. For each of the
  `num_outputs` outputs this function produces a weighted sum over all
  columns, plus a bias. In classification problems that weighted sum is the
  logits. In linear regression it is the prediction itself.

  Example:

    ```
    # Building model for training
    feature_columns = (
        real_valued_column("my_feature1"),
        ...
    )
    columns_to_tensor = tf.parse_example(...)
    logits = weighted_sum_from_feature_columns(
        columns_to_tensors=columns_to_tensor,
        feature_columns=feature_columns,
        num_outputs=1)
    loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels,
                                                   logits=logits)
    ```

  Args:
    columns_to_tensors: A mapping from feature column to tensors. A 'string'
      key denotes a base (not yet transformed) feature. A FeatureColumn key
      denotes a feature that the input pipeline has already transformed. For
      example, `inflow` may have handled transformations.
    feature_columns: A set containing all the feature columns. All items in the
      set should be instances of classes derived from FeatureColumn.
    num_outputs: An integer specifying number of outputs (required).
    weight_collections: List of graph collections to which weights are added.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for variable_scope.

  Returns:
    A tuple containing:

      * A Tensor which represents predictions of a linear model.
      * A dictionary which maps feature_column to corresponding Variable.
      * A Variable which is used for bias.

  Raises:
    ValueError: if FeatureColumn cannot be used for linear predictions.
  """
  columns_to_tensors = columns_to_tensors.copy()
  check_feature_columns(feature_columns)
  with variable_scope.variable_scope(
      scope,
      default_name='weighted_sum_from_feature_columns',
      values=columns_to_tensors.values()):
    per_column_predictions = []
    column_to_variable = {}
    transformer = _Transformer(columns_to_tensors)
    # pylint: disable=protected-access
    for column in sorted(set(feature_columns), key=lambda col: col.key):
      transformed = transformer.transform(column)
      try:
        # Preferred path: the column is looked up like a sparse embedding.
        lookup_args = column._wide_embedding_lookup_arguments(transformed)
        variable, predictions = _create_embedding_lookup(
            column,
            columns_to_tensors,
            lookup_args,
            num_outputs,
            trainable,
            weight_collections)
      except NotImplementedError:
        # Fallback: multiply the dense representation with a weight matrix.
        with variable_scope.variable_scope(
            None,
            default_name=column.name,
            values=columns_to_tensors.values()):
          dense = column._to_dense_tensor(transformed)
          dense = _maybe_reshape_input_tensor(
              dense, column.name, output_rank=2)
          weight = contrib_variables.model_variable(
              name='weight',
              shape=[dense.get_shape()[1], num_outputs],
              initializer=init_ops.zeros_initializer(),
              trainable=trainable,
              collections=weight_collections)
          variable = [weight]
          predictions = math_ops.matmul(dense, weight, name='matmul')
      except ValueError as lookup_error:
        raise ValueError('Error creating weighted sum for column: {}.\n'
                         '{}'.format(column.name, lookup_error))
      per_column_predictions.append(
          array_ops.reshape(predictions, shape=(-1, num_outputs)))
      column_to_variable[column] = variable
      _log_variable(variable)
      fc._maybe_restore_from_checkpoint(column._checkpoint_path(), variable)
    # pylint: enable=protected-access
    predictions_no_bias = math_ops.add_n(per_column_predictions)
    bias = contrib_variables.model_variable(
        'bias_weight',
        shape=[num_outputs],
        initializer=init_ops.zeros_initializer(),
        trainable=trainable,
        collections=_add_variable_collection(weight_collections))
    _log_variable(bias)

    return nn_ops.bias_add(predictions_no_bias, bias), column_to_variable, bias
543
544
def parse_feature_columns_from_examples(serialized,
                                        feature_columns,
                                        name=None,
                                        example_names=None):
  """Parses tf.Examples to extract tensors for given feature_columns.

  This wraps 'tf.parse_example': the parsing spec is derived from
  `feature_columns`, and afterwards every distinct column is run through the
  transformer in order of its `key`.

  Example:

  ```python
  columns_to_tensor = parse_feature_columns_from_examples(
      serialized=my_data,
      feature_columns=my_features)

  # Where my_features are:
  # Define features and transformations
  sparse_feature_a = sparse_column_with_keys(
      column_name="sparse_feature_a", keys=["AB", "CD", ...])

  embedding_feature_a = embedding_column(
      sparse_id_column=sparse_feature_a, dimension=3, combiner="sum")

  sparse_feature_b = sparse_column_with_hash_bucket(
      column_name="sparse_feature_b", hash_bucket_size=1000)

  embedding_feature_b = embedding_column(
      sparse_id_column=sparse_feature_b, dimension=16, combiner="sum")

  crossed_feature_a_x_b = crossed_column(
      columns=[sparse_feature_a, sparse_feature_b], hash_bucket_size=10000)

  real_feature = real_valued_column("real_feature")
  real_feature_buckets = bucketized_column(
      source_column=real_feature, boundaries=[...])

  my_features = [embedding_feature_b, real_feature_buckets, embedding_feature_a]
  ```

  Args:
    serialized: A vector (1-D Tensor) of strings, a batch of binary
      serialized `Example` protos.
    feature_columns: An iterable containing all the feature columns. All items
      should be instances of classes derived from _FeatureColumn.
    name: A name for this operation (optional).
    example_names: A vector (1-D Tensor) of strings (optional), the names of
      the serialized protos in the batch.

  Returns:
    A `dict` mapping FeatureColumn to `Tensor` and `SparseTensor` values.
  """
  check_feature_columns(feature_columns)
  parsing_spec = fc.create_feature_spec_for_parsing(feature_columns)
  columns_to_tensors = parsing_ops.parse_example(
      serialized=serialized,
      features=parsing_spec,
      name=name,
      example_names=example_names)

  # The transformer is handed the parsed dict itself, which is also what gets
  # returned below.
  transformer = _Transformer(columns_to_tensors)
  for column in sorted(set(feature_columns), key=lambda col: col.key):
    transformer.transform(column)
  return columns_to_tensors
607
608
def transform_features(features, feature_columns):
  """Returns transformed features based on features columns passed in.

  A shallow copy of `features` is transformed for every distinct column in
  `feature_columns`. Entries whose key is not one of `feature_columns` are then
  dropped from that copy before it is returned.

  Example:

  ```python
  columns_to_tensor = transform_features(features=features,
                                         feature_columns=feature_columns)

  # Where feature_columns are:
  # Define features and transformations
  sparse_feature_a = sparse_column_with_keys(
      column_name="sparse_feature_a", keys=["AB", "CD", ...])

  embedding_feature_a = embedding_column(
      sparse_id_column=sparse_feature_a, dimension=3, combiner="sum")

  sparse_feature_b = sparse_column_with_hash_bucket(
      column_name="sparse_feature_b", hash_bucket_size=1000)

  embedding_feature_b = embedding_column(
      sparse_id_column=sparse_feature_b, dimension=16, combiner="sum")

  crossed_feature_a_x_b = crossed_column(
      columns=[sparse_feature_a, sparse_feature_b], hash_bucket_size=10000)

  real_feature = real_valued_column("real_feature")
  real_feature_buckets = bucketized_column(
      source_column=real_feature, boundaries=[...])

  feature_columns = [embedding_feature_b,
                     real_feature_buckets,
                     embedding_feature_a]
  ```

  Args:
    features: A dictionary of features.
    feature_columns: An iterable containing all the feature columns. All items
      should be instances of classes derived from _FeatureColumn.

  Returns:
    A `dict` mapping FeatureColumn to `Tensor` and `SparseTensor` values.
  """
  columns_to_tensor = features.copy()
  check_feature_columns(feature_columns)
  transformer = _Transformer(columns_to_tensor)
  for column in sorted(set(feature_columns), key=lambda col: col.key):
    transformer.transform(column)

  # Decide what to drop before mutating, so the dict is never changed while it
  # is being iterated.
  unrequested_keys = [
      key for key in columns_to_tensor.keys() if key not in feature_columns
  ]
  for key in unrequested_keys:
    columns_to_tensor.pop(key)
  return columns_to_tensor
662
663
def parse_feature_columns_from_sequence_examples(
    serialized,
    context_feature_columns,
    sequence_feature_columns,
    name=None,
    example_name=None):
  """Parses tf.SequenceExamples to extract tensors for given `FeatureColumn`s.

  `serialized` is reshaped to a scalar first, because exactly one sequence
  example is parsed per call. Batching has to happen after parsing.

  Args:
    serialized: A scalar (0-D Tensor) of type string, a single serialized
      `SequenceExample` proto.
    context_feature_columns: An iterable containing the feature columns for
      context features. All items should be instances of classes derived from
      `_FeatureColumn`. Can be `None`, which is treated as no columns.
    sequence_feature_columns: An iterable containing the feature columns for
      sequence features. All items should be instances of classes derived from
      `_FeatureColumn`. Can be `None`, which is treated as no columns.
    name: A name for this operation (optional).
    example_name: A scalar (0-D Tensor) of type string (optional), the names of
      the serialized proto.

  Returns:
    A tuple consisting of (context_features, sequence_features)

    *  context_features: a dict mapping `FeatureColumns` from
        `context_feature_columns` to their parsed `Tensors`/`SparseTensor`s.
    *  sequence_features: a dict mapping `FeatureColumns` from
        `sequence_feature_columns` to their parsed `Tensors`/`SparseTensor`s.

  Raises:
    ValueError: if `serialized` can not be reshaped to a scalar.
  """
  # Sequence example parsing requires a single (scalar) example.
  try:
    serialized = array_ops.reshape(serialized, [])
  except ValueError as e:
    raise ValueError(
        'serialized must contain as single sequence example. Batching must be '
        'done after parsing for sequence examples. Error: {}'.format(e))

  context_feature_columns = (
      [] if context_feature_columns is None else context_feature_columns)
  sequence_feature_columns = (
      [] if sequence_feature_columns is None else sequence_feature_columns)

  check_feature_columns(context_feature_columns)
  context_spec = fc.create_feature_spec_for_parsing(context_feature_columns)

  check_feature_columns(sequence_feature_columns)
  # pylint: disable=protected-access
  sequence_spec = fc._create_sequence_feature_spec_for_parsing(
      sequence_feature_columns, allow_missing_by_default=False)
  # pylint: enable=protected-access

  return parsing_ops.parse_single_sequence_example(
      serialized, context_spec, sequence_spec, example_name, name)
719
720
def _log_variable(variable):
  """Logs the name and device of created variable(s).

  Args:
    variable: Either a single object or a `list` of objects. Each candidate is
      logged only if `fc._is_variable` accepts it; anything else is skipped
      silently.
  """
  candidates = variable if isinstance(variable, list) else [variable]
  for var in candidates:
    # Test the individual element. Testing the enclosing list instead means no
    # element of a list argument is ever recognised, so nothing gets logged.
    if fc._is_variable(var):  # pylint: disable=protected-access
      logging.info('Created variable %s, with device=%s', var.name,
                   var.device)
730
731
def _infer_real_valued_column_for_tensor(name, tensor):
  """Creates a real_valued_column for given tensor and name.

  The column dimension is the product of every static dimension of `tensor`
  except the first one (presumably the batch dimension -- confirm with
  callers). A tensor of rank 0 or 1 therefore yields a dimension of 1.

  Args:
    name: Name given to the created column.
    tensor: The dense tensor to inspect. Its dtype becomes the column dtype.

  Returns:
    The result of `fc.real_valued_column` for `name`, the inferred dimension
    and `tensor.dtype`.

  Raises:
    ValueError: If `tensor` is a `SparseTensor`, or if its dtype is neither an
      integer nor a floating point type.
  """
  if isinstance(tensor, sparse_tensor_py.SparseTensor):
    # The template must be formatted explicitly: extra positional arguments to
    # an exception are stored as-is and never substituted into the message.
    raise ValueError(
        'SparseTensor is not supported for auto detection. Please define '
        'corresponding FeatureColumn for tensor {} {}.'.format(name, tensor))

  if not (tensor.dtype.is_integer or tensor.dtype.is_floating):
    raise ValueError(
        'Non integer or non floating types are not supported for auto detection'
        '. Please define corresponding FeatureColumn for tensor {} {}.'.format(
            name, tensor))

  # NOTE(review): assumes every dimension after the first is statically known;
  # an unknown (`None`) entry would make the multiplication below fail --
  # confirm that callers guarantee this.
  dimension = 1
  for size in tensor.get_shape().as_list()[1:]:
    dimension *= size
  return fc.real_valued_column(name, dimension=dimension, dtype=tensor.dtype)
750
751
def infer_real_valued_columns(features):
  """Builds real-valued columns for the given feature tensor(s).

  Args:
    features: Either a `dict` whose keys are column names and whose values are
      the corresponding tensors, or a single tensor.

  Returns:
    A list of columns produced by `_infer_real_valued_column_for_tensor`: one
    per `dict` entry, in the iteration order of the `dict`, or a single column
    with an empty name when `features` is not a `dict`.
  """
  if isinstance(features, dict):
    return [
        _infer_real_valued_column_for_tensor(column_name, tensor)
        for column_name, tensor in features.items()
    ]
  # A bare tensor has no key to borrow a name from, so the column is unnamed.
  return [_infer_real_valued_column_for_tensor('', features)]
761
762
def check_feature_columns(feature_columns):
  """Validates a collection of FeatureColumns.

  Args:
    feature_columns: An iterable of objects exposing `key` and `name`
      attributes (instances or subclasses of FeatureColumn).

  Raises:
    ValueError: If `feature_columns` is a dict.
    ValueError: If two columns share the same `key`.
  """
  if isinstance(feature_columns, dict):
    raise ValueError('Expected feature_columns to be iterable, found dict.')

  duplicate_message = (
      'Duplicate feature column key found for column: {}. '
      'This usually means that the column is almost identical '
      'to another column, and one must be discarded.')
  observed_keys = set()
  for column in feature_columns:
    column_key = column.key
    if column_key not in observed_keys:
      observed_keys.add(column_key)
      continue
    # The key was already registered by an earlier column.
    raise ValueError(duplicate_message.format(column.name))
784
785
class _Transformer(object):
  """Applies FeatureColumn transformations on demand and remembers the results.

  A FeatureColumn describes how an input column is fed to the network, and
  some columns need their data transformed first. This class triggers that
  transformation the first time a column is requested.

  The same column may be needed in several places, e.g. a bucketized feature
  used both directly and inside a cross. Every transformed column is kept in
  the mapping supplied at construction, so a repeated request returns the
  stored tensor instead of transforming the column a second time.

  Example:

  ```python
    sparse_feature = sparse_column_with_hash_bucket(...)
    real_valued_feature = real_valued_column(...)
    real_valued_buckets = bucketized_column(source_column=real_valued_feature,
                                            ...)
    sparse_x_real = crossed_column(
        columns=[sparse_feature, real_valued_buckets], hash_bucket_size=10000)

    columns_to_tensor = tf.parse_example(...)
    transformer = _Transformer(columns_to_tensor)

    sparse_x_real_tensor = transformer.transform(sparse_x_real)
    sparse_tensor = transformer.transform(sparse_feature)
    real_buckets_tensor = transformer.transform(real_valued_buckets)
  ```
  """

  def __init__(self, columns_to_tensors):
    """Creates a transformer backed by `columns_to_tensors`.

    Args:
      columns_to_tensors: A mapping from feature columns to tensors. A string
        key denotes a base (not yet transformed) feature. A FeatureColumn key
        denotes a column that has already been transformed, for instance by
        the input pipeline. The mapping is kept by reference and is handed to
        the columns being transformed, so newly transformed features end up
        in it.
    """
    self._columns_to_tensors = columns_to_tensors

  def transform(self, feature_column):
    """Returns the Tensor that represents `feature_column`.

    Args:
      feature_column: An instance of FeatureColumn.

    Returns:
      The tensor stored for `feature_column`: either one that was already
      present in the mapping or one inserted by the column itself during this
      call.

    Raises:
      ValueError: if the column did not register itself in the mapping, i.e.
        it cannot be handled by this Transformer.
    """
    logging.debug('Transforming feature_column %s', feature_column)
    known = self._columns_to_tensors
    if feature_column not in known:
      # Not seen before: let the column add its transformed tensor.
      feature_column.insert_transformed_feature(known)
      if feature_column not in known:
        raise ValueError(
            'Column {} is not supported.'.format(feature_column.name))
    return known[feature_column]
855
856
def _add_variable_collection(weight_collections):
  """Ensures the global variables collection is part of `weight_collections`.

  Args:
    weight_collections: An iterable of collection names, or a falsy value
      (e.g. `None` or an empty list).

  Returns:
    `weight_collections` itself when it is falsy. Otherwise a new list holding
    the distinct entries of `weight_collections` plus
    `ops.GraphKeys.GLOBAL_VARIABLES`; the list is built from a set, so its
    order is not tied to the input order.
  """
  if not weight_collections:
    return weight_collections
  with_global = list(weight_collections)
  with_global.append(ops.GraphKeys.GLOBAL_VARIABLES)
  return list(set(with_global))
862
863
# TODO(jamieas): remove the following logic once all FeatureColumn types are
# supported for sequences.
# pylint: disable=protected-access

# Column types accepted for sequence data; every column passed to
# `_check_supported_sequence_columns` must be an instance of one of these.
_SUPPORTED_SEQUENCE_COLUMNS = (
    fc._OneHotColumn,
    fc._EmbeddingColumn,
    fc._RealValuedColumn,
    fc._RealValuedVarLenColumn,
)

# Column types rejected for sequence data. `_check_forbidden_sequence_columns`
# looks for these among the given columns and all of their ancestors.
_FORBIDDEN_SEQUENCE_COLUMNS = (
    fc._ScatteredEmbeddingColumn,
    fc._BucketizedColumn,
    fc._CrossedColumn,
)
875
876
def _check_supported_sequence_columns(feature_columns):
  """Verifies every column is an instance of `_SUPPORTED_SEQUENCE_COLUMNS`.

  Args:
    feature_columns: An iterable of FeatureColumn instances.

  Raises:
    ValueError: On the first column whose type is not one of
      `_SUPPORTED_SEQUENCE_COLUMNS`.
  """
  for column in feature_columns:
    if isinstance(column, _SUPPORTED_SEQUENCE_COLUMNS):
      continue
    unsupported_type = type(column).__name__
    raise ValueError(
        'FeatureColumn type {} is not currently supported for sequence data.'
        .format(unsupported_type))
884
885
def _get_parent_columns(feature_column):
  """Returns the tuple of `FeatureColumn`s that `feature_column` depends on.

  Args:
    feature_column: The column whose direct parents are requested.

  Returns:
    A tuple with the wrapped `sparse_id_column` for weighted sparse, one-hot
    and embedding columns, the `source_column` for bucketized columns, all
    `columns` of a crossed column, and an empty tuple for any other column.
  """
  sparse_id_wrappers = (
      fc._WeightedSparseColumn,
      fc._OneHotColumn,
      fc._EmbeddingColumn,
  )
  if isinstance(feature_column, sparse_id_wrappers):
    parents = (feature_column.sparse_id_column,)
  elif isinstance(feature_column, fc._BucketizedColumn):
    parents = (feature_column.source_column,)
  elif isinstance(feature_column, fc._CrossedColumn):
    parents = tuple(feature_column.columns)
  else:
    parents = ()
  return parents
897
898
def _gather_feature_columns(feature_columns):
  """Collects `feature_columns` together with all of their ancestors.

  Args:
    feature_columns: An iterable of FeatureColumn instances.

  Returns:
    A list that starts with the given columns, in their original order,
    followed by every ancestor found through `_get_parent_columns` in
    breadth-first order. An ancestor already in the list is not added again.
  """
  collected = list(feature_columns)
  # Columns whose parents still need to be looked up. This is a separate list
  # so that growing `collected` does not disturb the iteration.
  frontier = list(collected)
  while frontier:
    discovered = []
    for column in frontier:
      for parent in _get_parent_columns(column):
        if parent not in collected:
          collected.append(parent)
          discovered.append(parent)
    frontier = discovered
  return collected
909
910
def _check_forbidden_sequence_columns(feature_columns):
  """Rejects `_FORBIDDEN_SEQUENCE_COLUMNS` among the columns and ancestors.

  Args:
    feature_columns: An iterable of FeatureColumn instances.

  Raises:
    ValueError: On the first column, out of `feature_columns` and everything
      gathered by `_gather_feature_columns`, that is an instance of one of
      `_FORBIDDEN_SEQUENCE_COLUMNS`.
  """
  for column in _gather_feature_columns(feature_columns):
    if not isinstance(column, _FORBIDDEN_SEQUENCE_COLUMNS):
      continue
    raise ValueError(
        'Column {} is of type {}, which is not currently supported for '
        'sequences.'.format(column.name, type(column).__name__))
920