1# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""This API defines FeatureColumn for sequential input.
16
17NOTE: This API is a work in progress and will likely be changing frequently.
18"""
19
20from __future__ import absolute_import
21from __future__ import division
22from __future__ import print_function
23
24
25import collections
26
27
28from tensorflow.python.feature_column import feature_column_v2 as fc
29from tensorflow.python.feature_column import utils as fc_utils
30from tensorflow.python.framework import dtypes
31from tensorflow.python.framework import ops
32from tensorflow.python.framework import tensor_shape
33from tensorflow.python.ops import array_ops
34from tensorflow.python.ops import check_ops
35from tensorflow.python.ops import parsing_ops
36from tensorflow.python.ops import sparse_ops
37from tensorflow.python.util.tf_export import tf_export
38
39
40# pylint: disable=protected-access
def concatenate_context_input(context_input, sequence_input):
  """Appends `context_input` to every timestep of `sequence_input`.

  A time axis is inserted into `context_input` at dimension 1 and the result is
  tiled `padded_length` times along that axis, where `padded_length` is
  dimension 1 of `sequence_input`. The tiled context is then concatenated onto
  `sequence_input` along dimension 2.

  Args:
    context_input: A `Tensor` of dtype `float32` and shape `[batch_size, d1]`.
    sequence_input: A `Tensor` of dtype `float32` and shape `[batch_size,
      padded_length, d0]`.

  Returns:
    A `Tensor` of dtype `float32` and shape `[batch_size, padded_length,
    d0 + d1]`.

  Raises:
    ValueError: If `sequence_input` does not have rank 3 or `context_input` does
      not have rank 2.
  """
  # Rank and dtype checks for both inputs, built in a fixed order: the
  # sequence checks first, then the context checks.
  input_checks = [
      check_ops.assert_rank(
          sequence_input,
          3,
          message='sequence_input must have rank 3',
          data=[array_ops.shape(sequence_input)]),
      check_ops.assert_type(
          sequence_input,
          dtypes.float32,
          message='sequence_input must have dtype float32; got {}.'.format(
              sequence_input.dtype)),
      check_ops.assert_rank(
          context_input,
          2,
          message='context_input must have rank 2',
          data=[array_ops.shape(context_input)]),
      check_ops.assert_type(
          context_input,
          dtypes.float32,
          message='context_input must have dtype float32; got {}.'.format(
              context_input.dtype)),
  ]
  # The tiling ops are created under the checks so that they only run once the
  # checks have passed.
  with ops.control_dependencies(input_checks):
    num_timesteps = array_ops.shape(sequence_input)[1]
    # [batch_size, d1] -> [batch_size, 1, d1].
    context_with_time_axis = array_ops.expand_dims(context_input, 1)
    # Repeat only along the inserted time axis.
    tile_multiples = array_ops.concat([[1], [num_timesteps], [1]], 0)
    context_per_timestep = array_ops.tile(
        context_with_time_axis, tile_multiples)
  return array_ops.concat([sequence_input, context_per_timestep], 2)
88
89
@tf_export('feature_column.sequence_categorical_column_with_identity')
def sequence_categorical_column_with_identity(
    key, num_buckets, default_value=None):
  """Builds a `SequenceCategoricalColumn` for sequences of integers.

  The returned column wraps `categorical_column_with_identity`. Feed it to
  `embedding_column` or `indicator_column` to turn sequence categorical data
  into a dense representation suitable for a sequence NN, such as an RNN.

  Example:

  ```python
  watches = sequence_categorical_column_with_identity(
      'watches', num_buckets=1000)
  watches_embedding = embedding_column(watches, dimension=10)
  columns = [watches_embedding]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    num_buckets: Range of inputs; inputs are expected to lie in
      `[0, num_buckets)`.
    default_value: Replacement for out-of-range inputs. Must itself be in
      `[0, num_buckets)`. If `None`, this column's graph operations fail on
      out-of-range inputs.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: if `num_buckets` is less than one.
    ValueError: if `default_value` is not in range `[0, num_buckets)`.
  """
  identity_column = fc.categorical_column_with_identity(
      key=key,
      num_buckets=num_buckets,
      default_value=default_value)
  return fc.SequenceCategoricalColumn(identity_column)
137
138
@tf_export('feature_column.sequence_categorical_column_with_hash_bucket')
def sequence_categorical_column_with_hash_bucket(
    key, hash_bucket_size, dtype=dtypes.string):
  """Builds a `SequenceCategoricalColumn` whose ids are set by hashing.

  The returned column wraps `categorical_column_with_hash_bucket`. Feed it to
  `embedding_column` or `indicator_column` to turn sequence categorical data
  into a dense representation suitable for a sequence NN, such as an RNN.

  Example:

  ```python
  tokens = sequence_categorical_column_with_hash_bucket(
      'tokens', hash_bucket_size=1000)
  tokens_embedding = embedding_column(tokens, dimension=10)
  columns = [tokens_embedding]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    hash_bucket_size: An int > 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: `hash_bucket_size` is not greater than 1.
    ValueError: `dtype` is neither string nor integer.
  """
  hashed_column = fc.categorical_column_with_hash_bucket(
      key=key,
      hash_bucket_size=hash_bucket_size,
      dtype=dtype)
  return fc.SequenceCategoricalColumn(hashed_column)
183
184
@tf_export('feature_column.sequence_categorical_column_with_vocabulary_file')
def sequence_categorical_column_with_vocabulary_file(
    key, vocabulary_file, vocabulary_size=None, num_oov_buckets=0,
    default_value=None, dtype=dtypes.string):
  """Builds a `SequenceCategoricalColumn` whose ids come from a vocabulary file.

  The returned column wraps `categorical_column_with_vocabulary_file`. Feed it
  to `embedding_column` or `indicator_column` to turn sequence categorical data
  into a dense representation suitable for a sequence NN, such as an RNN.

  Example:

  ```python
  states = sequence_categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
      num_oov_buckets=5)
  states_embedding = embedding_column(states, dimension=10)
  columns = [states_embedding]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    vocabulary_file: The vocabulary file name.
    vocabulary_size: Number of elements in the vocabulary. Must be no greater
      than the length of `vocabulary_file`; if it is smaller, later values are
      ignored. If `None`, it is set to the length of `vocabulary_file`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. Every out-of-vocabulary input is assigned an ID in the range
      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
      the input value. A positive `num_oov_buckets` can not be combined with
      `default_value`.
    default_value: The integer ID value to return for out-of-vocabulary feature
      values, defaults to `-1`. This can not be combined with a positive
      `num_oov_buckets`.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: `vocabulary_file` is missing or cannot be opened.
    ValueError: `vocabulary_size` is missing or < 1.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: `dtype` is neither string nor integer.
  """
  vocabulary_column = fc.categorical_column_with_vocabulary_file(
      key=key,
      vocabulary_file=vocabulary_file,
      vocabulary_size=vocabulary_size,
      num_oov_buckets=num_oov_buckets,
      default_value=default_value,
      dtype=dtype)
  return fc.SequenceCategoricalColumn(vocabulary_column)
248
249
@tf_export('feature_column.sequence_categorical_column_with_vocabulary_list')
def sequence_categorical_column_with_vocabulary_list(
    key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
  """Builds a `SequenceCategoricalColumn` whose ids come from an in-memory list.

  The returned column wraps `categorical_column_with_vocabulary_list`. Feed it
  to `embedding_column` or `indicator_column` to turn sequence categorical data
  into a dense representation suitable for a sequence NN, such as an RNN.

  Example:

  ```python
  colors = sequence_categorical_column_with_vocabulary_list(
      key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
      num_oov_buckets=2)
  colors_embedding = embedding_column(colors, dimension=3)
  columns = [colors_embedding]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    vocabulary_list: An ordered iterable defining the vocabulary. Each feature
      is mapped to the index of its value (if present) in `vocabulary_list`.
      Must be castable to `dtype`.
    dtype: The type of features. Only string and integer types are supported.
      If `None`, it will be inferred from `vocabulary_list`.
    default_value: The integer ID value to return for out-of-vocabulary feature
      values, defaults to `-1`. This can not be combined with a positive
      `num_oov_buckets`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. Every out-of-vocabulary input is assigned an ID in the range
      `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on a
      hash of the input value. A positive `num_oov_buckets` can not be combined
      with `default_value`.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: if `dtype` is not integer or string.
  """
  vocabulary_column = fc.categorical_column_with_vocabulary_list(
      key=key,
      vocabulary_list=vocabulary_list,
      dtype=dtype,
      default_value=default_value,
      num_oov_buckets=num_oov_buckets)
  return fc.SequenceCategoricalColumn(vocabulary_column)
310
311
@tf_export('feature_column.sequence_numeric_column')
def sequence_numeric_column(
    key,
    shape=(1,),
    default_value=0.,
    dtype=dtypes.float32,
    normalizer_fn=None):
  """Returns a feature column that represents sequences of numeric data.

  Example:

  ```python
  temperature = sequence_numeric_column('temperature')
  columns = [temperature]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input features.
    shape: The shape of the input data per sequence id. E.g. if `shape=(2,)`,
      each example must contain `2 * sequence_length` values.
    default_value: A single value compatible with `dtype` that is used for
      padding the sparse data into a dense `Tensor`.
    dtype: The type of values. Either a `DType` or any value that
      `dtypes.as_dtype` can convert to one (e.g. `'float32'`); the column
      always stores the resulting `DType`.
    normalizer_fn: If not `None`, a function that can be used to normalize the
      value of the tensor after `default_value` is applied for parsing.
      Normalizer function takes the input `Tensor` as its argument, and returns
      the output `Tensor`. (e.g. lambda x: (x - 3.0) / 4.2). Please note that
      even though the most common use case of this function is normalization, it
      can be used for any kind of Tensorflow transformations.

  Returns:
    A `SequenceNumericColumn`.

  Raises:
    TypeError: if any dimension in shape is not an int.
    TypeError: if `dtype` cannot be converted to a `DType`.
    TypeError: if `normalizer_fn` is given but is not callable.
    ValueError: if any dimension in shape is not a positive integer.
    ValueError: if `dtype` is not convertible to `tf.float32`.
  """
  shape = fc._check_shape(shape=shape, key=key)
  # Normalize first so that string / numpy dtypes are accepted instead of
  # failing with an AttributeError on `.is_integer` below. A `DType` argument
  # passes through unchanged.
  dtype = dtypes.as_dtype(dtype)
  if not (dtype.is_integer or dtype.is_floating):
    raise ValueError('dtype must be convertible to float. '
                     'dtype: {}, key: {}'.format(dtype, key))
  if normalizer_fn is not None and not callable(normalizer_fn):
    raise TypeError(
        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))

  return SequenceNumericColumn(
      key,
      shape=shape,
      default_value=default_value,
      dtype=dtype,
      normalizer_fn=normalizer_fn)
373
374
def _assert_all_equal_and_return(tensors, name=None):
  """Returns the first of `tensors`, guarded by equality checks on the rest.

  Args:
    tensors: A non-empty sequence of tensors.
    name: Optional name scope for the created ops; defaults to
      'assert_all_equal'.

  Returns:
    `tensors[0]` itself when it is the only element; otherwise an identity of
    `tensors[0]` that depends on an `assert_equal` against every other element.
  """
  with ops.name_scope(name, 'assert_all_equal', values=tensors):
    reference = tensors[0]
    if len(tensors) == 1:
      # Nothing to compare against.
      return reference
    equality_checks = [
        check_ops.assert_equal(reference, other) for other in tensors[1:]
    ]
    with ops.control_dependencies(equality_checks):
      return array_ops.identity(reference)
385
386
class SequenceNumericColumn(
    fc.SequenceDenseColumn,
    collections.namedtuple(
        'SequenceNumericColumn',
        ('key', 'shape', 'default_value', 'dtype', 'normalizer_fn'))):
  """Represents sequences of numeric data.

  Attributes:
    key: Feature key; also used as the column name.
    shape: Per-timestep shape of the values.
    default_value: Value used to pad the sparse input when it is densified.
    dtype: `DType` of the values, used to build the parsing spec.
    normalizer_fn: Optional callable applied to the raw input tensor.
  """

  @property
  def _is_v2_column(self):
    return True

  @property
  def name(self):
    """See `FeatureColumn` base class."""
    return self.key

  @property
  def parse_example_spec(self):
    """See `FeatureColumn` base class."""
    var_len_feature = parsing_ops.VarLenFeature(self.dtype)
    return {self.key: var_len_feature}

  def transform_feature(self, transformation_cache, state_manager):
    """See `FeatureColumn` base class.

    Fetches the raw input for `key` and, when a `normalizer_fn` is set, passes
    it through that function.

    Args:
      transformation_cache: A `FeatureTransformationCache` object to access
        features.
      state_manager: A `StateManager` to create / access resources such as
        lookup tables.

    Returns:
      The input tensor, normalized if `normalizer_fn` is not `None`.
    """
    raw_input = transformation_cache.get(self.key, state_manager)
    if self.normalizer_fn is None:
      return raw_input
    return self.normalizer_fn(raw_input)

  @property
  def variable_shape(self):
    """Returns a `TensorShape` representing the shape of sequence input."""
    return tensor_shape.TensorShape(self.shape)

  def get_sequence_dense_tensor(self, transformation_cache, state_manager):
    """Returns a `TensorSequenceLengthPair`.

    The transformed sparse input is padded with `default_value`, reshaped to
    `[batch_size, T, variable_shape]`, and paired with the per-example sequence
    length.

    Args:
      transformation_cache: A `FeatureTransformationCache` object to access
        features.
      state_manager: A `StateManager` to create / access resources such as
        lookup tables.
    """
    sparse_input = transformation_cache.get(self, state_manager)
    padded = sparse_ops.sparse_tensor_to_dense(
        sparse_input, default_value=self.default_value)
    # Keep the batch dimension, let the time dimension be inferred, and append
    # the per-timestep shape: [batch_size, T, variable_shape].
    batch_dim = array_ops.shape(padded)[:1]
    target_shape = array_ops.concat(
        [batch_dim, [-1], self.variable_shape], axis=0)
    dense_tensor = array_ops.reshape(padded, shape=target_shape)

    # Number of raw values that make up one timestep. For 2D input the raw
    # values are grouped according to the element count of `variable_shape`;
    # for 3D input the grouping happens in the third dimension, so the
    # sequence length is not affected.
    input_is_2d = sparse_input.shape.ndims == 2
    values_per_step = (
        self.variable_shape.num_elements() if input_is_2d else 1)
    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
        sparse_input, num_elements=values_per_step)

    return fc.SequenceDenseColumn.TensorSequenceLengthPair(
        dense_tensor=dense_tensor, sequence_length=sequence_length)

  @property
  def parents(self):
    """See `FeatureColumn` base class."""
    return [self.key]

  def get_config(self):
    """See `FeatureColumn` base class."""
    config = {field: value for field, value in zip(self._fields, self)}
    # Store the dtype by name; `from_config` converts it back.
    config['dtype'] = self.dtype.name
    return config

  @classmethod
  def from_config(cls, config, custom_objects=None, columns_by_name=None):
    """See `FeatureColumn` base class.

    `custom_objects` and `columns_by_name` are accepted but not used here.
    """
    fc._check_config_keys(config, cls._fields)
    init_kwargs = fc._standardize_and_copy_config(config)
    # Inverse of `get_config`, which stores the dtype by name.
    init_kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
    return cls(**init_kwargs)
482
483
484# pylint: enable=protected-access
485