1# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Utilities for text input preprocessing.
16"""
17# pylint: disable=invalid-name
18from __future__ import absolute_import
19from __future__ import division
20from __future__ import print_function
21
22from keras_preprocessing import text
23
24from tensorflow.python.keras.preprocessing.text_dataset import text_dataset_from_directory  # pylint: disable=unused-import
25from tensorflow.python.util.tf_export import keras_export
26
# Module-level aliases for the keras_preprocessing implementations; both are
# registered under the `keras.preprocessing.text` API names further down.
Tokenizer = text.Tokenizer
hashing_trick = text.hashing_trick
29
30
@keras_export('keras.preprocessing.text.text_to_word_sequence')
def text_to_word_sequence(input_text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True,
                          split=' '):
  """Splits a text into a list of words (or tokens).

  Thin wrapper that forwards every argument unchanged to
  `keras_preprocessing.text.text_to_word_sequence`. Characters listed in
  `filters` (punctuation by default) are ignored when forming the words.

  >>> sample_text = 'This is a sample sentence.'
  >>> tf.keras.preprocessing.text.text_to_word_sequence(sample_text)
  ['this', 'is', 'a', 'sample', 'sentence']

  Args:
      input_text: Input text (string).
      filters: list (or concatenation) of characters to filter out, such as
          punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'``,
          which includes basic punctuation, tabs, and newlines.
      lower: boolean. Whether to convert the input to lowercase.
      split: str. Separator for word splitting.

  Returns:
      A list of words (or tokens).
  """
  tokenize = text.text_to_word_sequence
  return tokenize(input_text, filters=filters, lower=lower, split=split)
58
59
@keras_export('keras.preprocessing.text.one_hot')
def one_hot(input_text,
            n,
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
            lower=True,
            split=' '):
  r"""Encodes a text as a list of word indexes for a vocabulary of size `n`.

  Thin wrapper that forwards every argument unchanged to
  `keras_preprocessing.text.one_hot`. It takes a string of text and returns
  a list of integers, each corresponding to a word (or token) of the input
  string.

  Args:
      input_text: Input text (string).
      n: int. Size of vocabulary.
      filters: list (or concatenation) of characters to filter out, such as
        punctuation. Default:
        ```
        '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n'
        ```
        which includes basic punctuation, tabs, and newlines.
      lower: boolean. Whether to set the text to lowercase.
      split: str. Separator for word splitting.

  Returns:
      List of integers in `[1, n]`. Each integer encodes a word; uniqueness
      is not guaranteed, so distinct words may share the same integer.
  """
  encode = text.one_hot
  return encode(input_text, n, filters=filters, lower=lower, split=split)
89
90
# `text.tokenizer_from_json` is only available if keras_preprocessing >= 1.1.0,
# so it is aliased and exported only when the installed version provides it.
try:
  tokenizer_from_json = text.tokenizer_from_json
except AttributeError:
  # Older keras_preprocessing: leave the symbol undefined and unexported.
  pass
else:
  # Registered outside the `try` body so that an AttributeError raised by the
  # export machinery itself is not mistaken for a missing symbol and hidden.
  keras_export('keras.preprocessing.text.tokenizer_from_json')(
      tokenizer_from_json)

# Register the module-level aliases under their public Keras API names.
keras_export('keras.preprocessing.text.hashing_trick')(hashing_trick)
keras_export('keras.preprocessing.text.Tokenizer')(Tokenizer)
101