# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for text input preprocessing.

Thin wrappers around `keras_preprocessing.text` that are registered under the
`keras.preprocessing.text` API names via `keras_export`.
"""
# pylint: disable=invalid-name
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from keras_preprocessing import text

from tensorflow.python.keras.preprocessing.text_dataset import text_dataset_from_directory  # pylint: disable=unused-import
from tensorflow.python.util.tf_export import keras_export

# Re-exported unchanged from `keras_preprocessing.text`; both are registered
# with `keras_export` at the bottom of this module.
hashing_trick = text.hashing_trick
Tokenizer = text.Tokenizer


@keras_export('keras.preprocessing.text.text_to_word_sequence')
def text_to_word_sequence(input_text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True,
                          split=' '):
  """Converts a text to a sequence of words (or tokens).

  This function transforms a string of text into a list of words
  while ignoring `filters` which include punctuations by default.

  >>> sample_text = 'This is a sample sentence.'
  >>> tf.keras.preprocessing.text.text_to_word_sequence(sample_text)
  ['this', 'is', 'a', 'sample', 'sentence']

  Args:
      input_text: Input text (string).
      filters: list (or concatenation) of characters to filter out, such as
          punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'``,
            includes basic punctuation, tabs, and newlines.
      lower: boolean. Whether to convert the input to lowercase.
      split: str. Separator for word splitting.

  Returns:
      A list of words (or tokens).
  """
  # Delegates to keras_preprocessing; arguments are forwarded unchanged.
  return text.text_to_word_sequence(
      input_text, filters=filters, lower=lower, split=split)


@keras_export('keras.preprocessing.text.one_hot')
def one_hot(input_text,
            n,
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
            lower=True,
            split=' '):
  r"""One-hot encodes a text into a list of word indexes of size `n`.

  This function receives as input a string of text and returns a
  list of encoded integers each corresponding to a word (or token)
  in the given input string.

  Args:
      input_text: Input text (string).
      n: int. Size of vocabulary.
      filters: list (or concatenation) of characters to filter out, such as
        punctuation. Default:
        ```
        '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n'
        ```
        which includes basic punctuation, tabs, and newlines.
      lower: boolean. Whether to convert the input to lowercase.
      split: str. Separator for word splitting.

  Returns:
      List of integers in `[1, n]`. Each integer encodes a word
      (uniqueness is not guaranteed: distinct words may share an integer).
  """
  # Delegates to keras_preprocessing; arguments are forwarded unchanged.
  return text.one_hot(input_text, n, filters=filters, lower=lower, split=split)


# text.tokenizer_from_json is only available if keras_preprocessing >= 1.1.0
try:
  tokenizer_from_json = text.tokenizer_from_json
except AttributeError:
  # Older keras_preprocessing: the symbol does not exist, so it is simply not
  # defined or exported here.
  pass
else:
  # Registration lives outside the `try` so that an AttributeError raised while
  # exporting is not mistaken for "keras_preprocessing is too old".
  keras_export('keras.preprocessing.text.tokenizer_from_json')(
      tokenizer_from_json)

keras_export('keras.preprocessing.text.hashing_trick')(hashing_trick)
keras_export('keras.preprocessing.text.Tokenizer')(Tokenizer)