# -*- coding: utf-8 -*-
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for text data preprocessing utils."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

from tensorflow.python.keras.preprocessing import text as preprocessing_text
from tensorflow.python.platform import test


class TestText(test.TestCase):

  def test_one_hot(self):
    text = 'The cat sat on the mat.'
    encoded = preprocessing_text.one_hot(text, 5)
    self.assertEqual(len(encoded), 6)
    self.assertLessEqual(np.max(encoded), 4)
    self.assertGreaterEqual(np.min(encoded), 0)

    # Test on unicode.
    text = u'The cat sat on the mat.'
    encoded = preprocessing_text.one_hot(text, 5)
    self.assertEqual(len(encoded), 6)
    self.assertLessEqual(np.max(encoded), 4)
    self.assertGreaterEqual(np.min(encoded), 0)

  def test_tokenizer(self):
    texts = [
        'The cat sat on the mat.',
        'The dog sat on the log.',
        'Dogs and cats living together.'
    ]
    tokenizer = preprocessing_text.Tokenizer(num_words=10)
    tokenizer.fit_on_texts(texts)

    sequences = []
    for seq in tokenizer.texts_to_sequences_generator(texts):
      sequences.append(seq)
    self.assertLess(np.max(np.max(sequences)), 10)
    self.assertEqual(np.min(np.min(sequences)), 1)

    tokenizer.fit_on_sequences(sequences)

    for mode in ['binary', 'count', 'tfidf', 'freq']:
      matrix = tokenizer.texts_to_matrix(texts, mode)
      self.assertEqual(matrix.shape, (3, 10))

  def test_hashing_trick_hash(self):
    text = 'The cat sat on the mat.'
    encoded = preprocessing_text.hashing_trick(text, 5)
    self.assertEqual(len(encoded), 6)
    self.assertLessEqual(np.max(encoded), 4)
    self.assertGreaterEqual(np.min(encoded), 1)

  def test_hashing_trick_md5(self):
    text = 'The cat sat on the mat.'
    encoded = preprocessing_text.hashing_trick(
        text, 5, hash_function='md5')
    self.assertEqual(len(encoded), 6)
    self.assertLessEqual(np.max(encoded), 4)
    self.assertGreaterEqual(np.min(encoded), 1)

  def test_tokenizer_oov_flag(self):
    x_train = ['This text has only known words']
    x_test = ['This text has some unknown words']  # 2 OOVs: some, unknown

    # Default, without OOV flag
    tokenizer = preprocessing_text.Tokenizer()
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    self.assertEqual(len(x_test_seq[0]), 4)  # discards 2 OOVs

    # With OOV feature
    tokenizer = preprocessing_text.Tokenizer(oov_token='<unk>')
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    self.assertEqual(len(x_test_seq[0]), 6)  # OOVs marked in place

  def test_sequential_fit(self):
    texts = [
        'The cat sat on the mat.', 'The dog sat on the log.',
        'Dogs and cats living together.'
    ]
    word_sequences = [['The', 'cat', 'is', 'sitting'],
                      ['The', 'dog', 'is', 'standing']]
    tokenizer = preprocessing_text.Tokenizer()
    tokenizer.fit_on_texts(texts)
    tokenizer.fit_on_texts(word_sequences)

    self.assertEqual(tokenizer.document_count, 5)

    tokenizer.texts_to_matrix(texts)
    tokenizer.texts_to_matrix(word_sequences)

  def test_text_to_word_sequence(self):
    text = 'hello! ? world!'
    seq = preprocessing_text.text_to_word_sequence(text)
    self.assertEqual(seq, ['hello', 'world'])

  def test_text_to_word_sequence_multichar_split(self):
    text = 'hello!stop?world!'
    seq = preprocessing_text.text_to_word_sequence(text, split='stop')
    self.assertEqual(seq, ['hello', 'world'])

  def test_text_to_word_sequence_unicode(self):
    text = u'ali! veli? kırk dokuz elli'
    seq = preprocessing_text.text_to_word_sequence(text)
    self.assertEqual(seq, [u'ali', u'veli', u'kırk', u'dokuz', u'elli'])

  def test_text_to_word_sequence_unicode_multichar_split(self):
    text = u'ali!stopveli?stopkırkstopdokuzstopelli'
    seq = preprocessing_text.text_to_word_sequence(text, split='stop')
    self.assertEqual(seq, [u'ali', u'veli', u'kırk', u'dokuz', u'elli'])

  def test_tokenizer_unicode(self):
    texts = [
        u'ali veli kırk dokuz elli', u'ali veli kırk dokuz elli veli kırk dokuz'
    ]
    tokenizer = preprocessing_text.Tokenizer(num_words=5)
    tokenizer.fit_on_texts(texts)

    self.assertEqual(len(tokenizer.word_counts), 5)


if __name__ == '__main__':
  test.main()