# -*- coding: utf-8 -*-
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for text data preprocessing utils."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

from tensorflow.python.keras.preprocessing import text as preprocessing_text
from tensorflow.python.platform import test


class TestText(test.TestCase):

  def test_one_hot(self):
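    # `one_hot` should return one integer index per word (6 here), each
    # falling within the requested range [0, n).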
    text = 'The cat sat on the mat.'
    encoded = preprocessing_text.one_hot(text, 5)
    self.assertEqual(len(encoded), 6)
    self.assertLessEqual(np.max(encoded), 4)
    self.assertGreaterEqual(np.min(encoded), 0)

    # Test on unicode.
    text = u'The cat sat on the mat.'
    encoded = preprocessing_text.one_hot(text, 5)
    self.assertEqual(len(encoded), 6)
    self.assertLessEqual(np.max(encoded), 4)
    self.assertGreaterEqual(np.min(encoded), 0)

  def test_tokenizer(self):
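    # Fit a capped vocabulary (num_words=10) on a small corpus, then check
    # that the generated sequences and document-term matrices respect the cap.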
    texts = [
        'The cat sat on the mat.',
        'The dog sat on the log.',
        'Dogs and cats living together.'
    ]
    tokenizer = preprocessing_text.Tokenizer(num_words=10)
    tokenizer.fit_on_texts(texts)

    sequences = []
    for seq in tokenizer.texts_to_sequences_generator(texts):
      sequences.append(seq)
    # The sequences are ragged (different lengths), so flatten them before
    # taking the overall min/max.
    flat_sequences = np.concatenate(sequences)
    self.assertLess(np.max(flat_sequences), 10)
    self.assertEqual(np.min(flat_sequences), 1)

    tokenizer.fit_on_sequences(sequences)

    for mode in ['binary', 'count', 'tfidf', 'freq']:
      matrix = tokenizer.texts_to_matrix(texts, mode)
      self.assertEqual(matrix.shape, (3, 10))

  def test_hashing_trick_hash(self):
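    # `hashing_trick` with the default hash should map each of the 6 words to
    # an index in [1, n); index 0 is expected to be left unused.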
    text = 'The cat sat on the mat.'
    encoded = preprocessing_text.hashing_trick(text, 5)
    self.assertEqual(len(encoded), 6)
    self.assertLessEqual(np.max(encoded), 4)
    self.assertGreaterEqual(np.min(encoded), 1)

  def test_hashing_trick_md5(self):
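    # Same as above, but hashing with md5 instead of the default hash function.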
    text = 'The cat sat on the mat.'
    encoded = preprocessing_text.hashing_trick(
        text, 5, hash_function='md5')
    self.assertEqual(len(encoded), 6)
    self.assertLessEqual(np.max(encoded), 4)
    self.assertGreaterEqual(np.min(encoded), 1)

  def test_tokenizer_oov_flag(self):
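    # Without an OOV token, unknown words are dropped from the output
    # sequences; with `oov_token` set, they are kept and mapped to that token.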
    x_train = ['This text has only known words']
    x_test = ['This text has some unknown words']  # 2 OOVs: some, unknown

    # Default, without OOV flag
    tokenizer = preprocessing_text.Tokenizer()
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    self.assertEqual(len(x_test_seq[0]), 4)  # discards 2 OOVs

    # With OOV feature
    tokenizer = preprocessing_text.Tokenizer(oov_token='<unk>')
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    self.assertEqual(len(x_test_seq[0]), 6)  # OOVs marked in place

  def test_sequential_fit(self):
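    # `fit_on_texts` can be called repeatedly, on plain strings as well as on
    # pre-tokenized word lists; document_count should accumulate across calls.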
    texts = [
        'The cat sat on the mat.', 'The dog sat on the log.',
        'Dogs and cats living together.'
    ]
    word_sequences = [['The', 'cat', 'is', 'sitting'],
                      ['The', 'dog', 'is', 'standing']]
    tokenizer = preprocessing_text.Tokenizer()
    tokenizer.fit_on_texts(texts)
    tokenizer.fit_on_texts(word_sequences)

    self.assertEqual(tokenizer.document_count, 5)

    tokenizer.texts_to_matrix(texts)
    tokenizer.texts_to_matrix(word_sequences)

  def test_text_to_word_sequence(self):
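    # Punctuation is stripped by default, so only the two words remain.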
    text = 'hello! ? world!'
    seq = preprocessing_text.text_to_word_sequence(text)
    self.assertEqual(seq, ['hello', 'world'])

  def test_text_to_word_sequence_multichar_split(self):
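    # A multi-character `split` string is used as the word delimiter.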
    text = 'hello!stop?world!'
    seq = preprocessing_text.text_to_word_sequence(text, split='stop')
    self.assertEqual(seq, ['hello', 'world'])

  def test_text_to_word_sequence_unicode(self):
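    # Same behaviour on non-ASCII (Turkish) input.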
    text = u'ali! veli? kırk dokuz elli'
    seq = preprocessing_text.text_to_word_sequence(text)
    self.assertEqual(seq, [u'ali', u'veli', u'kırk', u'dokuz', u'elli'])

  def test_text_to_word_sequence_unicode_multichar_split(self):
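    # Multi-character split combined with non-ASCII input.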
    text = u'ali!stopveli?stopkırkstopdokuzstopelli'
    seq = preprocessing_text.text_to_word_sequence(text, split='stop')
    self.assertEqual(seq, [u'ali', u'veli', u'kırk', u'dokuz', u'elli'])

  def test_tokenizer_unicode(self):
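    # All five distinct words in the corpus should show up in word_counts.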
    texts = [
        u'ali veli kırk dokuz elli', u'ali veli kırk dokuz elli veli kırk dokuz'
    ]
    tokenizer = preprocessing_text.Tokenizer(num_words=5)
    tokenizer.fit_on_texts(texts)

    self.assertEqual(len(tokenizer.word_counts), 5)


if __name__ == '__main__':
  test.main()