1# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Tests for text data preprocessing utils."""
16
17from __future__ import absolute_import
18from __future__ import division
19from __future__ import print_function
20
21import numpy as np
22
23from tensorflow.python.keras._impl import keras
24from tensorflow.python.platform import test
25
26
27class TestText(test.TestCase):
28
29  def test_one_hot(self):
30    text = 'The cat sat on the mat.'
31    encoded = keras.preprocessing.text.one_hot(text, 5)
32    self.assertEqual(len(encoded), 6)
33    self.assertLessEqual(np.max(encoded), 4)
34    self.assertGreaterEqual(np.min(encoded), 0)
35
36    # Test on unicode.
37    text = u'The cat sat on the mat.'
38    encoded = keras.preprocessing.text.one_hot(text, 5)
39    self.assertEqual(len(encoded), 6)
40    self.assertLessEqual(np.max(encoded), 4)
41    self.assertGreaterEqual(np.min(encoded), 0)
42
43  def test_tokenizer(self):
44    texts = [
45        'The cat sat on the mat.',
46        'The dog sat on the log.',
47        'Dogs and cats living together.'
48    ]
49    tokenizer = keras.preprocessing.text.Tokenizer(num_words=10)
50    tokenizer.fit_on_texts(texts)
51
52    sequences = []
53    for seq in tokenizer.texts_to_sequences_generator(texts):
54      sequences.append(seq)
55    self.assertLess(np.max(np.max(sequences)), 10)
56    self.assertEqual(np.min(np.min(sequences)), 1)
57
58    tokenizer.fit_on_sequences(sequences)
59
60    for mode in ['binary', 'count', 'tfidf', 'freq']:
61      matrix = tokenizer.texts_to_matrix(texts, mode)
62      self.assertEqual(matrix.shape, (3, 10))
63
64  def test_hashing_trick_hash(self):
65    text = 'The cat sat on the mat.'
66    encoded = keras.preprocessing.text.hashing_trick(text, 5)
67    self.assertEqual(len(encoded), 6)
68    self.assertLessEqual(np.max(encoded), 4)
69    self.assertGreaterEqual(np.min(encoded), 1)
70
71  def test_hashing_trick_md5(self):
72    text = 'The cat sat on the mat.'
73    encoded = keras.preprocessing.text.hashing_trick(
74        text, 5, hash_function='md5')
75    self.assertEqual(len(encoded), 6)
76    self.assertLessEqual(np.max(encoded), 4)
77    self.assertGreaterEqual(np.min(encoded), 1)
78
79  def test_tokenizer_oov_flag(self):
80    x_train = ['This text has only known words']
81    x_test = ['This text has some unknown words']  # 2 OOVs: some, unknown
82
83    # Defalut, without OOV flag
84    tokenizer = keras.preprocessing.text.Tokenizer()
85    tokenizer.fit_on_texts(x_train)
86    x_test_seq = tokenizer.texts_to_sequences(x_test)
87    assert len(x_test_seq[0]) == 4  # discards 2 OOVs
88
89    # With OOV feature
90    tokenizer = keras.preprocessing.text.Tokenizer(oov_token='<unk>')
91    tokenizer.fit_on_texts(x_train)
92    x_test_seq = tokenizer.texts_to_sequences(x_test)
93    assert len(x_test_seq[0]) == 6  # OOVs marked in place
94
95
96if __name__ == '__main__':
97  test.main()
98