1# Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15"""Tests for text data preprocessing utils.""" 16 17from __future__ import absolute_import 18from __future__ import division 19from __future__ import print_function 20 21import numpy as np 22 23from tensorflow.python.keras._impl import keras 24from tensorflow.python.platform import test 25 26 27class TestText(test.TestCase): 28 29 def test_one_hot(self): 30 text = 'The cat sat on the mat.' 31 encoded = keras.preprocessing.text.one_hot(text, 5) 32 self.assertEqual(len(encoded), 6) 33 self.assertLessEqual(np.max(encoded), 4) 34 self.assertGreaterEqual(np.min(encoded), 0) 35 36 # Test on unicode. 37 text = u'The cat sat on the mat.' 38 encoded = keras.preprocessing.text.one_hot(text, 5) 39 self.assertEqual(len(encoded), 6) 40 self.assertLessEqual(np.max(encoded), 4) 41 self.assertGreaterEqual(np.min(encoded), 0) 42 43 def test_tokenizer(self): 44 texts = [ 45 'The cat sat on the mat.', 46 'The dog sat on the log.', 47 'Dogs and cats living together.' 48 ] 49 tokenizer = keras.preprocessing.text.Tokenizer(num_words=10) 50 tokenizer.fit_on_texts(texts) 51 52 sequences = [] 53 for seq in tokenizer.texts_to_sequences_generator(texts): 54 sequences.append(seq) 55 self.assertLess(np.max(np.max(sequences)), 10) 56 self.assertEqual(np.min(np.min(sequences)), 1) 57 58 tokenizer.fit_on_sequences(sequences) 59 60 for mode in ['binary', 'count', 'tfidf', 'freq']: 61 matrix = tokenizer.texts_to_matrix(texts, mode) 62 self.assertEqual(matrix.shape, (3, 10)) 63 64 def test_hashing_trick_hash(self): 65 text = 'The cat sat on the mat.' 66 encoded = keras.preprocessing.text.hashing_trick(text, 5) 67 self.assertEqual(len(encoded), 6) 68 self.assertLessEqual(np.max(encoded), 4) 69 self.assertGreaterEqual(np.min(encoded), 1) 70 71 def test_hashing_trick_md5(self): 72 text = 'The cat sat on the mat.' 73 encoded = keras.preprocessing.text.hashing_trick( 74 text, 5, hash_function='md5') 75 self.assertEqual(len(encoded), 6) 76 self.assertLessEqual(np.max(encoded), 4) 77 self.assertGreaterEqual(np.min(encoded), 1) 78 79 def test_tokenizer_oov_flag(self): 80 x_train = ['This text has only known words'] 81 x_test = ['This text has some unknown words'] # 2 OOVs: some, unknown 82 83 # Defalut, without OOV flag 84 tokenizer = keras.preprocessing.text.Tokenizer() 85 tokenizer.fit_on_texts(x_train) 86 x_test_seq = tokenizer.texts_to_sequences(x_test) 87 assert len(x_test_seq[0]) == 4 # discards 2 OOVs 88 89 # With OOV feature 90 tokenizer = keras.preprocessing.text.Tokenizer(oov_token='<unk>') 91 tokenizer.fit_on_texts(x_train) 92 x_test_seq = tokenizer.texts_to_sequences(x_test) 93 assert len(x_test_seq[0]) == 6 # OOVs marked in place 94 95 96if __name__ == '__main__': 97 test.main() 98