text.py revision 24101b35f3baebbfff3d8057ac223b325bc415ce
1f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# 3f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# Licensed under the Apache License, Version 2.0 (the "License"); 4f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# you may not use this file except in compliance with the License. 5f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# You may obtain a copy of the License at 6f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# 7f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# http://www.apache.org/licenses/LICENSE-2.0 8f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# 9f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# Unless required by applicable law or agreed to in writing, software 10f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# distributed under the License is distributed on an "AS IS" BASIS, 11f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# See the License for the specific language governing permissions and 13f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# limitations under the License. 14f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# ============================================================================== 15f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet"""Utilities for text input preprocessing. 16f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 17f49f801276154d0f693c5d57db6977a7eb32f017Francois CholletMay benefit from a fast Cython rewrite. 18f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet""" 19f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletfrom __future__ import absolute_import 20f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletfrom __future__ import division 21f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletfrom __future__ import print_function 22f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 23d21bf7d7502f447e5f967a479282b32b5845ba8bFrancois Cholletfrom collections import OrderedDict 2424101b35f3baebbfff3d8057ac223b325bc415ceFrancois Cholletfrom hashlib import md5 25f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletimport string 26f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletimport sys 27f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 28f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletimport numpy as np 29f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletfrom six.moves import range # pylint: disable=redefined-builtin 30f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletfrom six.moves import zip # pylint: disable=redefined-builtin 31f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 32f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletif sys.version_info < (3,): 33f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet maketrans = string.maketrans 34f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletelse: 35f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet maketrans = str.maketrans 36f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 37f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 38f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletdef text_to_word_sequence(text, 39f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', 40f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet lower=True, 41f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet split=' '): 42d21bf7d7502f447e5f967a479282b32b5845ba8bFrancois Chollet """Converts a text to a sequence of words (or tokens). 43f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 44f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Arguments: 45f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet text: Input text (string). 46f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet filters: Sequence of characters to filter out. 47f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet lower: Whether to convert the input to lowercase. 48f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet split: Sentence split marker (string). 49f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 50f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Returns: 51d21bf7d7502f447e5f967a479282b32b5845ba8bFrancois Chollet A list of words (or tokens). 52f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet """ 53f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet if lower: 54f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet text = text.lower() 55f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet text = text.translate(maketrans(filters, split * len(filters))) 56f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet seq = text.split(split) 57f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet return [i for i in seq if i] 58f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 59f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 60f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletdef one_hot(text, 61f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet n, 62f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', 63f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet lower=True, 64f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet split=' '): 6524101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet return hashing_trick( 6624101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet text, n, hash_function=hash, filters=filters, lower=lower, split=split) 6724101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet 6824101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet 6924101b35f3baebbfff3d8057ac223b325bc415ceFrancois Cholletdef hashing_trick(text, 7024101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet n, 7124101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet hash_function=None, 7224101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', 7324101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet lower=True, 7424101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet split=' '): 7524101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet """Converts a text to a sequence of indexes in a fixed-size hashing space. 7624101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet 7724101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet Arguments: 7824101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet text: Input text (string). 7924101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet n: Dimension of the hashing space. 8024101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet hash_function: if `None` uses python `hash` function, can be 'md5' or 8124101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet any function that takes in input a string and returns a int. 8224101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet Note that `hash` is not a stable hashing function, so 8324101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet it is not consistent across different runs, while 'md5' 8424101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet is a stable hashing function. 8524101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet filters: Sequence of characters to filter out. 8624101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet lower: Whether to convert the input to lowercase. 8724101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet split: Sentence split marker (string). 8824101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet 8924101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet Returns: 9024101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet A list of integer word indices (unicity non-guaranteed). 9124101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet 9224101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet `0` is a reserved index that won't be assigned to any word. 9324101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet 9424101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet Two or more words may be assigned to the same index, due to possible 9524101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet collisions by the hashing function. 9624101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet """ 9724101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet if hash_function is None: 9824101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet hash_function = hash 9924101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet elif hash_function == 'md5': 10024101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet hash_function = lambda w: int(md5(w.encode()).hexdigest(), 16) 10124101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet 102f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet seq = text_to_word_sequence(text, filters=filters, lower=lower, split=split) 10324101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet return [(hash_function(w) % (n - 1) + 1) for w in seq] 104f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 105f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 106f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletclass Tokenizer(object): 107f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet """Text tokenization utility class. 108f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 109f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet This class allows to vectorize a text corpus, by turning each 110f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet text into either a sequence of integers (each integer being the index 111f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet of a token in a dictionary) or into a vector where the coefficient 112f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet for each token could be binary, based on word count, based on tf-idf... 113f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 114f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Arguments: 115f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet num_words: the maximum number of words to keep, based 116f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet on word frequency. Only the most common `num_words` words will 117f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet be kept. 118f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet filters: a string where each element is a character that will be 119f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet filtered from the texts. The default is all punctuation, plus 120f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet tabs and line breaks, minus the `'` character. 121f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet lower: boolean. Whether to convert the texts to lowercase. 122f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet split: character or string to use for token splitting. 123d21bf7d7502f447e5f967a479282b32b5845ba8bFrancois Chollet char_level: if True, every character will be treated as a token. 124f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 125f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet By default, all punctuation is removed, turning the texts into 126f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet space-separated sequences of words 127f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet (words maybe include the `'` character). These sequences are then 128f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet split into lists of tokens. They will then be indexed or vectorized. 129f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 130f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet `0` is a reserved index that won't be assigned to any word. 131f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet """ 132f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 133f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet def __init__(self, 134f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet num_words=None, 135f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', 136f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet lower=True, 137f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet split=' ', 138d21bf7d7502f447e5f967a479282b32b5845ba8bFrancois Chollet char_level=False): 139d21bf7d7502f447e5f967a479282b32b5845ba8bFrancois Chollet self.word_counts = OrderedDict() 140f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet self.word_docs = {} 141f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet self.filters = filters 142f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet self.split = split 143f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet self.lower = lower 144f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet self.num_words = num_words 145f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet self.document_count = 0 146f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet self.char_level = char_level 147f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 148f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet def fit_on_texts(self, texts): 149f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet """Updates internal vocabulary based on a list of texts. 150f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 151f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Required before using `texts_to_sequences` or `texts_to_matrix`. 152f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 153f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Arguments: 154f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet texts: can be a list of strings, 155f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet or a generator of strings (for memory-efficiency) 156f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet """ 157f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet self.document_count = 0 158f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet for text in texts: 159f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet self.document_count += 1 160f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet seq = text if self.char_level else text_to_word_sequence( 161f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet text, self.filters, self.lower, self.split) 162f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet for w in seq: 163f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet if w in self.word_counts: 164f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet self.word_counts[w] += 1 165f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet else: 166f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet self.word_counts[w] = 1 167f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet for w in set(seq): 168f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet if w in self.word_docs: 169f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet self.word_docs[w] += 1 170f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet else: 171f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet self.word_docs[w] = 1 172f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 173f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet wcounts = list(self.word_counts.items()) 174f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet wcounts.sort(key=lambda x: x[1], reverse=True) 175f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet sorted_voc = [wc[0] for wc in wcounts] 176f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet # note that index 0 is reserved, never assigned to an existing word 177f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet self.word_index = dict( 178f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet list(zip(sorted_voc, list(range(1, len(sorted_voc) + 1))))) 179f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 180f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet self.index_docs = {} 181f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet for w, c in list(self.word_docs.items()): 182f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet self.index_docs[self.word_index[w]] = c 183f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 184f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet def fit_on_sequences(self, sequences): 185f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet """Updates internal vocabulary based on a list of sequences. 186f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 187f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Required before using `sequences_to_matrix` 188f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet (if `fit_on_texts` was never called). 189f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 190f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Arguments: 191f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet sequences: A list of sequence. 192f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet A "sequence" is a list of integer word indices. 193f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet """ 194f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet self.document_count = len(sequences) 195f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet self.index_docs = {} 196f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet for seq in sequences: 197f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet seq = set(seq) 198f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet for i in seq: 199f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet if i not in self.index_docs: 200f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet self.index_docs[i] = 1 201f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet else: 202f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet self.index_docs[i] += 1 203f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 204f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet def texts_to_sequences(self, texts): 205f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet """Transforms each text in texts in a sequence of integers. 206f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 207f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Only top "num_words" most frequent words will be taken into account. 208f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Only words known by the tokenizer will be taken into account. 209f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 210f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Arguments: 211f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet texts: A list of texts (strings). 212f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 213f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Returns: 214f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet A list of sequences. 215f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet """ 216f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet res = [] 217f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet for vect in self.texts_to_sequences_generator(texts): 218f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet res.append(vect) 219f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet return res 220f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 221f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet def texts_to_sequences_generator(self, texts): 222f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet """Transforms each text in texts in a sequence of integers. 223f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 224f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Only top "num_words" most frequent words will be taken into account. 225f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Only words known by the tokenizer will be taken into account. 226f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 227f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Arguments: 228f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet texts: A list of texts (strings). 229f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 230f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Yields: 231f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Yields individual sequences. 232f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet """ 233f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet num_words = self.num_words 234f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet for text in texts: 235f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet seq = text if self.char_level else text_to_word_sequence( 236f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet text, self.filters, self.lower, self.split) 237f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet vect = [] 238f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet for w in seq: 239f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet i = self.word_index.get(w) 240f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet if i is not None: 241f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet if num_words and i >= num_words: 242f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet continue 243f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet else: 244f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet vect.append(i) 245f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet yield vect 246f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 247f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet def texts_to_matrix(self, texts, mode='binary'): 248f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet """Convert a list of texts to a Numpy matrix. 249f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 250f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Arguments: 251f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet texts: list of strings. 252f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet mode: one of "binary", "count", "tfidf", "freq". 253f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 254f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Returns: 255f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet A Numpy matrix. 256f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet """ 257f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet sequences = self.texts_to_sequences(texts) 258f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet return self.sequences_to_matrix(sequences, mode=mode) 259f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 260f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet def sequences_to_matrix(self, sequences, mode='binary'): 261f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet """Converts a list of sequences into a Numpy matrix. 262f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 263f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Arguments: 264f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet sequences: list of sequences 265f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet (a sequence is a list of integer word indices). 266f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet mode: one of "binary", "count", "tfidf", "freq" 267f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 268f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Returns: 269f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet A Numpy matrix. 270f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 271f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet Raises: 272f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet ValueError: In case of invalid `mode` argument, 273f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet or if the Tokenizer requires to be fit to sample data. 274f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet """ 275f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet if not self.num_words: 276f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet if self.word_index: 277f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet num_words = len(self.word_index) + 1 278f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet else: 279f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet raise ValueError('Specify a dimension (num_words argument), ' 280f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 'or fit on some text data first.') 281f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet else: 282f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet num_words = self.num_words 283f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 284f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet if mode == 'tfidf' and not self.document_count: 285f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet raise ValueError('Fit the Tokenizer on some data ' 286f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 'before using tfidf mode.') 287f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet 288f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet x = np.zeros((len(sequences), num_words)) 289f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet for i, seq in enumerate(sequences): 290f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet if not seq: 291f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet continue 292f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet counts = {} 293f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet for j in seq: 294f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet if j >= num_words: 295f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet continue 296f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet if j not in counts: 297f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet counts[j] = 1. 298f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet else: 299f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet counts[j] += 1 300f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet for j, c in list(counts.items()): 301f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet if mode == 'count': 302f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet x[i][j] = c 303f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet elif mode == 'freq': 304f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet x[i][j] = c / len(seq) 305f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet elif mode == 'binary': 306f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet x[i][j] = 1 307f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet elif mode == 'tfidf': 308f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet # Use weighting scheme 2 in 309f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet # https://en.wikipedia.org/wiki/Tf%E2%80%93idf 310f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet tf = 1 + np.log(c) 311b8b8ebcf851df71ebb5209ae27d75e2befc50f0dFrancois Chollet idf = np.log(1 + self.document_count / 312b8b8ebcf851df71ebb5209ae27d75e2befc50f0dFrancois Chollet (1 + self.index_docs.get(j, 0))) 313f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet x[i][j] = tf * idf 314f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet else: 315f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet raise ValueError('Unknown vectorization mode:', mode) 316f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet return x 317