text.py revision 24101b35f3baebbfff3d8057ac223b325bc415ce
1f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet#
3f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# Licensed under the Apache License, Version 2.0 (the "License");
4f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# you may not use this file except in compliance with the License.
5f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# You may obtain a copy of the License at
6f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet#
7f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet#     http://www.apache.org/licenses/LICENSE-2.0
8f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet#
9f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# Unless required by applicable law or agreed to in writing, software
10f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# distributed under the License is distributed on an "AS IS" BASIS,
11f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# See the License for the specific language governing permissions and
13f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# limitations under the License.
14f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet# ==============================================================================
15f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet"""Utilities for text input preprocessing.
16f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet
17f49f801276154d0f693c5d57db6977a7eb32f017Francois CholletMay benefit from a fast Cython rewrite.
18f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet"""
19f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletfrom __future__ import absolute_import
20f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletfrom __future__ import division
21f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletfrom __future__ import print_function
22f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet
23d21bf7d7502f447e5f967a479282b32b5845ba8bFrancois Cholletfrom collections import OrderedDict
2424101b35f3baebbfff3d8057ac223b325bc415ceFrancois Cholletfrom hashlib import md5
25f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletimport string
26f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletimport sys
27f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet
28f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletimport numpy as np
29f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletfrom six.moves import range  # pylint: disable=redefined-builtin
30f49f801276154d0f693c5d57db6977a7eb32f017Francois Cholletfrom six.moves import zip  # pylint: disable=redefined-builtin
31f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet
# Pick the translation-table builder for the running interpreter: Python 2
# exposes it as `string.maketrans`, Python 3 as `str.maketrans`. Only the
# selected branch of the conditional expression is evaluated, so the attribute
# that does not exist on the current interpreter is never looked up.
maketrans = string.maketrans if sys.version_info < (3,) else str.maketrans
36f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet
37f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet
def text_to_word_sequence(text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True,
                          split=' '):
  """Converts a text to a sequence of words (or tokens).

  Every character listed in `filters` is replaced by `split`, the result is
  cut on `split`, and empty pieces are dropped.

  Arguments:
      text: Input text (string).
      filters: Sequence of characters to filter out.
      lower: Whether to convert the input to lowercase.
      split: Sentence split marker (string). May be longer than one
          character.

  Returns:
      A list of words (or tokens).
  """
  if lower:
    text = text.lower()

  if sys.version_info >= (3,):
    # A dict-based table maps each filtered character to the full `split`
    # string, so `split` is not limited to a single character (a
    # two-argument `maketrans` requires both strings to have equal length).
    text = text.translate({ord(c): split for c in filters})
  elif isinstance(text, str) and len(split) == 1:
    # Python 2 byte strings only accept a 256-character table, which the
    # module-level `maketrans` builds for one-to-one replacements.
    text = text.translate(maketrans(filters, split * len(filters)))
  else:
    # Python 2 unicode input or multi-character `split`: replace one
    # filtered character at a time.
    for c in filters:
      text = text.replace(c, split)

  return [token for token in text.split(split) if token]
58f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet
59f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet
def one_hot(text,
            n,
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
            lower=True,
            split=' '):
  """Encodes a text as a list of word indexes in a vocabulary of size `n`.

  This is a wrapper around `hashing_trick` that uses Python's built-in
  `hash` as the hashing function.

  Arguments:
      text: Input text (string).
      n: Dimension of the hashing space.
      filters: Sequence of characters to filter out.
      lower: Whether to convert the input to lowercase.
      split: Sentence split marker (string).

  Returns:
      Whatever `hashing_trick` returns for these arguments: a list of
      integer word indices.
  """
  tokenizing_options = dict(filters=filters, lower=lower, split=split)
  return hashing_trick(text, n, hash_function=hash, **tokenizing_options)
6724101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet
6824101b35f3baebbfff3d8057ac223b325bc415ceFrancois Chollet
def hashing_trick(text,
                  n,
                  hash_function=None,
                  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                  lower=True,
                  split=' '):
  """Maps each word of a text to an index in a fixed-size hashing space.

  The text is tokenized with `text_to_word_sequence`; each token is hashed
  and folded into the range `[1, n - 1]`.

  Arguments:
      text: Input text (string).
      n: Dimension of the hashing space.
      hash_function: `None` selects Python's built-in `hash`; the string
          'md5' selects an MD5-based hash; any other value is used as a
          callable taking a string and returning an int. The built-in
          `hash` is not stable across runs, whereas 'md5' is.
      filters: Sequence of characters to filter out.
      lower: Whether to convert the input to lowercase.
      split: Sentence split marker (string).

  Returns:
      A list of integer word indices (unicity non-guaranteed).

  `0` is a reserved index that won't be assigned to any word.

  Two or more words may be assigned to the same index, due to possible
  collisions by the hashing function.
  """

  def _md5_as_int(word):
    # Interpret the hexadecimal MD5 digest of the encoded word as an integer.
    return int(md5(word.encode()).hexdigest(), 16)

  if hash_function is None:
    hasher = hash
  elif hash_function == 'md5':
    hasher = _md5_as_int
  else:
    hasher = hash_function

  words = text_to_word_sequence(text, filters=filters, lower=lower, split=split)
  indices = []
  for word in words:
    # The modulo yields 0 .. n - 2; the `+ 1` shifts that to 1 .. n - 1 so
    # that index 0 is never produced.
    indices.append(hasher(word) % (n - 1) + 1)
  return indices
104f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet
105f49f801276154d0f693c5d57db6977a7eb32f017Francois Chollet
class Tokenizer(object):
  """Text tokenization utility class.

  This class allows to vectorize a text corpus, by turning each
  text into either a sequence of integers (each integer being the index
  of a token in a dictionary) or into a vector where the coefficient
  for each token could be binary, based on word count, based on tf-idf...

  Arguments:
      num_words: the maximum number of words to keep, based
          on word frequency. Only the most common `num_words` words will
          be kept.
      filters: a string where each element is a character that will be
          filtered from the texts. The default is all punctuation, plus
          tabs and line breaks, minus the `'` character.
      lower: boolean. Whether to convert the texts to lowercase.
      split: character or string to use for token splitting.
      char_level: if True, every character will be treated as a token.
          In that case the texts are iterated character by character as
          they are; `filters`, `lower` and `split` are not applied.

  By default, all punctuation is removed, turning the texts into
  space-separated sequences of words
  (words may include the `'` character). These sequences are then
  split into lists of tokens. They will then be indexed or vectorized.

  `0` is a reserved index that won't be assigned to any word.
  """

  def __init__(self,
               num_words=None,
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               lower=True,
               split=' ',
               char_level=False):
    # token -> total number of occurrences, in first-seen order.
    self.word_counts = OrderedDict()
    # token -> number of fitted texts containing it.
    self.word_docs = {}
    # token -> integer index (1-based) and index -> document frequency.
    # Created empty here so that methods called before any fitting see
    # empty mappings instead of failing with AttributeError.
    self.word_index = {}
    self.index_docs = {}
    self.filters = filters
    self.split = split
    self.lower = lower
    self.num_words = num_words
    self.document_count = 0
    self.char_level = char_level

  def fit_on_texts(self, texts):
    """Updates internal vocabulary based on a list of texts.

    Required before using `texts_to_sequences` or `texts_to_matrix`.
    Counts accumulate across calls: word counts, document frequencies and
    `document_count` all include the texts from earlier calls.

    Arguments:
        texts: can be a list of strings,
            or a generator of strings (for memory-efficiency)
    """
    for text in texts:
      # Accumulate rather than reset: `word_counts` and `word_docs` are
      # never cleared, so the document total must grow with them.
      self.document_count += 1
      seq = text if self.char_level else text_to_word_sequence(
          text, self.filters, self.lower, self.split)
      for w in seq:
        self.word_counts[w] = self.word_counts.get(w, 0) + 1
      # `set` so each token counts once per text (document frequency).
      for w in set(seq):
        self.word_docs[w] = self.word_docs.get(w, 0) + 1

    # Most frequent first; the sort is stable, so ties keep first-seen order.
    wcounts = sorted(
        self.word_counts.items(), key=lambda item: item[1], reverse=True)
    # note that index 0 is reserved, never assigned to an existing word
    self.word_index = {}
    for index, (w, _) in enumerate(wcounts, 1):
      self.word_index[w] = index

    self.index_docs = {}
    for w, c in self.word_docs.items():
      self.index_docs[self.word_index[w]] = c

  def fit_on_sequences(self, sequences):
    """Updates internal vocabulary based on a list of sequences.

    Required before using `sequences_to_matrix`
    (if `fit_on_texts` was never called).

    Replaces `document_count` and `index_docs` with values computed from
    `sequences` alone.

    Arguments:
        sequences: A list of sequence.
            A "sequence" is a list of integer word indices.
    """
    self.document_count = len(sequences)
    self.index_docs = {}
    for seq in sequences:
      # Deduplicate so each index counts once per sequence.
      for i in set(seq):
        self.index_docs[i] = self.index_docs.get(i, 0) + 1

  def texts_to_sequences(self, texts):
    """Transforms each text in texts in a sequence of integers.

    Only top "num_words" most frequent words will be taken into account.
    Only words known by the tokenizer will be taken into account.

    Arguments:
        texts: A list of texts (strings).

    Returns:
        A list of sequences.
    """
    return list(self.texts_to_sequences_generator(texts))

  def texts_to_sequences_generator(self, texts):
    """Transforms each text in texts in a sequence of integers.

    Only top "num_words" most frequent words will be taken into account.
    Only words known by the tokenizer will be taken into account.

    Arguments:
        texts: A list of texts (strings).

    Yields:
        Yields individual sequences.
    """
    num_words = self.num_words
    for text in texts:
      seq = text if self.char_level else text_to_word_sequence(
          text, self.filters, self.lower, self.split)
      vect = []
      for w in seq:
        i = self.word_index.get(w)
        if i is None:
          # Unknown token: silently dropped.
          continue
        if num_words and i >= num_words:
          # Outside the `num_words` cut-off (indices are frequency-ranked).
          continue
        vect.append(i)
      yield vect

  def texts_to_matrix(self, texts, mode='binary'):
    """Convert a list of texts to a Numpy matrix.

    Arguments:
        texts: list of strings.
        mode: one of "binary", "count", "tfidf", "freq".

    Returns:
        A Numpy matrix.
    """
    sequences = self.texts_to_sequences(texts)
    return self.sequences_to_matrix(sequences, mode=mode)

  def sequences_to_matrix(self, sequences, mode='binary'):
    """Converts a list of sequences into a Numpy matrix.

    Arguments:
        sequences: list of sequences
            (a sequence is a list of integer word indices).
        mode: one of "binary", "count", "tfidf", "freq"

    Returns:
        A Numpy matrix with one row per sequence and `num_words` columns
        (or `len(word_index) + 1` columns when `num_words` is not set).

    Raises:
        ValueError: In case of invalid `mode` argument,
            or if the Tokenizer requires to be fit to sample data.
    """
    if self.num_words:
      num_words = self.num_words
    elif self.word_index:
      # +1 because index 0 is reserved and never assigned to a word.
      num_words = len(self.word_index) + 1
    else:
      raise ValueError('Specify a dimension (num_words argument), '
                       'or fit on some text data first.')

    if mode == 'tfidf' and not self.document_count:
      raise ValueError('Fit the Tokenizer on some data '
                       'before using tfidf mode.')

    # Validate up front so a bad mode is reported even when every sequence
    # is empty and the per-entry dispatch below never runs.
    if mode not in ('binary', 'count', 'tfidf', 'freq'):
      raise ValueError('Unknown vectorization mode:', mode)

    x = np.zeros((len(sequences), num_words))
    for i, seq in enumerate(sequences):
      if not seq:
        continue
      counts = {}
      for j in seq:
        if j >= num_words:
          continue
        # Float counts keep the 'freq' division a true division.
        counts[j] = counts.get(j, 0.) + 1
      for j, c in counts.items():
        if mode == 'count':
          x[i][j] = c
        elif mode == 'freq':
          # Denominator is the full sequence length, including indices
          # that were skipped for being >= num_words.
          x[i][j] = c / len(seq)
        elif mode == 'binary':
          x[i][j] = 1
        else:
          # mode == 'tfidf'. Use weighting scheme 2 in
          # https://en.wikipedia.org/wiki/Tf%E2%80%93idf
          tf = 1 + np.log(c)
          idf = np.log(1 + self.document_count /
                       (1 + self.index_docs.get(j, 0)))
          x[i][j] = tf * idf
    return x
317