break_iterator.h revision ddb351dbec246cf1fab5ec20d2d5520909041de1
1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef BASE_I18N_BREAK_ITERATOR_H_
6#define BASE_I18N_BREAK_ITERATOR_H_
7#pragma once
8
9#include "base/basictypes.h"
10#include "base/string16.h"
11
12// The BreakIterator class iterates through the words, word breaks, and
13// line breaks in a UTF-16 string.
14//
15// It provides several modes, BREAK_WORD, BREAK_LINE, and BREAK_NEWLINE,
16// which modify how characters are aggregated into the returned string.
17//
18// Under BREAK_WORD mode, once a word is encountered any non-word
19// characters are not included in the returned string (e.g. in the
20// UTF-16 equivalent of the string " foo bar! ", the word breaks are at
21// the periods in ". .foo. .bar.!. .").
22// Note that Chinese/Japanese/Thai do not use spaces between words so that
23// boundaries can fall in the middle of a continuous run of non-space /
24// non-punctuation characters.
25//
// Under BREAK_LINE mode, once a line breaking opportunity is encountered,
// any non-word characters are included in the returned string, breaking
// only when a space-equivalent character or a line breaking opportunity
// is encountered (e.g. in the UTF-16 equivalent of the string " foo bar! ",
// the breaks are at the periods in ". .foo .bar! .").
31//
// Note that lines can be broken at any character/syllable/grapheme cluster
// boundary in Chinese/Japanese/Korean and at word boundaries in Thai
// (Thai does not use spaces between words). Therefore, this is NOT the same
// as breaking only at space-equivalent characters, which is what its former
// name (BREAK_SPACE) implied.
37//
// Under BREAK_NEWLINE mode, all characters are included in the returned
// string, breaking only when a newline-equivalent character is encountered
// (e.g. in the UTF-16 equivalent of the string "foo\nbar!\n\n", the line
// breaks are at the periods in ".foo\n.bar!\n.\n.").
42//
43// To extract the words from a string, move a BREAK_WORD BreakIterator
44// through the string and test whether IsWord() is true.  E.g.,
45//   BreakIterator iter(&str, BreakIterator::BREAK_WORD);
46//   if (!iter.Init()) return false;
47//   while (iter.Advance()) {
48//     if (iter.IsWord()) {
49//       // region [iter.prev(),iter.pos()) contains a word.
50//       VLOG(1) << "word: " << iter.GetString();
51//     }
52//   }
53
54namespace base {
55
56class BreakIterator {
57 public:
58  enum BreakType {
59    BREAK_WORD,
60    BREAK_LINE,
61    // TODO(jshin): Remove this after reviewing call sites.
62    // If call sites really need break only on space-like characters
63    // implement it separately.
64    BREAK_SPACE = BREAK_LINE,
65    BREAK_NEWLINE,
66  };
67
68  // Requires |str| to live as long as the BreakIterator does.
69  BreakIterator(const string16* str, BreakType break_type);
70  ~BreakIterator();
71
72  // Init() must be called before any of the iterators are valid.
73  // Returns false if ICU failed to initialize.
74  bool Init();
75
76  // Return the current break position within the string,
77  // or BreakIterator::npos when done.
78  size_t pos() const { return pos_; }
79
80  // Return the value of pos() returned before Advance() was last called.
81  size_t prev() const { return prev_; }
82
83  // Advance to the next break.  Returns false if we've run past the end of
84  // the string.  (Note that the very last "break" is after the final
85  // character in the string, and when we advance to that position it's the
86  // last time Advance() returns true.)
87  bool Advance();
88
89  // Under BREAK_WORD mode, returns true if the break we just hit is the
90  // end of a word. (Otherwise, the break iterator just skipped over e.g.
91  // whitespace or punctuation.)  Under BREAK_LINE and BREAK_NEWLINE modes,
92  // this distinction doesn't apply and it always retuns false.
93  bool IsWord() const;
94
95  // Return the string between prev() and pos().
96  // Advance() must have been called successfully at least once
97  // for pos() to have advanced to somewhere useful.
98  string16 GetString() const;
99
100 private:
101  // ICU iterator, avoiding ICU ubrk.h dependence.
102  // This is actually an ICU UBreakiterator* type, which turns out to be
103  // a typedef for a void* in the ICU headers. Using void* directly prevents
104  // callers from needing access to the ICU public headers directory.
105  void* iter_;
106
107  // The string we're iterating over.
108  const string16* string_;
109
110  // The breaking style (word/space/newline).
111  BreakType break_type_;
112
113  // Previous and current iterator positions.
114  size_t prev_, pos_;
115
116  DISALLOW_COPY_AND_ASSIGN(BreakIterator);
117};
118
119}  // namespace base
120
#endif  // BASE_I18N_BREAK_ITERATOR_H_
122