unicode_groups.h revision 5821806d5e7f356e8fa4b058a389a808ea183019
1// Copyright 2008 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Unicode character groups.
6
7// The codes get split into ranges of 16-bit codes
8// and ranges of 32-bit codes.  It would be simpler
9// to use only 32-bit ranges, but these tables are large
10// enough to warrant extra care.
11//
12// Using just 32-bit ranges gives 27 kB of data.
13// Adding 16-bit ranges gives 18 kB of data.
14// Adding an extra table of 16-bit singletons would reduce
15// to 16.5 kB of data but make the data harder to use;
16// we don't bother.
17
18#ifndef RE2_UNICODE_GROUPS_H__
19#define RE2_UNICODE_GROUPS_H__
20
21#include "util/util.h"
22
23namespace re2 {
24
25struct URange16
26{
27  uint16 lo;
28  uint16 hi;
29};
30
31struct URange32
32{
33  uint32 lo;
34  uint32 hi;
35};
36
37struct UGroup
38{
39  const char *name;
40  int sign;  // +1 for [abc], -1 for [^abc]
41  URange16 *r16;
42  int nr16;
43  URange32 *r32;
44  int nr32;
45};
46
47// Named by property or script name (e.g., "Nd", "N", "Han").
48// Negated groups are not included.
49extern UGroup unicode_groups[];
50extern int num_unicode_groups;
51
52// Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]").
53// Negated groups are included.
54extern UGroup posix_groups[];
55extern int num_posix_groups;
56
57// Named by Perl name (e.g., "\\d", "\\D").
58// Negated groups are included.
59extern UGroup perl_groups[];
60extern int num_perl_groups;
61
62}  // namespace re2
63
64#endif  // RE2_UNICODE_GROUPS_H__
65