1// Copyright 2008 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Unicode case folding tables.
6
7// The Unicode case folding tables encode the mapping from one Unicode point
8// to the next largest Unicode point with equivalent folding.  The largest
9// point wraps back to the first.  For example, the tables map:
10//
11//     'A' -> 'a'
12//     'a' -> 'A'
13//
14//     'K' -> 'k'
15//     'k' -> 'K'  (Kelvin symbol)
16//     'K' -> 'K'
17//
18// Like everything Unicode, these tables are big.  If we represent the table
19// as a sorted list of uint32 pairs, it has 2049 entries and is 16 kB.
20// Most table entries look like the ones around them:
21// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
22// Instead of listing all the pairs explicitly, we make a list of ranges
23// and deltas, so that the table entries for 'A' through 'Z' can be represented
24// as a single entry { 'A', 'Z', +32 }.
25//
26// In addition to blocks that map to each other (A-Z mapping to a-z)
27// there are blocks of pairs that individually map to each other
28// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
29// For those, the special delta value EvenOdd marks even/odd pairs
30// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
31//
32// In this form, the table has 274 entries, about 3kB.  If we were to split
33// the table into one for 16-bit codes and an overflow table for larger ones,
34// we could get it down to about 1.5kB, but that's not worth the complexity.
35//
36// The grouped form also allows for efficient fold range calculations
37// rather than looping one character at a time.
38
39#ifndef RE2_UNICODE_CASEFOLD_H__
40#define RE2_UNICODE_CASEFOLD_H__
41
42#include "util/util.h"
43
44namespace re2 {
45
46enum {
47  EvenOdd = 1,
48  OddEven = -1,
49  EvenOddSkip = 1<<30,
50  OddEvenSkip,
51};
52
53struct CaseFold {
54  uint32 lo;
55  uint32 hi;
56  int32 delta;
57};
58
59extern CaseFold unicode_casefold[];
60extern int num_unicode_casefold;
61
62extern CaseFold unicode_tolower[];
63extern int num_unicode_tolower;
64
65// Returns the CaseFold* in the tables that contains rune.
66// If rune is not in the tables, returns the first CaseFold* after rune.
67// If rune is larger than any value in the tables, returns NULL.
68extern CaseFold* LookupCaseFold(CaseFold*, int, Rune rune);
69
70// Returns the result of applying the fold f to the rune r.
71extern Rune ApplyFold(CaseFold *f, Rune r);
72
73}  // namespace re2
74
75#endif  // RE2_UNICODE_CASEFOLD_H__
76