// Copyright 2008 The RE2 Authors.  All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Unicode case folding tables.

// The Unicode case folding tables encode the mapping from one Unicode point
// to the next largest Unicode point with equivalent folding.  The largest
// point wraps back to the first.  For example, the tables map:
//
//     'A' -> 'a'
//     'a' -> 'A'
//
//     'K' -> 'k'
//     'k' -> 'K'  (Kelvin symbol, U+212A)
//     'K' -> 'K'
//
// Like everything Unicode, these tables are big.  If we represent the table
// as a sorted list of uint32 pairs, it has 2049 entries and is 16 kB.
// Most table entries look like the ones around them:
// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
// Instead of listing all the pairs explicitly, we make a list of ranges
// and deltas, so that the table entries for 'A' through 'Z' can be represented
// as a single entry { 'A', 'Z', +32 }.
//
// In addition to blocks that map to each other (A-Z mapping to a-z)
// there are blocks of pairs that individually map to each other
// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
// For those, the special delta value EvenOdd marks even/odd pairs
// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
//
// In this form, the table has 274 entries, about 3kB.  If we were to split
// the table into one for 16-bit codes and an overflow table for larger ones,
// we could get it down to about 1.5kB, but that's not worth the complexity.
//
// The grouped form also allows for efficient fold range calculations
// rather than looping one character at a time.
38 39#ifndef RE2_UNICODE_CASEFOLD_H__ 40#define RE2_UNICODE_CASEFOLD_H__ 41 42#include "util/util.h" 43 44namespace re2 { 45 46enum { 47 EvenOdd = 1, 48 OddEven = -1, 49 EvenOddSkip = 1<<30, 50 OddEvenSkip, 51}; 52 53struct CaseFold { 54 uint32 lo; 55 uint32 hi; 56 int32 delta; 57}; 58 59extern CaseFold unicode_casefold[]; 60extern int num_unicode_casefold; 61 62extern CaseFold unicode_tolower[]; 63extern int num_unicode_tolower; 64 65// Returns the CaseFold* in the tables that contains rune. 66// If rune is not in the tables, returns the first CaseFold* after rune. 67// If rune is larger than any value in the tables, returns NULL. 68extern CaseFold* LookupCaseFold(CaseFold*, int, Rune rune); 69 70// Returns the result of applying the fold f to the rune r. 71extern Rune ApplyFold(CaseFold *f, Rune r); 72 73} // namespace re2 74 75#endif // RE2_UNICODE_CASEFOLD_H__ 76