12ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Copyright 2008 The RE2 Authors. All Rights Reserved. 22ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Use of this source code is governed by a BSD-style 32ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// license that can be found in the LICENSE file. 42ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 52ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Unicode case folding tables. 62ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 72ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The Unicode case folding tables encode the mapping from one Unicode point 82ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// to the next largest Unicode point with equivalent folding. The largest 92ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// point wraps back to the first. For example, the tables map: 102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 'A' -> 'a' 122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 'a' -> 'A' 132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 'K' -> 'k' 152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 'k' -> 'K' (Kelvin symbol) 162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 'K' -> 'K' 172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Like everything Unicode, these tables are big. If we represent the table 192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// as a sorted list of uint32 pairs, it has 2049 entries and is 16 kB. 202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Most table entries look like the ones around them: 212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc. 222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Instead of listing all the pairs explicitly, we make a list of ranges 232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// and deltas, so that the table entries for 'A' through 'Z' can be represented 242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// as a single entry { 'A', 'Z', +32 }. 252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// In addition to blocks that map to each other (A-Z mapping to a-z) 272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// there are blocks of pairs that individually map to each other 282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...). 292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// For those, the special delta value EvenOdd marks even/odd pairs 302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs. 312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// In this form, the table has 274 entries, about 3kB. If we were to split 332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// the table into one for 16-bit codes and an overflow table for larger ones, 342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// we could get it down to about 1.5kB, but that's not worth the complexity. 352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The grouped form also allows for efficient fold range calculations 372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// rather than looping one character at a time. 382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#ifndef RE2_UNICODE_CASEFOLD_H__ 402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#define RE2_UNICODE_CASEFOLD_H__ 412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/util.h" 432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonnamespace re2 { 452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonenum { 472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson EvenOdd = 1, 482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson OddEven = -1, 492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson EvenOddSkip = 1<<30, 502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson OddEvenSkip, 512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}; 522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstruct CaseFold { 542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint32 lo; 552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint32 hi; 562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int32 delta; 572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}; 582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonextern CaseFold unicode_casefold[]; 602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonextern int num_unicode_casefold; 612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonextern CaseFold unicode_tolower[]; 632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonextern int num_unicode_tolower; 642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns the CaseFold* in the tables that contains rune. 662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If rune is not in the tables, returns the first CaseFold* after rune. 672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If rune is larger than any value in the tables, returns NULL. 682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonextern CaseFold* LookupCaseFold(CaseFold*, int, Rune rune); 692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns the result of applying the fold f to the rune r. 712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonextern Rune ApplyFold(CaseFold *f, Rune r); 722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} // namespace re2 742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#endif // RE2_UNICODE_CASEFOLD_H__ 76