12ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Copyright 2008 The RE2 Authors.  All Rights Reserved.
22ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Use of this source code is governed by a BSD-style
32ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// license that can be found in the LICENSE file.
42ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
52ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Unicode case folding tables.
62ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
72ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The Unicode case folding tables encode the mapping from one Unicode point
82ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// to the next largest Unicode point with equivalent folding.  The largest
92ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// point wraps back to the first.  For example, the tables map:
102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//     'A' -> 'a'
122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//     'a' -> 'A'
132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//     'K' -> 'k'
152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//     'k' -> 'K'  (Kelvin symbol)
162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//     'K' -> 'K'
172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Like everything Unicode, these tables are big.  If we represent the table
192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// as a sorted list of uint32 pairs, it has 2049 entries and is 16 kB.
202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Most table entries look like the ones around them:
212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Instead of listing all the pairs explicitly, we make a list of ranges
232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// and deltas, so that the table entries for 'A' through 'Z' can be represented
242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// as a single entry { 'A', 'Z', +32 }.
252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// In addition to blocks that map to each other (A-Z mapping to a-z)
272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// there are blocks of pairs that individually map to each other
282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// For those, the special delta value EvenOdd marks even/odd pairs
302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// In this form, the table has 274 entries, about 3kB.  If we were to split
332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// the table into one for 16-bit codes and an overflow table for larger ones,
342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// we could get it down to about 1.5kB, but that's not worth the complexity.
352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The grouped form also allows for efficient fold range calculations
372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// rather than looping one character at a time.
382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#ifndef RE2_UNICODE_CASEFOLD_H__
402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#define RE2_UNICODE_CASEFOLD_H__
412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/util.h"
432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonnamespace re2 {
452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonenum {
472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  EvenOdd = 1,
482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  OddEven = -1,
492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  EvenOddSkip = 1<<30,
502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  OddEvenSkip,
512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson};
522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstruct CaseFold {
542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  uint32 lo;
552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  uint32 hi;
562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int32 delta;
572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson};
582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonextern CaseFold unicode_casefold[];
602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonextern int num_unicode_casefold;
612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonextern CaseFold unicode_tolower[];
632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonextern int num_unicode_tolower;
642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns the CaseFold* in the tables that contains rune.
662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If rune is not in the tables, returns the first CaseFold* after rune.
672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If rune is larger than any value in the tables, returns NULL.
682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonextern CaseFold* LookupCaseFold(CaseFold*, int, Rune rune);
692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns the result of applying the fold f to the rune r.
712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonextern Rune ApplyFold(CaseFold *f, Rune r);
722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}  // namespace re2
742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#endif  // RE2_UNICODE_CASEFOLD_H__
76