15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright 2008 The RE2 Authors.  All Rights Reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// license that can be found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Unicode case folding tables.
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The Unicode case folding tables encode the mapping from one Unicode point
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to the next largest Unicode point with equivalent folding.  The largest
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// point wraps back to the first.  For example, the tables map:
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     'A' -> 'a'
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     'a' -> 'A'
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     'K' -> 'k'
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     'k' -> 'K'  (Kelvin symbol)
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     'K' -> 'K'
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Like everything Unicode, these tables are big.  If we represent the table
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// as a sorted list of uint32 pairs, it has 2049 entries and is 16 kB.
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Most table entries look like the ones around them:
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Instead of listing all the pairs explicitly, we make a list of ranges
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and deltas, so that the table entries for 'A' through 'Z' can be represented
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// as a single entry { 'A', 'Z', +32 }.
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// In addition to blocks that map to each other (A-Z mapping to a-z)
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// there are blocks of pairs that individually map to each other
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For those, the special delta value EvenOdd marks even/odd pairs
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// In this form, the table has 274 entries, about 3kB.  If we were to split
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the table into one for 16-bit codes and an overflow table for larger ones,
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// we could get it down to about 1.5kB, but that's not worth the complexity.
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The grouped form also allows for efficient fold range calculations
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// rather than looping one character at a time.
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef RE2_UNICODE_CASEFOLD_H__
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define RE2_UNICODE_CASEFOLD_H__
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "util/util.h"
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace re2 {
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)enum {
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EvenOdd = 1,
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  OddEven = -1,
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EvenOddSkip = 1<<30,
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  OddEvenSkip,
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct CaseFold {
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint32 lo;
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint32 hi;
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int32 delta;
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern CaseFold unicode_casefold[];
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern int num_unicode_casefold;
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern CaseFold unicode_tolower[];
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern int num_unicode_tolower;
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns the CaseFold* in the tables that contains rune.
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If rune is not in the tables, returns the first CaseFold* after rune.
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If rune is larger than any value in the tables, returns NULL.
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern CaseFold* LookupCaseFold(CaseFold*, int, Rune rune);
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns the result of applying the fold f to the rune r.
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern Rune ApplyFold(CaseFold *f, Rune r);
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace re2
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif  // RE2_UNICODE_CASEFOLD_H__
76