15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright 2008 The RE2 Authors. All Rights Reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// license that can be found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Unicode case folding tables. 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The Unicode case folding tables encode the mapping from one Unicode point 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to the next largest Unicode point with equivalent folding. The largest 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// point wraps back to the first. For example, the tables map: 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 'A' -> 'a' 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 'a' -> 'A' 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 'K' -> 'k' 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 'k' -> 'K' (Kelvin symbol) 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 'K' -> 'K' 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Like everything Unicode, these tables are big. If we represent the table 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// as a sorted list of uint32 pairs, it has 2049 entries and is 16 kB. 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Most table entries look like the ones around them: 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc. 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Instead of listing all the pairs explicitly, we make a list of ranges 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and deltas, so that the table entries for 'A' through 'Z' can be represented 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// as a single entry { 'A', 'Z', +32 }. 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// In addition to blocks that map to each other (A-Z mapping to a-z) 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// there are blocks of pairs that individually map to each other 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...). 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For those, the special delta value EvenOdd marks even/odd pairs 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs. 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// In this form, the table has 274 entries, about 3kB. If we were to split 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the table into one for 16-bit codes and an overflow table for larger ones, 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// we could get it down to about 1.5kB, but that's not worth the complexity. 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The grouped form also allows for efficient fold range calculations 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// rather than looping one character at a time. 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef RE2_UNICODE_CASEFOLD_H__ 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define RE2_UNICODE_CASEFOLD_H__ 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "util/util.h" 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace re2 { 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)enum { 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EvenOdd = 1, 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OddEven = -1, 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EvenOddSkip = 1<<30, 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OddEvenSkip, 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct CaseFold { 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 lo; 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 hi; 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int32 delta; 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern CaseFold unicode_casefold[]; 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern int num_unicode_casefold; 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern CaseFold unicode_tolower[]; 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern int num_unicode_tolower; 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns the CaseFold* in the tables that contains rune. 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If rune is not in the tables, returns the first CaseFold* after rune. 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If rune is larger than any value in the tables, returns NULL. 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern CaseFold* LookupCaseFold(CaseFold*, int, Rune rune); 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns the result of applying the fold f to the rune r. 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern Rune ApplyFold(CaseFold *f, Rune r); 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace re2 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif // RE2_UNICODE_CASEFOLD_H__ 76