make_unicode_casefold.py revision 5821806d5e7f356e8fa4b058a389a808ea183019
1#!/usr/bin/python
2# coding=utf-8
3#
4# Copyright 2008 The RE2 Authors.  All Rights Reserved.
5# Use of this source code is governed by a BSD-style
6# license that can be found in the LICENSE file.
7
8# See unicode_casefold.h for description of case folding tables.
9
10"""Generate C++ table for Unicode case folding."""
11
12import unicode, sys
13
14_header = """
15// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
16// make_unicode_casefold.py >unicode_casefold.cc
17
18#include "re2/unicode_casefold.h"
19
20namespace re2 {
21
22"""
23
24_trailer = """
25
26} // namespace re2
27
28"""
29
30def _Delta(a, b):
31  """Compute the delta for b - a.  Even/odd and odd/even
32     are handled specially, as described above."""
33  if a+1 == b:
34    if a%2 == 0:
35      return 'EvenOdd'
36    else:
37      return 'OddEven'
38  if a == b+1:
39    if a%2 == 0:
40      return 'OddEven'
41    else:
42      return 'EvenOdd'
43  return b - a
44
45def _AddDelta(a, delta):
46  """Return a + delta, handling EvenOdd and OddEven specially."""
47  if type(delta) == int:
48    return a+delta
49  if delta == 'EvenOdd':
50    if a%2 == 0:
51      return a+1
52    else:
53      return a-1
54  if delta == 'OddEven':
55    if a%2 == 1:
56      return a+1
57    else:
58      return a-1
59  print >>sys.stderr, "Bad Delta: ", delta
60  raise "Bad Delta"
61
62def _MakeRanges(pairs):
63  """Turn a list like [(65,97), (66, 98), ..., (90,122)]
64     into [(65, 90, +32)]."""
65  ranges = []
66  last = -100
67
68  def evenodd(last, a, b, r):
69    if a != last+1 or b != _AddDelta(a, r[2]):
70      return False
71    r[1] = a
72    return True
73
74  def evenoddpair(last, a, b, r):
75    if a != last+2:
76      return False
77    delta = r[2]
78    d = delta
79    if type(delta) is not str:
80      return False
81    if delta.endswith('Skip'):
82      d = delta[:-4]
83    else:
84      delta = d + 'Skip'
85    if b != _AddDelta(a, d):
86      return False
87    r[1] = a
88    r[2] = delta
89    return True
90
91  for a, b in pairs:
92    if ranges and evenodd(last, a, b, ranges[-1]):
93      pass
94    elif ranges and evenoddpair(last, a, b, ranges[-1]):
95      pass
96    else:
97      ranges.append([a, a, _Delta(a, b)])
98    last = a
99  return ranges
100
101# The maximum size of a case-folding group.
102# Case folding is implemented in parse.cc by a recursive process
103# with a recursion depth equal to the size of the largest
104# case-folding group, so it is important that this bound be small.
105# The current tables have no group bigger than 4.
106# If there are ever groups bigger than 10 or so, it will be
107# time to rework the code in parse.cc.
108MaxCasefoldGroup = 4
109
110def main():
111  lowergroups, casegroups = unicode.CaseGroups()
112  foldpairs = []
113  seen = {}
114  for c in casegroups:
115    if len(c) > MaxCasefoldGroup:
116      raise unicode.Error("casefold group too long: %s" % (c,))
117    for i in range(len(c)):
118      if c[i-1] in seen:
119        raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i]))
120      seen[c[i-1]] = True
121      foldpairs.append([c[i-1], c[i]])
122
123  lowerpairs = []
124  for lower, group in lowergroups.iteritems():
125    for g in group:
126      if g != lower:
127        lowerpairs.append([g, lower])
128
129  def printpairs(name, foldpairs):
130    foldpairs.sort()
131    foldranges = _MakeRanges(foldpairs)
132    print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges))
133    print "CaseFold unicode_%s[] = {" % (name,)
134    for lo, hi, delta in foldranges:
135      print "\t{ %d, %d, %s }," % (lo, hi, delta)
136    print "};"
137    print "int num_unicode_%s = %d;" % (name, len(foldranges),)
138    print ""
139
140  print _header
141  printpairs("casefold", foldpairs)
142  printpairs("tolower", lowerpairs)
143  print _trailer
144
145if __name__ == '__main__':
146  main()
147