make_unicode_groups.py revision 5821806d5e7f356e8fa4b058a389a808ea183019
1#!/usr/bin/python
2# Copyright 2008 The RE2 Authors.  All Rights Reserved.
3# Use of this source code is governed by a BSD-style
4# license that can be found in the LICENSE file.
5
6"""Generate C++ tables for Unicode Script and Category groups."""
7
8import sys
9import unicode
10
# Text printed first by main(): the "generated, do not edit" banner, the
# include of the groups header, and the opening of namespace re2.
_header = """
// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
// make_unicode_groups.py >unicode_groups.cc

#include "re2/unicode_groups.h"

namespace re2 {

"""

# Text printed last by main(): closes namespace re2.
_trailer = """

}  // namespace re2

"""

# Running totals of 16-bit and 32-bit ranges over all groups.  PrintGroup
# adds to them; main() reports them in a comment in the generated output.
n16 = 0
n32 = 0
29
def MakeRanges(codes):
  """Turn a list like [1,2,3,7,8,9] into a range list [[1,3], [7,9]].

  Args:
    codes: iterable of integers.  A value extends the current range only
      when it is exactly one greater than the value before it; any gap,
      repeat or decrease starts a new range.

  Returns:
    A list of two-element [lo, hi] lists (inclusive bounds), in input
    order.  Empty input yields an empty list.
  """
  ranges = []
  for c in codes:
    # Look at the open range itself instead of tracking a "previous value"
    # sentinel: a sentinel can collide with real input and make the very
    # first element index into the still-empty list.
    if ranges and c == ranges[-1][1] + 1:
      ranges[-1][1] = c
    else:
      ranges.append([c, c])
  return ranges
41
def PrintRanges(type, name, ranges):
  """Print the ranges as a static C++ array of type named name.

  Args:
    type: C++ element type name to emit (callers in this file pass
      "URange16" or "URange32").  The parameter name shadows the builtin
      but is kept so existing callers are unaffected.
    name: identifier of the emitted array.
    ranges: iterable of (lo, hi) integer pairs, one initializer row each.
  """
  # Each call passes exactly one pre-formatted string, so print(...) emits
  # the same text whether print is the Python 2 statement or the Python 3
  # function.
  print("static %s %s[] = {" % (type, name,))
  for lo, hi in ranges:
    print("\t{ %d, %d }," % (lo, hi))
  print("};")
48
49# def PrintCodes(type, name, codes):
50#   """Print the codes as an array of type named name."""
51#   print "static %s %s[] = {" % (type, name,)
52#   for c in codes:
53#     print "\t%d," % (c,)
54#   print "};"
55
def PrintGroup(name, codes):
  """Emit the range tables for one group and build its UGroup literal.

  Prints a URange16 array and/or a URange32 array for the group (an empty
  table is not printed), adds the table sizes to the module counters n16
  and n32, and returns the text of a UGroup initializer that refers to the
  printed arrays, using "0, 0" for a table that was omitted.
  """
  global n16
  global n32

  # See unicode_groups.h for a description of the data structure.

  # Code points below 65536 go in the 16-bit table, the rest in the 32-bit
  # table.  (A variant that pulled singleton ranges out of the 16-bit table
  # into a separate code16 array is disabled in this file.)
  narrow = MakeRanges([c for c in codes if c < 65536])
  wide = MakeRanges([c for c in codes if c >= 65536])

  n16 += len(narrow)
  n32 += len(wide)

  fields = ["{ \"%s\", +1" % (name,)]
  tables = (
      ("URange16", "_range16", narrow),
      ("URange32", "_range32", wide),
  )
  for ctype, suffix, table in tables:
    if table:
      PrintRanges(ctype, name + suffix, table)
      fields.append("%s%s, %d" % (name, suffix, len(table)))
    else:
      fields.append("0, 0")
  return ", ".join(fields) + " }"
93
def main():
  """Write the generated unicode_groups.cc text to standard output.

  Prints the header, then the range tables for every entry returned by
  unicode.Categories() and unicode.Scripts(), then a comment with the
  range totals, the sorted unicode_groups[] array of UGroup literals, its
  element count, and finally the trailer.
  """
  # Single-argument print(...) calls and dict.items() behave the same on
  # Python 2 and Python 3; the print statement and iteritems() exist only
  # on Python 2.
  print(_header)
  ugroups = []
  # NOTE(review): Categories()/Scripts() are presumably dicts mapping a
  # group name to its list of code points -- confirm against unicode.py.
  for name, codes in unicode.Categories().items():
    ugroups.append(PrintGroup(name, codes))
  for name, codes in unicode.Scripts().items():
    ugroups.append(PrintGroup(name, codes))
  print("// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32))
  print("UGroup unicode_groups[] = {")
  # Each literal starts with the quoted group name, so a plain string sort
  # orders the array by name.
  ugroups.sort()
  for ug in ugroups:
    print("\t%s," % (ug,))
  print("};")
  print("int num_unicode_groups = %d;" % (len(ugroups),))
  print(_trailer)
108  print _trailer
109
# Generate the tables only when run as a script, not when imported.
if __name__ == '__main__':
  main()
112