#!/usr/bin/python
# Copyright 2008 The RE2 Authors.  All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

"""Generate C++ tables for Unicode Script and Category groups."""

import sys
import unicode

# Text printed before any generated table: the "do not edit" banner, the
# include of the header declaring the table types, and the namespace opener.
_header = """
// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
// make_unicode_groups.py >unicode_groups.cc

#include "re2/unicode_groups.h"

namespace re2 {

"""

# Text printed after the last generated table: closes the namespace.
_trailer = """

}  // namespace re2

"""

# Running totals of the 16-bit and 32-bit ranges emitted so far.
# PrintGroup adds to them; main reports them in a comment.
n16 = 0
n32 = 0

def MakeRanges(codes):
  """Turn a list like [1,2,3,7,8,9] into a range list [[1,3], [7,9]].

  Args:
    codes: iterable of integer code points.  It may be empty, unsorted,
      or contain duplicates.

  Returns:
    A list of [lo, hi] pairs (both ends inclusive), ordered by lo, in
    which every run of consecutive code points is merged into one pair.
  """
  ranges = []
  # Sort and de-duplicate first so that runs are detected even when the
  # input is out of order or repeats a code point.  Comparing against the
  # end of the last range (instead of a magic "previous value" sentinel)
  # also makes the first element safe whatever its value is.
  for c in sorted(set(codes)):
    if ranges and c == ranges[-1][1] + 1:
      ranges[-1][1] = c
    else:
      ranges.append([c, c])
  return ranges

def PrintRanges(type, name, ranges):
  """Print the ranges as an array of type named name.

  Writes a C++ static array definition to standard output: an opening
  line, one tab-indented "{ lo, hi }," initializer per range, and a
  closing "};" line.

  Args:
    type: C++ element type name to put in the declaration.
    name: identifier of the generated array.
    ranges: iterable of (lo, hi) integer pairs.
  """
  # A single parenthesized argument prints the same text under both the
  # Python 2 print statement and the Python 3 print function.
  print("static %s %s[] = {" % (type, name))
  for lo, hi in ranges:
    print("\t{ %d, %d }," % (lo, hi))
  print("};")

# def PrintCodes(type, name, codes):
#   """Print the codes as an array of type named name."""
#   print "static %s %s[] = {" % (type, name,)
#   for c in codes:
#     print "\t%d," % (c,)
#   print "};"

def PrintGroup(name, codes):
  """Print the data structures for the group of codes.

  Splits the code points at 65536, prints a URange16 table for the lower
  part and a URange32 table for the upper part (each only when it is
  non-empty), and adds the number of ranges printed to the module-level
  counters n16 and n32.

  Args:
    name: group name.  It is used both as the quoted name inside the
      returned literal and as the prefix of the generated array names
      (name + "_range16", name + "_range32").
    codes: sequence of integer code points in the group.  It is iterated
      twice, so it must not be a one-shot iterator.

  Returns:
    A UGroup literal for the group, as a string; an absent table is
    written as ", 0, 0".
  """
  global n16
  global n32

  # See unicode_groups.h for a description of the data structure.

  # Split codes into 16-bit ranges and 32-bit ranges.
  range16 = MakeRanges([c for c in codes if c < 65536])
  range32 = MakeRanges([c for c in codes if c >= 65536])

  # Pull singleton ranges out of range16.
  # code16 = [lo for lo, hi in range16 if lo == hi]
  # range16 = [[lo, hi] for lo, hi in range16 if lo != hi]

  n16 += len(range16)
  n32 += len(range32)

  ugroup = "{ \"%s\", +1" % (name,)
  # if len(code16) > 0:
  #   PrintCodes("uint16", name+"_code16", code16)
  #   ugroup += ", %s_code16, %d" % (name, len(code16))
  # else:
  #   ugroup += ", 0, 0"
  if range16:
    PrintRanges("URange16", name + "_range16", range16)
    ugroup += ", %s_range16, %d" % (name, len(range16))
  else:
    ugroup += ", 0, 0"
  if range32:
    PrintRanges("URange32", name + "_range32", range32)
    ugroup += ", %s_range32, %d" % (name, len(range32))
  else:
    ugroup += ", 0, 0"
  ugroup += " }"
  return ugroup

def main():
  """Write the generated C++ source to standard output.

  Prints the header, the range tables for every group returned by
  unicode.Categories() and unicode.Scripts(), a comment with the range
  totals, the sorted unicode_groups[] array with its element count, and
  finally the trailer.
  """
  print(_header)
  ugroups = []
  # Visit the groups in sorted name order so that the generated file is the
  # same on every run instead of following dict iteration order.  items()
  # is used because iteritems() does not exist on Python 3.
  for name, codes in sorted(unicode.Categories().items()):
    ugroups.append(PrintGroup(name, codes))
  for name, codes in sorted(unicode.Scripts().items()):
    ugroups.append(PrintGroup(name, codes))
  print("// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32))
  print("UGroup unicode_groups[] = {")
  # Each literal starts with its quoted group name, so sorting the strings
  # orders the array entries by name.
  ugroups.sort()
  for ug in ugroups:
    print("\t%s," % (ug,))
  print("};")
  print("int num_unicode_groups = %d;" % (len(ugroups),))
  print(_trailer)

# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
  main()