1#!/bin/perl -w
2#*******************************************************************
3# COPYRIGHT:
4# Copyright (c) 2002-2009, International Business Machines Corporation and
5# others. All Rights Reserved.
6#*******************************************************************
7
8# This script reads in UCD files PropertyAliases.txt and
9# PropertyValueAliases.txt and correlates them with ICU enums
10# defined in uchar.h and uscript.h.  It then outputs a header
11# file which contains all names and enums.  The header is included
12# by the genpname tool C++ source file, which produces the actual
13# binary data file.
14#
15# See usage note below.
16#
17# TODO: The Property[Value]Alias.txt files state that they can support
18# more than 2 names per property|value.  Currently (Unicode 3.2) there
19# are always 1 or 2 names.  If more names were supported, presumably
20# the format would be something like:
21#    nv        ; Numeric_Value
22#    nv        ; Value_Numerique
23# CURRENTLY, this script assumes that there are 1 or two names.  Any
24# duplicates it sees are flagged as an error.  If multiple aliases
25# appear in a future version of Unicode, modify this script to support
26# that.
27#
28# NOTE: As of ICU 2.6, this script has been modified to know about the
29# pseudo-property gcm/General_Category_Mask, which corresponds to the
30# uchar.h property UCHAR_GENERAL_CATEGORY_MASK.  This property
31# corresponds to General_Category but is a bitmask value.  It does not
32# exist in the UCD.  Therefore, I special case it in several places
33# (search for General_Category_Mask and gcm).
34#
35# NOTE: As of ICU 2.6, this script reads an auxiliary data file,
36# SyntheticPropertyAliases.txt, containing property aliases not
37# present in the UCD but present in ICU.  This file resides in the
38# same directory as this script.  Its contents are merged into those
39# of PropertyAliases.txt as if the two files were appended.
40#
41# NOTE: The following names are handled specially.  See script below
42# for details.
43#
44#   T/True
45#   F/False
46#   No_Block
47#
48# Author: Alan Liu
49# Created: October 14 2002
50# Since: ICU 2.4
51
52use FileHandle;
53use strict;
54use Dumpvalue;
55
56my $DEBUG = 1;
57my $DUMPER = new Dumpvalue;
58
59my $count = @ARGV;
60my $ICU_DIR = shift() || '';
61my $OUT_FILE = shift() || 'data.h';
62my $HEADER_DIR = "$ICU_DIR/source/common/unicode";
63my $UNIDATA_DIR = "$ICU_DIR/source/data/unidata";
64
65# Get the current year from the system
66my $YEAR = 1900+@{[localtime]}[5]; # Get the current year
67
68# Used to make "n/a" property [value] aliases (Unicode or Synthetic) unique
69my $propNA = 0;
70my $valueNA = 0;
71
72#----------------------------------------------------------------------
73# Top level property keys for binary, enumerated, string, and double props
74my @TOP     = qw( _bp _ep _sp _dp _mp );
75
76# This hash governs how top level properties are grouped into output arrays.
77#my %TOP_PROPS = ( "VALUED"   => [ '_bp', '_ep' ],
78#                  "NO_VALUE" => [ '_sp', '_dp' ] );m
79#my %TOP_PROPS = ( "BINARY"   => [ '_bp' ],
80#                  "ENUMERATED" => [ '_ep' ],
81#                  "STRING" => [ '_sp' ],
82#                  "DOUBLE" => [ '_dp' ] );
83my %TOP_PROPS = ( ""   => [ '_bp', '_ep', '_sp', '_dp', '_mp' ] );
84
85my %PROP_TYPE = (Binary => "_bp",
86                 String => "_sp",
87                 Double => "_dp",
88                 Enumerated => "_ep",
89                 Bitmask => "_mp");
90#----------------------------------------------------------------------
91
92# Properties that are unsupported in ICU
93my %UNSUPPORTED = (Composition_Exclusion => 1,
94                   Decomposition_Mapping => 1,
95                   Expands_On_NFC => 1,
96                   Expands_On_NFD => 1,
97                   Expands_On_NFKC => 1,
98                   Expands_On_NFKD => 1,
99                   FC_NFKC_Closure => 1,
100                   ID_Start_Exceptions => 1,
101                   Special_Case_Condition => 1,
102                   );
103
104# Short names of properties that weren't seen in uchar.h.  If the
105# properties weren't seen, don't complain about the property values
106# missing.
107my %MISSING_FROM_UCHAR;
108
109# Additional property aliases beyond short and long names,
110# like space in addition to WSpace and White_Space in Unicode 4.1.
111# Hashtable, maps long name to alias.
112# For example, maps White_Space->space.
113#
114# If multiple additional aliases are defined,
115# then they are separated in the value string with '|'.
116# For example, White_Space->space|outer_space
117my %additional_property_aliases;
118
119#----------------------------------------------------------------------
120
121# Emitted class names
122my ($STRING_CLASS, $ALIAS_CLASS, $PROPERTY_CLASS) = qw(AliasName Alias Property);
123
124if ($count < 1 || $count > 2 ||
125    !-d $HEADER_DIR ||
126    !-d $UNIDATA_DIR) {
127    my $me = $0;
128    $me =~ s|.+[/\\]||;
129    my $lm = ' ' x length($me);
130    print <<"END";
131
132$me: Reads ICU4C headers and Unicode data files and creates
133$lm  a C header file that is included by genpname.  The header
134$lm  file matches constants defined in the ICU4C headers with
135$lm  property|value aliases in the Unicode data files.
136
137Usage: $me <icu_dir> [<out_file>]
138
139<icu_dir>   ICU4C root directory, containing
140               source/common/unicode/uchar.h
141               source/common/unicode/uscript.h
142               source/data/unidata/Blocks.txt
143               source/data/unidata/PropertyAliases.txt
144               source/data/unidata/PropertyValueAliases.txt
145<out_file>  File name of header to be written;
146            default is 'data.h'.
147
148The Unicode versions of all input files must match.
149END
150    exit(1);
151}
152
153my ($h, $version) = readAndMerge($HEADER_DIR, $UNIDATA_DIR);
154
155if ($DEBUG) {
156    print "Merged hash:\n";
157    for my $key (sort keys %$h) {
158        my $hh = $h->{$key};
159        for my $subkey (sort keys %$hh) {
160            print "$key:$subkey:", $hh->{$subkey}, "\n";
161        }
162    }
163}
164
165my $out = new FileHandle($OUT_FILE, 'w');
166die "Error: Can't write to $OUT_FILE: $!" unless (defined $out);
167my $save = select($out);
168formatData($h, $version);
169select($save);
170$out->close();
171
172exit(0);
173
174#----------------------------------------------------------------------
175# From PropList.html: "The properties of the form Other_XXX
176# are used to generate properties in DerivedCoreProperties.txt.
177# They are not intended for general use, such as in APIs that
178# return property values.
179# Non_Break is not a valid property as of 3.2.
180sub isIgnoredProperty {
181    local $_ = shift;
182    /^Other_/i || /^Non_Break$/i;
183}
184
185# 'qc' is a pseudo-property matching any quick-check property
186# see PropertyValueAliases.txt file comments.  'binprop' is
187# a synthetic binary value alias "True"/"False", not present
188# in PropertyValueAliases.txt until Unicode 5.0.
189# Starting with Unicode 5.1, PropertyValueAliases.txt does have
190# explicit values for binary properties.
191sub isPseudoProperty {
192    $_[0] eq 'qc' ||
193        $_[0] eq 'binprop';
194}
195
196#----------------------------------------------------------------------
197# Emit the combined data from headers and the Unicode database as a
198# C source code header file.
199#
200# @param ref to hash with the data
201# @param Unicode version, as a string
202sub formatData {
203    my $h = shift;
204    my $version = shift;
205
206    my $date = scalar localtime();
207    print <<"END";
208/**
209 * Copyright (C) 2002-$YEAR, International Business Machines Corporation and
210 * others. All Rights Reserved.
211 *
212 * MACHINE GENERATED FILE.  !!! Do not edit manually !!!
213 *
214 * Generated from
215 *   uchar.h
216 *   uscript.h
217 *   Blocks.txt
218 *   PropertyAliases.txt
219 *   PropertyValueAliases.txt
220 *
221 * Date: $date
222 * Unicode version: $version
223 * Script: $0
224 */
225
226END
227
228    #------------------------------------------------------------
229    # Emit Unicode version
230    print "/* Unicode version $version */\n";
231    my @v = split(/\./, $version);
232    push @v, '0' while (@v < 4);
233    for (my $i=0; $i<@v; ++$i) {
234        print "const uint8_t VERSION_$i = $v[$i];\n";
235    }
236    print "\n";
237
238    #------------------------------------------------------------
239    # Emit String table
240    # [A table of all identifiers, that is, all long or short property
241    # or value names.  The list need NOT be sorted; it will be sorted
242    # by the C program.  Strings are referenced by their index into
243    # this table.  After sorting, a REMAP[] array is used to map the
244    # old position indices to the new positions.]
245    my %strings;
246    for my $prop (sort keys %$h) {
247        my $hh = $h->{$prop};
248        for my $enum (sort keys %$hh) {
249            my @a = split(/\|/, $hh->{$enum});
250            for (@a) {
251                $strings{$_} = 1 if (length($_));
252            }
253        }
254    }
255    my @strings = sort keys %strings;
256    unshift @strings, "";
257
258    print "const int32_t STRING_COUNT = ", scalar @strings, ";\n\n";
259
260    # while printing, create a mapping hash from string table entry to index
261    my %stringToID;
262    print "/* to be sorted */\n";
263    print "const $STRING_CLASS STRING_TABLE[] = {\n";
264    for (my $i=0; $i<@strings; ++$i) {
265        print "    $STRING_CLASS(\"$strings[$i]\", $i),\n";
266        $stringToID{$strings[$i]} = $i;
267    }
268    print "};\n\n";
269
270    # placeholder for the remapping index.  this is used to map
271    # indices that we compute here to indices of the sorted
272    # STRING_TABLE.  STRING_TABLE will be sorted by the C++ program
273    # using the uprv_comparePropertyNames() function.  this will
274    # reshuffle the order.  we then use the indices (passed to the
275    # String constructor) to create a REMAP[] array.
276    print "/* to be filled in */\n";
277    print "int32_t REMAP[", scalar @strings, "];\n\n";
278
279    #------------------------------------------------------------
280    # Emit the name group table
281    # [A table of name groups.  A name group is one or more names
282    # for a property or property value.  The Unicode data files specify
283    # that there may be more than 2, although as of Unicode 3.2 there
284    # are at most 2.  The name group table looks like this:
285    #
286    #  114, -115, 116, -117, 0, -118, 65, -64, ...
287    #  [0]        [2]        [4]      [6]
288    #
289    # The entry at [0] consists of 2 strings, 114 and 115.
290    # The entry at [2] consists of 116 and 117.  The entry at
291    # [4] is one string, 118.  There is always at least one
292    # string; typically there are two.  If there are two, the first
293    # is the SHORT name and the second is the LONG.  If there is
294    # one, then the missing entry (always the short name, in 3.2)
295    # is zero, which is by definition the index of "".  The
296    # 'preferred' name will generally be the LONG name, if there are
297    # more than 2 entries.  The last entry is negative.
298
299    # Build name group list and replace string refs with nameGroup indices
300    my @nameGroups;
301
302    # Check for duplicate name groups, and reuse them if possible
303    my %groupToInt; # Map group strings to ints
304    for my $prop (sort keys %$h) {
305        my $hh = $h->{$prop};
306        for my $enum (sort keys %$hh) {
307            my $groupString = $hh->{$enum};
308            my $i;
309            if (exists $groupToInt{$groupString}) {
310                $i = $groupToInt{$groupString};
311            } else {
312                my @names = split(/\|/, $groupString);
313                die "Error: Wrong number of names in " . $groupString if (@names < 1);
314                $i = @nameGroups; # index of group we are making
315                $groupToInt{$groupString} = $i; # Cache for reuse
316                push @nameGroups, map { $stringToID{$_} } @names;
317                $nameGroups[$#nameGroups] = -$nameGroups[$#nameGroups]; # mark end
318            }
319            # now, replace string list with ref to name group
320            $hh->{$enum} = $i;
321        }
322    }
323
324    print "const int32_t NAME_GROUP_COUNT = ",
325          scalar @nameGroups, ";\n\n";
326
327    print "int32_t NAME_GROUP[] = {\n";
328    # emit one group per line, with annotations
329    my $max_names = 0;
330    for (my $i=0; $i<@nameGroups; ) {
331        my @a;
332        my $line;
333        my $start = $i;
334        for (;;) {
335            my $j = $nameGroups[$i++];
336            $line .= "$j, ";
337            push @a, abs($j);
338            last if ($j < 0);
339        }
340        print "    ",
341              $line,
342              ' 'x(20-length($line)),
343              "/* ", sprintf("%3d", $start),
344              ": \"", join("\", \"", map { $strings[$_] } @a), "\" */\n";
345        $max_names = @a if(@a > $max_names);
346
347    }
348    print "};\n\n";
349
350    # This is fixed for 3.2 at "2" but should be calculated dynamically
351    # when more than 2 names appear in Property[Value]Aliases.txt.
352    print "#define MAX_NAMES_PER_GROUP $max_names\n\n";
353
354    #------------------------------------------------------------
355    # Emit enumerated property values
356    for my $prop (sort keys %$h) {
357        next if ($prop =~ /^_/);
358        my $vh = $h->{$prop};
359        my $count = scalar keys %$vh;
360
361        print "const int32_t VALUES_${prop}_COUNT = ",
362              $count, ";\n\n";
363
364        print "const $ALIAS_CLASS VALUES_${prop}\[] = {\n";
365        for my $enum (sort keys %$vh) {
366            #my @names = split(/\|/, $vh->{$enum});
367            #die "Error: Wrong number of names for $prop:$enum in [" . join(",", @names) . "]"
368            #    if (@names != 2);
369            print "    $ALIAS_CLASS((int32_t) $enum, ", $vh->{$enum}, "),\n";
370                  #$stringToID{$names[0]}, ", ",
371                  #$stringToID{$names[1]}, "),\n";
372            #      "\"", $names[0], "\", ",
373            #      "\"", $names[1], "\"),\n";
374        }
375        print "};\n\n";
376    }
377
378    #------------------------------------------------------------
379    # Emit top-level properties (binary, enumerated, etc.)
380    for my $topName (sort keys %TOP_PROPS) {
381        my $a = $TOP_PROPS{$topName};
382        my $count = 0;
383        for my $type (@$a) { # "_bp", "_ep", etc.
384            $count += scalar keys %{$h->{$type}};
385        }
386
387        print "const int32_t ${topName}PROPERTY_COUNT = $count;\n\n";
388
389        print "const $PROPERTY_CLASS ${topName}PROPERTY[] = {\n";
390
391        for my $type (@$a) { # "_bp", "_ep", etc.
392            my $p = $h->{$type};
393
394            for my $enum (sort keys %$p) {
395                my $name = $strings[$nameGroups[$p->{$enum}]];
396
397                my $valueRef = "0, NULL";
398                if ($type eq '_bp') {
399                    $valueRef = "VALUES_binprop_COUNT, VALUES_binprop";
400                }
401                elsif (exists $h->{$name}) {
402                    $valueRef = "VALUES_${name}_COUNT, VALUES_$name";
403                }
404
405                print "    $PROPERTY_CLASS((int32_t) $enum, ",
406                      $p->{$enum}, ", $valueRef),\n";
407            }
408        }
409        print "};\n\n";
410    }
411
412    print "/*eof*/\n";
413}
414
415#----------------------------------------------------------------------
416# Read in the files uchar.h, uscript.h, Blocks.txt,
417# PropertyAliases.txt, and PropertyValueAliases.txt,
418# and combine them into one hash.
419#
420# @param directory containing headers
421# @param directory containin Unicode data files
422#
423# @return hash ref, Unicode version
424sub readAndMerge {
425
426    my ($headerDir, $unidataDir) = @_;
427
428    my $h = read_uchar("$headerDir/uchar.h");
429    my $s = read_uscript("$headerDir/uscript.h");
430    my $b = read_Blocks("$unidataDir/Blocks.txt");
431    my $pa = {};
432    read_PropertyAliases($pa, "$unidataDir/PropertyAliases.txt");
433    read_PropertyAliases($pa, "SyntheticPropertyAliases.txt");
434    my $va = {};
435    read_PropertyValueAliases($va, "$unidataDir/PropertyValueAliases.txt");
436    read_PropertyValueAliases($va, "SyntheticPropertyValueAliases.txt");
437
438    # Extract property family hash
439    my $fam = $pa->{'_family'};
440    delete $pa->{'_family'};
441
442    # Note: uscript.h has no version string, so don't check it
443    my $version = check_versions([ 'uchar.h', $h ],
444                                 [ 'Blocks.txt', $b ],
445                                 [ 'PropertyAliases.txt', $pa ],
446                                 [ 'PropertyValueAliases.txt', $va ]);
447
448    # Do this BEFORE merging; merging modifies the hashes
449    check_PropertyValueAliases($pa, $va);
450
451    # Dump out the $va hash for debugging
452    if ($DEBUG) {
453        print "Property values hash:\n";
454        for my $key (sort keys %$va) {
455            my $hh = $va->{$key};
456            for my $subkey (sort keys %$hh) {
457                print "$key:$subkey:", $hh->{$subkey}, "\n";
458            }
459        }
460    }
461
462    # Dump out the $s hash for debugging
463    if ($DEBUG) {
464        print "Script hash:\n";
465        for my $key (sort keys %$s) {
466            print "$key:", $s->{$key}, "\n";
467        }
468    }
469
470    # Link in the script data
471    $h->{'sc'} = $s;
472
473    merge_Blocks($h, $b);
474
475    merge_PropertyAliases($h, $pa, $fam);
476
477    merge_PropertyValueAliases($h, $va);
478
479    ($h, $version);
480}
481
482#----------------------------------------------------------------------
483# Ensure that the version strings in the given hashes (under the key
484# '_version') are compatible.  Currently this means they must be
485# identical, with the exception that "X.Y" will match "X.Y.0".
486# All hashes must define the key '_version'.
487#
488# @param a list of pairs of (file name, hash reference)
489#
490# @return the version of all the hashes.  Upon return, the '_version'
491# will be removed from all hashes.
492sub check_versions {
493    my $version = '';
494    my $msg = '';
495    foreach my $a (@_) {
496        my $name = $a->[0];
497        my $h    = $a->[1];
498        die "Error: No version found" unless (exists $h->{'_version'});
499        my $v = $h->{'_version'};
500        delete $h->{'_version'};
501
502        # append ".0" if necessary, to standardize to X.Y.Z
503        $v .= '.0' unless ($v =~ /\.\d+\./);
504        $v .= '.0' unless ($v =~ /\.\d+\./);
505        $msg .= "$name = $v\n";
506        if ($version) {
507            die "Error: Mismatched Unicode versions\n$msg"
508                unless ($version eq $v);
509        } else {
510            $version = $v;
511        }
512    }
513    $version;
514}
515
516#----------------------------------------------------------------------
517# Make sure the property names in PropertyValueAliases.txt match those
518# in PropertyAliases.txt.
519#
520# @param a hash ref from read_PropertyAliases.
521# @param a hash ref from read_PropertyValueAliases.
522sub check_PropertyValueAliases {
523    my ($pa, $va) = @_;
524
525    # make a reverse hash of short->long
526    my %rev;
527    for (keys %$pa) { $rev{$pa->{$_}} = $_; }
528
529    for my $prop (keys %$va) {
530        if (!exists $rev{$prop} && !isPseudoProperty($prop)) {
531            print "Warning: Property $prop from PropertyValueAliases not listed in PropertyAliases\n";
532        }
533    }
534}
535
536#----------------------------------------------------------------------
537# Merge blocks data into uchar.h enum data.  In the 'blk' subhash all
538# code point values, as returned from read_uchar, are replaced by
539# block names, as read from Blocks.txt and returned by read_Blocks.
540# The match must be 1-to-1.  If there is any failure of 1-to-1
541# mapping, an error is signaled.  Upon return, the read_Blocks hash
542# is emptied of all contents, except for those that failed to match.
543#
544# The mapping in the 'blk' subhash, after this function returns, is
545# from uchar.h enum name, e.g. "UBLOCK_BASIC_LATIN", to Blocks.h
546# pseudo-name, e.g. "Basic Latin".
547#
548# @param a hash ref from read_uchar.
549# @param a hash ref from read_Blocks.
550sub merge_Blocks {
551    my ($h, $b) = @_;
552
553    die "Error: No blocks data in uchar.h"
554        unless (exists $h->{'blk'});
555    my $blk = $h->{'blk'};
556    for my $enum (keys %$blk) {
557        my $cp = $blk->{$enum};
558        if ($cp && !exists $b->{$cp}) {
559            die "Error: No block found at $cp in Blocks.txt";
560        }
561        # Convert code point to pseudo-name:
562        $blk->{$enum} = $b->{$cp};
563        delete $b->{$cp};
564    }
565    my $err = '';
566    for my $cp (keys %$b) {
567        $err .= "Error: Block " . $b->{$cp} . " not listed in uchar.h\n";
568    }
569    die $err if ($err);
570}
571
572#----------------------------------------------------------------------
573# Merge property alias names into the uchar.h hash.  The subhashes
574# under the keys _* (b(inary, e(numerated, s(tring, d(ouble) are
575# examined and the values of those subhashes are assumed to be long
576# names in PropertyAliases.txt.  They are validated and replaced by
577# "<short>|<long>".  Upon return, the read_PropertyAliases hash is
578# emptied of all contents, except for those that failed to match.
579# Unmatched names in PropertyAliases are listed as a warning but do
580# NOT cause the script to die.
581#
582# @param a hash ref from read_uchar.
583# @param a hash ref from read_PropertyAliases.
584# @param a hash mapping long names to property family (e.g., 'binary')
585sub merge_PropertyAliases {
586    my ($h, $pa, $fam) = @_;
587
588    for my $k (@TOP) {
589        die "Error: No properties data for $k in uchar.h"
590            unless (exists $h->{$k});
591    }
592
593    for my $subh (map { $h->{$_} } @TOP) {
594        for my $enum (keys %$subh) {
595            my $long_name = $subh->{$enum};
596            if (!exists $pa->{$long_name}) {
597                die "Error: Property $long_name not found (or used more than once)";
598            }
599
600            my $value;
601            if($pa->{$long_name} =~ m|^n/a\d*$|) {
602                # replace an "n/a" short name with an empty name (nothing before "|");
603                # don't remove it (don't remove the "|"): there must always be a long name,
604                # and if the short name is removed, then the long name becomes the
605                # short name and there is no long name left (unless there is another alias)
606                $value = "|" . $long_name;
607            } else {
608                $value = $pa->{$long_name} . "|" . $long_name;
609            }
610            if (exists $additional_property_aliases{$long_name}) {
611                $value .= "|" . $additional_property_aliases{$long_name};
612            }
613            $subh->{$enum} = $value;
614            delete $pa->{$long_name};
615        }
616    }
617
618    my @err;
619    for my $name (keys %$pa) {
620        $MISSING_FROM_UCHAR{$pa->{$name}} = 1;
621        if (exists $UNSUPPORTED{$name}) {
622            push @err, "Info: No enum for " . $fam->{$name} . " property $name in uchar.h";
623        } elsif (!isIgnoredProperty($name)) {
624            push @err, "Warning: No enum for " . $fam->{$name} . " property $name in uchar.h";
625        }
626    }
627    print join("\n", sort @err), "\n" if (@err);
628}
629
630#----------------------------------------------------------------------
631# Return 1 if two names match ignoring whitespace, '-', and '_'.
632# Used to match names in Blocks.txt with those in PropertyValueAliases.txt
633# as of Unicode 4.0.
634sub matchesLoosely {
635    my ($a, $b) = @_;
636    $a =~ s/[\s\-_]//g;
637    $b =~ s/[\s\-_]//g;
638    $a =~ /^$b$/i;
639}
640
641#----------------------------------------------------------------------
642# Merge PropertyValueAliases.txt data into the uchar.h hash.  All
643# properties other than blk, _bp, and _ep are analyzed and mapped to
644# the names listed in PropertyValueAliases.  They are then replaced
645# with a string of the form "<short>|<long>".  The short or long name
646# may be missing.
647#
648# @param a hash ref from read_uchar.
649# @param a hash ref from read_PropertyValueAliases.
650sub merge_PropertyValueAliases {
651    my ($h, $va) = @_;
652
653    my %gcCount;
654    for my $prop (keys %$h) {
655        # _bp, _ep handled in merge_PropertyAliases
656        next if ($prop =~ /^_/);
657
658        # Special case: gcm
659        my $prop2 = ($prop eq 'gcm') ? 'gc' : $prop;
660
661        # find corresponding PropertyValueAliases data
662        die "Error: Can't find $prop in PropertyValueAliases.txt"
663            unless (exists $va->{$prop2});
664        my $pva = $va->{$prop2};
665
666        # match up data
667        my $hh = $h->{$prop};
668        for my $enum (keys %$hh) {
669
670            my $name = $hh->{$enum};
671
672            # look up both long and short & ignore case
673            my $n;
674            if (exists $pva->{$name}) {
675                $n = $name;
676            } else {
677                # iterate (slow)
678                for my $a (keys %$pva) {
679                    # case-insensitive match
680                    # & case-insensitive reverse match
681                    if ($a =~ /^$name$/i ||
682                        $pva->{$a} =~ /^$name$/i) {
683                        $n = $a;
684                        last;
685                    }
686                }
687            }
688
689            # For blocks, do a loose match from Blocks.txt pseudo-name
690            # to PropertyValueAliases long name.
691            if (!$n && $prop eq 'blk') {
692                for my $a (keys %$pva) {
693                    # The block is only going to match the long name,
694                    # but we check both for completeness.  As of Unicode
695                    # 4.0, blocks do not have short names.
696                    if (matchesLoosely($name, $pva->{$a}) ||
697                        matchesLoosely($name, $a)) {
698                        $n = $a;
699                        last;
700                    }
701                }
702            }
703
704            die "Error: Property value $prop:$name not found" unless ($n);
705
706            my $l = $n;
707            my $r = $pva->{$n};
708            # convert |n/a\d*| to blank
709            $l = '' if ($l =~ m|^n/a\d*$|);
710            $r = '' if ($r =~ m|^n/a\d*$|);
711
712            $hh->{$enum} = "$l|$r";
713            # Don't delete the 'gc' properties because we need to share
714            # them between 'gc' and 'gcm'.  Count each use instead.
715            if ($prop2 eq 'gc') {
716                ++$gcCount{$n};
717            } else {
718                delete $pva->{$n};
719            }
720        }
721    }
722
723    # Merge the combining class values in manually
724    # Add the same values to the synthetic lccc and tccc properties
725    die "Error: No ccc data"
726        unless exists $va->{'ccc'};
727    for my $ccc (keys %{$va->{'ccc'}}) {
728        die "Error: Can't overwrite ccc $ccc"
729            if (exists $h->{'ccc'}->{$ccc});
730        $h->{'lccc'}->{$ccc} =
731        $h->{'tccc'}->{$ccc} =
732        $h->{'ccc'}->{$ccc} = $va->{'ccc'}->{$ccc};
733    }
734    delete $va->{'ccc'};
735
736    # Merge synthetic binary property values in manually.
737    # These are the "True" and "False" value aliases.
738    die "Error: No True/False value aliases"
739        unless exists $va->{'binprop'};
740    for my $bp (keys %{$va->{'binprop'}}) {
741        $h->{'binprop'}->{$bp} = $va->{'binprop'}->{$bp};
742    }
743    delete $va->{'binprop'};
744
745    my $err = '';
746    for my $prop (sort keys %$va) {
747        my $hh = $va->{$prop};
748        for my $subkey (sort keys %$hh) {
749            # 'gc' props are shared with 'gcm'; make sure they were used
750            # once or twice.
751            if ($prop eq 'gc') {
752                my $n = $gcCount{$subkey};
753                next if ($n >= 1 && $n <= 2);
754            }
755            $err .= "Warning: Enum for value $prop:$subkey not found in uchar.h\n"
756                unless exists $MISSING_FROM_UCHAR{$prop};
757        }
758    }
759    print $err if ($err);
760}
761
762#----------------------------------------------------------------------
763# Read the PropertyAliases.txt file.  Return a hash that maps the long
764# name to the short name.  The special key '_version' will map to the
765# Unicode version of the file.  The special key '_family' holds a
766# subhash that maps long names to a family string, for descriptive
767# purposes.
768#
769# @param a filename for PropertyAliases.txt
770# @param reference to hash to receive data.  Keys are long names.
771# Values are short names.
772sub read_PropertyAliases {
773
774    my $hash = shift;         # result
775
776    my $filename = shift;
777
778    my $fam = {};  # map long names to family string
779    $fam = $hash->{'_family'} if (exists $hash->{'_family'});
780
781    my $family; # binary, enumerated, etc.
782
783    my $in = new FileHandle($filename, 'r');
784    die "Error: Cannot open $filename" if (!defined $in);
785
786    while (<$in>) {
787
788        # Read version (embedded in a comment)
789        if (/PropertyAliases-(\d+\.\d+\.\d+)/i) {
790            die "Error: Multiple versions in $filename"
791                if (exists $hash->{'_version'});
792            $hash->{'_version'} = $1;
793        }
794
795        # Read family heading
796        if (/^\s*\#\s*(.+?)\s*Properties\s*$/) {
797            $family = $1;
798        }
799
800        # Ignore comments and blank lines
801        s/\#.*//;
802        next unless (/\S/);
803
804        if (/^\s*(.+?)\s*;/) {
805            my $short = $1;
806            my @fields = /;\s*([^\s;]+)/g;
807            if (@fields < 1) {
808                my $number = @fields;
809                die "Error: Wrong number of fields ($number) in $filename at $_";
810            }
811
812            # Make "n/a" strings unique
813            if ($short eq 'n/a') {
814                $short .= sprintf("%03d", $propNA++);
815            }
816            my $long = $fields[0];
817            if ($long eq 'n/a') {
818                $long .= sprintf("%03d", $propNA++);
819            }
820
821            # Add long name->short name to the hash=pa hash table
822            if (exists $hash->{$long}) {
823                die "Error: Duplicate property $long in $filename"
824            }
825            $hash->{$long} = $short;
826            $fam->{$long} = $family;
827
828            # Add the list of further aliases to the additional_property_aliases hash table,
829            # using the long property name as the key.
830            # For example:
831            #   White_Space->space|outer_space
832            if (@fields > 1) {
833                my $value = pop @fields;
834                while (@fields > 1) {
835                    $value .= "|" . pop @fields;
836                }
837                $additional_property_aliases{$long} = $value;
838            }
839        } else {
840            die "Error: Can't parse $_ in $filename";
841        }
842    }
843
844    $in->close();
845
846    $hash->{'_family'} = $fam;
847}
848
849#----------------------------------------------------------------------
850# Read the PropertyValueAliases.txt file.  Return a two level hash
851# that maps property_short_name:value_short_name:value_long_name.  In
852# the case of the 'ccc' property, the short name is the numeric class
853# and the long name is "<short>|<long>".  The special key '_version'
854# will map to the Unicode version of the file.
855#
856# @param a filename for PropertyValueAliases.txt
857#
858# @return a hash reference.
859sub read_PropertyValueAliases {
860
861    my $hash = shift;         # result
862
863    my $filename = shift;
864
865    my $in = new FileHandle($filename, 'r');
866    die "Error: Cannot open $filename" if (!defined $in);
867
868    while (<$in>) {
869
870        # Read version (embedded in a comment)
871        if (/PropertyValueAliases-(\d+\.\d+\.\d+)/i) {
872            die "Error: Multiple versions in $filename"
873                if (exists $hash->{'_version'});
874            $hash->{'_version'} = $1;
875        }
876
877        # Ignore comments and blank lines
878        s/\#.*//;
879        next unless (/\S/);
880
881        if (/^\s*(.+?)\s*;/i) {
882            my $prop = $1;
883            my @fields = /;\s*([^\s;]+)/g;
884            die "Error: Wrong number of fields in $filename"
885                if (@fields < 2 || @fields > 5);
886            # Make "n/a" strings unique
887            $fields[0] .= sprintf("%03d", $valueNA++) if ($fields[0] eq 'n/a');
888            # Squash extra fields together
889            while (@fields > 2) {
890                my $f = pop @fields;
891                $fields[$#fields] .= '|' . $f;
892            }
893            addDatum($hash, $prop, @fields);
894        }
895
896        else {
897            die "Error: Can't parse $_ in $filename";
898        }
899    }
900
901    $in->close();
902
903    # Script Copt=Qaac (Coptic) is a special case.
904    # Before the Copt code was defined, the private-use code Qaac was used.
905    # Starting with Unicode 4.1, PropertyValueAliases.txt contains
906    # Copt as the short name as well as Qaac as an alias.
907    # For use with older Unicode data files, we add here a Qaac->Coptic entry.
908    # This should not do anything for 4.1-and-later Unicode data files.
909    # See also UAX #24: Script Names http://www.unicode.org/unicode/reports/tr24/
910    $hash->{'sc'}->{'Qaac'} = 'Coptic'
911        unless (exists $hash->{'sc'}->{'Qaac'} || exists $hash->{'sc'}->{'Copt'});
912
913    # Add N|No|T|True and Y|Yes|F|False -- these are values we recognize for
914    # binary properties (until Unicode 5.0 NOT from PropertyValueAliases.txt).
915    # These are of the same form as the 'ccc' value aliases.
916    # Starting with Unicode 5.1, PropertyValueAliases.txt does have values
917    # for binary properties.
918    if (!exists $hash->{'binprop'}->{'0'}) {
919        if (exists $hash->{'Alpha'}->{'N'}) {
920            # Unicode 5.1 and later: Make the numeric value the key.
921            $hash->{'binprop'}->{'0'} = 'N|' . $hash->{'Alpha'}->{'N'};
922            $hash->{'binprop'}->{'1'} = 'Y|' . $hash->{'Alpha'}->{'Y'};
923        } elsif (exists $hash->{'Alpha'}) {
924            die "Error: Unrecognized short value name for binary property 'Alpha'\n";
925        } else {
926            # Unicode 5.0 and earlier: Add manually.
927            $hash->{'binprop'}->{'0'} = 'N|No|F|False';
928            $hash->{'binprop'}->{'1'} = 'Y|Yes|T|True';
929        }
930    }
931}
932
933#----------------------------------------------------------------------
934# Read the Blocks.txt file.  Return a hash that maps the code point
935# range start to the block name.  The special key '_version' will map
936# to the Unicode version of the file.
937#
938# As of Unicode 4.0, the names in the Blocks.txt are no longer the
939# proper names.  The proper names are now listed in PropertyValueAliases.
940# They are similar but not identical.  Furthermore, 4.0 introduces
941# a new block name, No_Block, which is listed only in PropertyValueAliases
942# and not in Blocks.txt.  As a result, we handle blocks as follows:
943#
944# 1. Read Blocks.txt to map code point range start to quasi-block name.
945# 2. Add to Blocks.txt a synthetic No Block code point & name:
946#    X -> No Block
947# 3. Map quasi-names from Blocks.txt (including No Block) to actual
948#    names from PropertyValueAliases.  This occurs in
949#    merge_PropertyValueAliases.
950#
951# @param a filename for Blocks.txt
952#
953# @return a ref to a hash.  Keys are code points, as text, e.g.,
954# "1720".  Values are pseudo-block names, e.g., "Hanunoo".
955sub read_Blocks {
956
957    my $filename = shift;
958
959    my $hash = {};         # result
960
961    my $in = new FileHandle($filename, 'r');
962    die "Error: Cannot open $filename" if (!defined $in);
963
964    while (<$in>) {
965
966        # Read version (embedded in a comment)
967        if (/Blocks-(\d+\.\d+\.\d+)/i) {
968            die "Error: Multiple versions in $filename"
969                if (exists $hash->{'_version'});
970            $hash->{'_version'} = $1;
971        }
972
973        # Ignore comments and blank lines
974        s/\#.*//;
975        next unless (/\S/);
976
977        if (/^([0-9a-f]+)\.\.[0-9a-f]+\s*;\s*(.+?)\s*$/i) {
978            die "Error: Duplicate range $1 in $filename"
979                if (exists $hash->{$1});
980            $hash->{$1} = $2;
981        }
982
983        else {
984            die "Error: Can't parse $_ in $filename";
985        }
986    }
987
988    $in->close();
989
990    # Add pseudo-name for No Block
991    $hash->{'none'} = 'No Block';
992
993    $hash;
994}
995
996#----------------------------------------------------------------------
997# Read the uscript.h file and compile a mapping of Unicode symbols to
998# icu4c enum values.
999#
1000# @param a filename for uscript.h
1001#
1002# @return a ref to a hash.  The keys of the hash are enum symbols from
1003# uscript.h, and the values are script names.
1004sub read_uscript {
1005
1006    my $filename = shift;
1007
1008    my $mode = '';         # state machine mode and submode
1009    my $submode = '';
1010
1011    my $last = '';         # for line folding
1012
1013    my $hash = {};         # result
1014    my $key;               # first-level key
1015
1016    my $in = new FileHandle($filename, 'r');
1017    die "Error: Cannot open $filename" if (!defined $in);
1018
1019    while (<$in>) {
1020        # Fold continued lines together
1021        if (/^(.*)\\$/) {
1022            $last = $1;
1023            next;
1024        } elsif ($last) {
1025            $_ = $last . $_;
1026            $last = '';
1027        }
1028
1029        # Exit all modes here
1030        if ($mode && $mode ne 'DEPRECATED') {
1031            if (/^\s*\}/) {
1032                $mode = '';
1033                next;
1034            }
1035        }
1036
1037        # Handle individual modes
1038
1039        if ($mode eq 'UScriptCode') {
1040            if (m|^\s*(USCRIPT_\w+).+?/\*\s*(\w+)|) {
1041                my ($enum, $code) = ($1, $2);
1042                die "Error: Duplicate script $enum"
1043                    if (exists $hash->{$enum});
1044                $hash->{$enum} = $code;
1045            }
1046        }
1047
1048        elsif ($mode eq 'DEPRECATED') {
1049            if (/\s*\#ifdef/) {
1050                die "Error: Nested #ifdef";
1051                }
1052            elsif (/\s*\#endif/) {
1053                $mode = '';
1054            }
1055        }
1056
1057        elsif (!$mode) {
1058            if (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
1059                   /^\s*typedef\s+enum\s+(\w+)\s*$/) {
1060                $mode = $1;
1061                #print "Parsing $mode\n";
1062            }
1063
1064            elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
1065                $mode = 'DEPRECATED';
1066            }
1067        }
1068    }
1069
1070    $in->close();
1071
1072    $hash;
1073}
1074
1075#----------------------------------------------------------------------
1076# Read the uchar.h file and compile a mapping of Unicode symbols to
1077# icu4c enum values.
1078#
1079# @param a filename for uchar.h
1080#
1081# @return a ref to a hash.  The keys of the hash are '_bp' for binary
1082# properties, '_ep' for enumerated properties, '_dp'/'_sp'/'_mp' for
1083# double/string/mask properties, and 'gc', 'gcm', 'bc', 'blk',
1084# 'ea', 'dt', 'jt', 'jg', 'lb', or 'nt' for corresponding property
1085# value aliases.  The values of the hash are subhashes.  The subhashes
1086# have a key of the uchar.h enum symbol, and a value of the alias
1087# string (as listed in PropertyValueAliases.txt).  NOTE: The alias
1088# string is whatever alias uchar.h lists.  This may be either short or
1089# long, depending on the specific enum.  NOTE: For blocks ('blk'), the
1090# value is a hex code point for the start of the associated block.
1091# NOTE: The special key _version will map to the Unicode version of
1092# the file.
1093sub read_uchar {
1094
1095    my $filename = shift;
1096
1097    my $mode = '';         # state machine mode and submode
1098    my $submode = '';
1099
1100    my $last = '';         # for line folding
1101
1102    my $hash = {};         # result
1103    my $key;               # first-level key
1104
1105    my $in = new FileHandle($filename, 'r');
1106    die "Error: Cannot open $filename" if (!defined $in);
1107
1108    while (<$in>) {
1109        # Fold continued lines together
1110        if (/^(.*)\\$/) {
1111            $last .= $1;
1112            next;
1113        } elsif ($last) {
1114            $_ = $last . $_;
1115            $last = '';
1116        }
1117
1118        # Exit all modes here
1119        if ($mode && $mode ne 'DEPRECATED') {
1120            if (/^\s*\}/) {
1121                $mode = '';
1122                next;
1123            }
1124        }
1125
1126        # Handle individual modes
1127
1128        if ($mode eq 'UProperty') {
1129            if (/^\s*(UCHAR_\w+)\s*[,=]/ || /^\s+(UCHAR_\w+)\s*$/) {
1130                if ($submode) {
1131                    addDatum($hash, $key, $1, $submode);
1132                    $submode = '';
1133                } else {
1134                    #print "Warning: Ignoring $1\n";
1135                }
1136            }
1137
1138            elsif (m|^\s*/\*\*\s*(\w+)\s+property\s+(\w+)|i) {
1139                die "Error: Unmatched tag $submode" if ($submode);
1140                die "Error: Unrecognized UProperty comment: $_"
1141                    unless (exists $PROP_TYPE{$1});
1142                $key = $PROP_TYPE{$1};
1143                $submode = $2;
1144            }
1145        }
1146
1147        elsif ($mode eq 'UCharCategory') {
1148            if (/^\s*(U_\w+)\s*=/) {
1149                if ($submode) {
1150                    addDatum($hash, 'gc', $1, $submode);
1151                    $submode = '';
1152                } else {
1153                    #print "Warning: Ignoring $1\n";
1154                }
1155            }
1156
1157            elsif (m|^\s*/\*\*\s*([A-Z][a-z])\s|) {
1158                die "Error: Unmatched tag $submode" if ($submode);
1159                $submode = $1;
1160            }
1161        }
1162
1163        elsif ($mode eq 'UCharDirection') {
1164            if (/^\s*(U_\w+)\s*[,=]/ || /^\s+(U_\w+)\s*$/) {
1165                if ($submode) {
1166                    addDatum($hash, $key, $1, $submode);
1167                    $submode = '';
1168                } else {
1169                    #print "Warning: Ignoring $1\n";
1170                }
1171            }
1172
1173            elsif (m|/\*\*\s*([A-Z]+)\s|) {
1174                die "Error: Unmatched tag $submode" if ($submode);
1175                $key = 'bc';
1176                $submode = $1;
1177            }
1178        }
1179
1180        elsif ($mode eq 'UBlockCode') {
1181            if (m|^\s*(UBLOCK_\w+).+?/\*\[(.+?)\]\*/|) {
1182                addDatum($hash, 'blk', $1, $2);
1183            }
1184        }
1185
1186        elsif ($mode eq 'UEastAsianWidth') {
1187            if (m|^\s*(U_EA_\w+).+?/\*\[(.+?)\]\*/|) {
1188                addDatum($hash, 'ea', $1, $2);
1189            }
1190        }
1191
1192        elsif ($mode eq 'UDecompositionType') {
1193            if (m|^\s*(U_DT_\w+).+?/\*\[(.+?)\]\*/|) {
1194                addDatum($hash, 'dt', $1, $2);
1195            }
1196        }
1197
1198        elsif ($mode eq 'UJoiningType') {
1199            if (m|^\s*(U_JT_\w+).+?/\*\[(.+?)\]\*/|) {
1200                addDatum($hash, 'jt', $1, $2);
1201            }
1202        }
1203
1204        elsif ($mode eq 'UJoiningGroup') {
1205            if (/^\s*(U_JG_(\w+))/) {
1206                addDatum($hash, 'jg', $1, $2) unless ($2 eq 'COUNT');
1207            }
1208        }
1209
1210        elsif ($mode eq 'UGraphemeClusterBreak') {
1211            if (m|^\s*(U_GCB_\w+).+?/\*\[(.+?)\]\*/|) {
1212                addDatum($hash, 'GCB', $1, $2);
1213            }
1214        }
1215
1216        elsif ($mode eq 'UWordBreakValues') {
1217            if (m|^\s*(U_WB_\w+).+?/\*\[(.+?)\]\*/|) {
1218                addDatum($hash, 'WB', $1, $2);
1219            }
1220        }
1221
1222        elsif ($mode eq 'USentenceBreak') {
1223            if (m|^\s*(U_SB_\w+).+?/\*\[(.+?)\]\*/|) {
1224                addDatum($hash, 'SB', $1, $2);
1225            }
1226        }
1227
1228        elsif ($mode eq 'ULineBreak') {
1229            if (m|^\s*(U_LB_\w+).+?/\*\[(.+?)\]\*/|) {
1230                addDatum($hash, 'lb', $1, $2);
1231            }
1232        }
1233
1234        elsif ($mode eq 'UNumericType') {
1235            if (m|^\s*(U_NT_\w+).+?/\*\[(.+?)\]\*/|) {
1236                addDatum($hash, 'nt', $1, $2);
1237            }
1238        }
1239
1240        elsif ($mode eq 'UHangulSyllableType') {
1241            if (m|^\s*(U_HST_\w+).+?/\*\[(.+?)\]\*/|) {
1242                addDatum($hash, 'hst', $1, $2);
1243            }
1244        }
1245
1246        elsif ($mode eq 'DEPRECATED') {
1247            if (/\s*\#ifdef/) {
1248                die "Error: Nested #ifdef";
1249                }
1250            elsif (/\s*\#endif/) {
1251                $mode = '';
1252            }
1253        }
1254
1255        elsif (!$mode) {
1256            if (/^\s*\#define\s+(\w+)\s+(.+)/) {
1257                # #define $left $right
1258                my ($left, $right) = ($1, $2);
1259
1260                if ($left eq 'U_UNICODE_VERSION') {
1261                    my $version = $right;
1262                    $version = $1 if ($version =~ /^\"(.*)\"/);
1263                    # print "Unicode version: ", $version, "\n";
1264                    die "Error: Multiple versions in $filename"
1265                        if (defined $hash->{'_version'});
1266                    $hash->{'_version'} = $version;
1267                }
1268
1269                elsif ($left =~ /U_GC_(\w+?)_MASK/) {
1270                    addDatum($hash, 'gcm', $left, $1);
1271                }
1272            }
1273
1274            elsif (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
1275                   /^\s*typedef\s+enum\s+(\w+)\s*$/) {
1276                $mode = $1;
1277                #print "Parsing $mode\n";
1278            }
1279
1280            elsif (/^\s*enum\s+(\w+)\s*\{/ ||
1281                   /^\s*enum\s+(\w+)\s*$/) {
1282                $mode = $1;
1283                #print "Parsing $mode\n";
1284            }
1285
1286            elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
1287                $mode = 'DEPRECATED';
1288            }
1289        }
1290    }
1291
1292    $in->close();
1293
1294    # hardcode known values for the normalization quick check properties
1295    # see unorm.h for the UNormalizationCheckResult enum
1296
1297    addDatum($hash, 'NFC_QC', 'UNORM_NO',    'N');
1298    addDatum($hash, 'NFC_QC', 'UNORM_YES',   'Y');
1299    addDatum($hash, 'NFC_QC', 'UNORM_MAYBE', 'M');
1300
1301    addDatum($hash, 'NFKC_QC', 'UNORM_NO',    'N');
1302    addDatum($hash, 'NFKC_QC', 'UNORM_YES',   'Y');
1303    addDatum($hash, 'NFKC_QC', 'UNORM_MAYBE', 'M');
1304
1305    # no "maybe" values for NF[K]D
1306
1307    addDatum($hash, 'NFD_QC', 'UNORM_NO',    'N');
1308    addDatum($hash, 'NFD_QC', 'UNORM_YES',   'Y');
1309
1310    addDatum($hash, 'NFKD_QC', 'UNORM_NO',    'N');
1311    addDatum($hash, 'NFKD_QC', 'UNORM_YES',   'Y');
1312
1313    $hash;
1314}
1315
1316#----------------------------------------------------------------------
1317# Add a new value to a two-level hash.  That is, given a ref to
1318# a hash, two keys, and a value, add $hash->{$key1}->{$key2} = $value.
1319sub addDatum {
1320    my ($h, $k1, $k2, $v) = @_;
1321    if (exists $h->{$k1}->{$k2}) {
1322        die "Error: $k1:$k2 already set to " .
1323            $h->{$k1}->{$k2} . ", cannot set to " . $v;
1324    }
1325    $h->{$k1}->{$k2} = $v;
1326}
1327
1328#eof
1329