1#!/bin/perl -w 2#******************************************************************* 3# COPYRIGHT: 4# Copyright (c) 2002-2009, International Business Machines Corporation and 5# others. All Rights Reserved. 6#******************************************************************* 7 8# This script reads in UCD files PropertyAliases.txt and 9# PropertyValueAliases.txt and correlates them with ICU enums 10# defined in uchar.h and uscript.h. It then outputs a header 11# file which contains all names and enums. The header is included 12# by the genpname tool C++ source file, which produces the actual 13# binary data file. 14# 15# See usage note below. 16# 17# TODO: The Property[Value]Alias.txt files state that they can support 18# more than 2 names per property|value. Currently (Unicode 3.2) there 19# are always 1 or 2 names. If more names were supported, presumably 20# the format would be something like: 21# nv ; Numeric_Value 22# nv ; Value_Numerique 23# CURRENTLY, this script assumes that there are 1 or two names. Any 24# duplicates it sees are flagged as an error. If multiple aliases 25# appear in a future version of Unicode, modify this script to support 26# that. 27# 28# NOTE: As of ICU 2.6, this script has been modified to know about the 29# pseudo-property gcm/General_Category_Mask, which corresponds to the 30# uchar.h property UCHAR_GENERAL_CATEGORY_MASK. This property 31# corresponds to General_Category but is a bitmask value. It does not 32# exist in the UCD. Therefore, I special case it in several places 33# (search for General_Category_Mask and gcm). 34# 35# NOTE: As of ICU 2.6, this script reads an auxiliary data file, 36# SyntheticPropertyAliases.txt, containing property aliases not 37# present in the UCD but present in ICU. This file resides in the 38# same directory as this script. Its contents are merged into those 39# of PropertyAliases.txt as if the two files were appended. 40# 41# NOTE: The following names are handled specially. See script below 42# for details. 43# 44# T/True 45# F/False 46# No_Block 47# 48# Author: Alan Liu 49# Created: October 14 2002 50# Since: ICU 2.4 51 52use FileHandle; 53use strict; 54use Dumpvalue; 55 56my $DEBUG = 1; 57my $DUMPER = new Dumpvalue; 58 59my $count = @ARGV; 60my $ICU_DIR = shift() || ''; 61my $OUT_FILE = shift() || 'data.h'; 62my $HEADER_DIR = "$ICU_DIR/source/common/unicode"; 63my $UNIDATA_DIR = "$ICU_DIR/source/data/unidata"; 64 65# Get the current year from the system 66my $YEAR = 1900+@{[localtime]}[5]; # Get the current year 67 68# Used to make "n/a" property [value] aliases (Unicode or Synthetic) unique 69my $propNA = 0; 70my $valueNA = 0; 71 72#---------------------------------------------------------------------- 73# Top level property keys for binary, enumerated, string, and double props 74my @TOP = qw( _bp _ep _sp _dp _mp ); 75 76# This hash governs how top level properties are grouped into output arrays. 77#my %TOP_PROPS = ( "VALUED" => [ '_bp', '_ep' ], 78# "NO_VALUE" => [ '_sp', '_dp' ] );m 79#my %TOP_PROPS = ( "BINARY" => [ '_bp' ], 80# "ENUMERATED" => [ '_ep' ], 81# "STRING" => [ '_sp' ], 82# "DOUBLE" => [ '_dp' ] ); 83my %TOP_PROPS = ( "" => [ '_bp', '_ep', '_sp', '_dp', '_mp' ] ); 84 85my %PROP_TYPE = (Binary => "_bp", 86 String => "_sp", 87 Double => "_dp", 88 Enumerated => "_ep", 89 Bitmask => "_mp"); 90#---------------------------------------------------------------------- 91 92# Properties that are unsupported in ICU 93my %UNSUPPORTED = (Composition_Exclusion => 1, 94 Decomposition_Mapping => 1, 95 Expands_On_NFC => 1, 96 Expands_On_NFD => 1, 97 Expands_On_NFKC => 1, 98 Expands_On_NFKD => 1, 99 FC_NFKC_Closure => 1, 100 ID_Start_Exceptions => 1, 101 Special_Case_Condition => 1, 102 ); 103 104# Short names of properties that weren't seen in uchar.h. If the 105# properties weren't seen, don't complain about the property values 106# missing. 107my %MISSING_FROM_UCHAR; 108 109# Additional property aliases beyond short and long names, 110# like space in addition to WSpace and White_Space in Unicode 4.1. 111# Hashtable, maps long name to alias. 112# For example, maps White_Space->space. 113# 114# If multiple additional aliases are defined, 115# then they are separated in the value string with '|'. 116# For example, White_Space->space|outer_space 117my %additional_property_aliases; 118 119#---------------------------------------------------------------------- 120 121# Emitted class names 122my ($STRING_CLASS, $ALIAS_CLASS, $PROPERTY_CLASS) = qw(AliasName Alias Property); 123 124if ($count < 1 || $count > 2 || 125 !-d $HEADER_DIR || 126 !-d $UNIDATA_DIR) { 127 my $me = $0; 128 $me =~ s|.+[/\\]||; 129 my $lm = ' ' x length($me); 130 print <<"END"; 131 132$me: Reads ICU4C headers and Unicode data files and creates 133$lm a C header file that is included by genpname. The header 134$lm file matches constants defined in the ICU4C headers with 135$lm property|value aliases in the Unicode data files. 136 137Usage: $me <icu_dir> [<out_file>] 138 139<icu_dir> ICU4C root directory, containing 140 source/common/unicode/uchar.h 141 source/common/unicode/uscript.h 142 source/data/unidata/Blocks.txt 143 source/data/unidata/PropertyAliases.txt 144 source/data/unidata/PropertyValueAliases.txt 145<out_file> File name of header to be written; 146 default is 'data.h'. 147 148The Unicode versions of all input files must match. 149END 150 exit(1); 151} 152 153my ($h, $version) = readAndMerge($HEADER_DIR, $UNIDATA_DIR); 154 155if ($DEBUG) { 156 print "Merged hash:\n"; 157 for my $key (sort keys %$h) { 158 my $hh = $h->{$key}; 159 for my $subkey (sort keys %$hh) { 160 print "$key:$subkey:", $hh->{$subkey}, "\n"; 161 } 162 } 163} 164 165my $out = new FileHandle($OUT_FILE, 'w'); 166die "Error: Can't write to $OUT_FILE: $!" unless (defined $out); 167my $save = select($out); 168formatData($h, $version); 169select($save); 170$out->close(); 171 172exit(0); 173 174#---------------------------------------------------------------------- 175# From PropList.html: "The properties of the form Other_XXX 176# are used to generate properties in DerivedCoreProperties.txt. 177# They are not intended for general use, such as in APIs that 178# return property values. 179# Non_Break is not a valid property as of 3.2. 180sub isIgnoredProperty { 181 local $_ = shift; 182 /^Other_/i || /^Non_Break$/i; 183} 184 185# 'qc' is a pseudo-property matching any quick-check property 186# see PropertyValueAliases.txt file comments. 'binprop' is 187# a synthetic binary value alias "True"/"False", not present 188# in PropertyValueAliases.txt until Unicode 5.0. 189# Starting with Unicode 5.1, PropertyValueAliases.txt does have 190# explicit values for binary properties. 191sub isPseudoProperty { 192 $_[0] eq 'qc' || 193 $_[0] eq 'binprop'; 194} 195 196#---------------------------------------------------------------------- 197# Emit the combined data from headers and the Unicode database as a 198# C source code header file. 199# 200# @param ref to hash with the data 201# @param Unicode version, as a string 202sub formatData { 203 my $h = shift; 204 my $version = shift; 205 206 my $date = scalar localtime(); 207 print <<"END"; 208/** 209 * Copyright (C) 2002-$YEAR, International Business Machines Corporation and 210 * others. All Rights Reserved. 211 * 212 * MACHINE GENERATED FILE. !!! Do not edit manually !!! 213 * 214 * Generated from 215 * uchar.h 216 * uscript.h 217 * Blocks.txt 218 * PropertyAliases.txt 219 * PropertyValueAliases.txt 220 * 221 * Date: $date 222 * Unicode version: $version 223 * Script: $0 224 */ 225 226END 227 228 #------------------------------------------------------------ 229 # Emit Unicode version 230 print "/* Unicode version $version */\n"; 231 my @v = split(/\./, $version); 232 push @v, '0' while (@v < 4); 233 for (my $i=0; $i<@v; ++$i) { 234 print "const uint8_t VERSION_$i = $v[$i];\n"; 235 } 236 print "\n"; 237 238 #------------------------------------------------------------ 239 # Emit String table 240 # [A table of all identifiers, that is, all long or short property 241 # or value names. The list need NOT be sorted; it will be sorted 242 # by the C program. Strings are referenced by their index into 243 # this table. After sorting, a REMAP[] array is used to map the 244 # old position indices to the new positions.] 245 my %strings; 246 for my $prop (sort keys %$h) { 247 my $hh = $h->{$prop}; 248 for my $enum (sort keys %$hh) { 249 my @a = split(/\|/, $hh->{$enum}); 250 for (@a) { 251 $strings{$_} = 1 if (length($_)); 252 } 253 } 254 } 255 my @strings = sort keys %strings; 256 unshift @strings, ""; 257 258 print "const int32_t STRING_COUNT = ", scalar @strings, ";\n\n"; 259 260 # while printing, create a mapping hash from string table entry to index 261 my %stringToID; 262 print "/* to be sorted */\n"; 263 print "const $STRING_CLASS STRING_TABLE[] = {\n"; 264 for (my $i=0; $i<@strings; ++$i) { 265 print " $STRING_CLASS(\"$strings[$i]\", $i),\n"; 266 $stringToID{$strings[$i]} = $i; 267 } 268 print "};\n\n"; 269 270 # placeholder for the remapping index. this is used to map 271 # indices that we compute here to indices of the sorted 272 # STRING_TABLE. STRING_TABLE will be sorted by the C++ program 273 # using the uprv_comparePropertyNames() function. this will 274 # reshuffle the order. we then use the indices (passed to the 275 # String constructor) to create a REMAP[] array. 276 print "/* to be filled in */\n"; 277 print "int32_t REMAP[", scalar @strings, "];\n\n"; 278 279 #------------------------------------------------------------ 280 # Emit the name group table 281 # [A table of name groups. A name group is one or more names 282 # for a property or property value. The Unicode data files specify 283 # that there may be more than 2, although as of Unicode 3.2 there 284 # are at most 2. The name group table looks like this: 285 # 286 # 114, -115, 116, -117, 0, -118, 65, -64, ... 287 # [0] [2] [4] [6] 288 # 289 # The entry at [0] consists of 2 strings, 114 and 115. 290 # The entry at [2] consists of 116 and 117. The entry at 291 # [4] is one string, 118. There is always at least one 292 # string; typically there are two. If there are two, the first 293 # is the SHORT name and the second is the LONG. If there is 294 # one, then the missing entry (always the short name, in 3.2) 295 # is zero, which is by definition the index of "". The 296 # 'preferred' name will generally be the LONG name, if there are 297 # more than 2 entries. The last entry is negative. 298 299 # Build name group list and replace string refs with nameGroup indices 300 my @nameGroups; 301 302 # Check for duplicate name groups, and reuse them if possible 303 my %groupToInt; # Map group strings to ints 304 for my $prop (sort keys %$h) { 305 my $hh = $h->{$prop}; 306 for my $enum (sort keys %$hh) { 307 my $groupString = $hh->{$enum}; 308 my $i; 309 if (exists $groupToInt{$groupString}) { 310 $i = $groupToInt{$groupString}; 311 } else { 312 my @names = split(/\|/, $groupString); 313 die "Error: Wrong number of names in " . $groupString if (@names < 1); 314 $i = @nameGroups; # index of group we are making 315 $groupToInt{$groupString} = $i; # Cache for reuse 316 push @nameGroups, map { $stringToID{$_} } @names; 317 $nameGroups[$#nameGroups] = -$nameGroups[$#nameGroups]; # mark end 318 } 319 # now, replace string list with ref to name group 320 $hh->{$enum} = $i; 321 } 322 } 323 324 print "const int32_t NAME_GROUP_COUNT = ", 325 scalar @nameGroups, ";\n\n"; 326 327 print "int32_t NAME_GROUP[] = {\n"; 328 # emit one group per line, with annotations 329 my $max_names = 0; 330 for (my $i=0; $i<@nameGroups; ) { 331 my @a; 332 my $line; 333 my $start = $i; 334 for (;;) { 335 my $j = $nameGroups[$i++]; 336 $line .= "$j, "; 337 push @a, abs($j); 338 last if ($j < 0); 339 } 340 print " ", 341 $line, 342 ' 'x(20-length($line)), 343 "/* ", sprintf("%3d", $start), 344 ": \"", join("\", \"", map { $strings[$_] } @a), "\" */\n"; 345 $max_names = @a if(@a > $max_names); 346 347 } 348 print "};\n\n"; 349 350 # This is fixed for 3.2 at "2" but should be calculated dynamically 351 # when more than 2 names appear in Property[Value]Aliases.txt. 352 print "#define MAX_NAMES_PER_GROUP $max_names\n\n"; 353 354 #------------------------------------------------------------ 355 # Emit enumerated property values 356 for my $prop (sort keys %$h) { 357 next if ($prop =~ /^_/); 358 my $vh = $h->{$prop}; 359 my $count = scalar keys %$vh; 360 361 print "const int32_t VALUES_${prop}_COUNT = ", 362 $count, ";\n\n"; 363 364 print "const $ALIAS_CLASS VALUES_${prop}\[] = {\n"; 365 for my $enum (sort keys %$vh) { 366 #my @names = split(/\|/, $vh->{$enum}); 367 #die "Error: Wrong number of names for $prop:$enum in [" . join(",", @names) . "]" 368 # if (@names != 2); 369 print " $ALIAS_CLASS((int32_t) $enum, ", $vh->{$enum}, "),\n"; 370 #$stringToID{$names[0]}, ", ", 371 #$stringToID{$names[1]}, "),\n"; 372 # "\"", $names[0], "\", ", 373 # "\"", $names[1], "\"),\n"; 374 } 375 print "};\n\n"; 376 } 377 378 #------------------------------------------------------------ 379 # Emit top-level properties (binary, enumerated, etc.) 380 for my $topName (sort keys %TOP_PROPS) { 381 my $a = $TOP_PROPS{$topName}; 382 my $count = 0; 383 for my $type (@$a) { # "_bp", "_ep", etc. 384 $count += scalar keys %{$h->{$type}}; 385 } 386 387 print "const int32_t ${topName}PROPERTY_COUNT = $count;\n\n"; 388 389 print "const $PROPERTY_CLASS ${topName}PROPERTY[] = {\n"; 390 391 for my $type (@$a) { # "_bp", "_ep", etc. 392 my $p = $h->{$type}; 393 394 for my $enum (sort keys %$p) { 395 my $name = $strings[$nameGroups[$p->{$enum}]]; 396 397 my $valueRef = "0, NULL"; 398 if ($type eq '_bp') { 399 $valueRef = "VALUES_binprop_COUNT, VALUES_binprop"; 400 } 401 elsif (exists $h->{$name}) { 402 $valueRef = "VALUES_${name}_COUNT, VALUES_$name"; 403 } 404 405 print " $PROPERTY_CLASS((int32_t) $enum, ", 406 $p->{$enum}, ", $valueRef),\n"; 407 } 408 } 409 print "};\n\n"; 410 } 411 412 print "/*eof*/\n"; 413} 414 415#---------------------------------------------------------------------- 416# Read in the files uchar.h, uscript.h, Blocks.txt, 417# PropertyAliases.txt, and PropertyValueAliases.txt, 418# and combine them into one hash. 419# 420# @param directory containing headers 421# @param directory containin Unicode data files 422# 423# @return hash ref, Unicode version 424sub readAndMerge { 425 426 my ($headerDir, $unidataDir) = @_; 427 428 my $h = read_uchar("$headerDir/uchar.h"); 429 my $s = read_uscript("$headerDir/uscript.h"); 430 my $b = read_Blocks("$unidataDir/Blocks.txt"); 431 my $pa = {}; 432 read_PropertyAliases($pa, "$unidataDir/PropertyAliases.txt"); 433 read_PropertyAliases($pa, "SyntheticPropertyAliases.txt"); 434 my $va = {}; 435 read_PropertyValueAliases($va, "$unidataDir/PropertyValueAliases.txt"); 436 read_PropertyValueAliases($va, "SyntheticPropertyValueAliases.txt"); 437 438 # Extract property family hash 439 my $fam = $pa->{'_family'}; 440 delete $pa->{'_family'}; 441 442 # Note: uscript.h has no version string, so don't check it 443 my $version = check_versions([ 'uchar.h', $h ], 444 [ 'Blocks.txt', $b ], 445 [ 'PropertyAliases.txt', $pa ], 446 [ 'PropertyValueAliases.txt', $va ]); 447 448 # Do this BEFORE merging; merging modifies the hashes 449 check_PropertyValueAliases($pa, $va); 450 451 # Dump out the $va hash for debugging 452 if ($DEBUG) { 453 print "Property values hash:\n"; 454 for my $key (sort keys %$va) { 455 my $hh = $va->{$key}; 456 for my $subkey (sort keys %$hh) { 457 print "$key:$subkey:", $hh->{$subkey}, "\n"; 458 } 459 } 460 } 461 462 # Dump out the $s hash for debugging 463 if ($DEBUG) { 464 print "Script hash:\n"; 465 for my $key (sort keys %$s) { 466 print "$key:", $s->{$key}, "\n"; 467 } 468 } 469 470 # Link in the script data 471 $h->{'sc'} = $s; 472 473 merge_Blocks($h, $b); 474 475 merge_PropertyAliases($h, $pa, $fam); 476 477 merge_PropertyValueAliases($h, $va); 478 479 ($h, $version); 480} 481 482#---------------------------------------------------------------------- 483# Ensure that the version strings in the given hashes (under the key 484# '_version') are compatible. Currently this means they must be 485# identical, with the exception that "X.Y" will match "X.Y.0". 486# All hashes must define the key '_version'. 487# 488# @param a list of pairs of (file name, hash reference) 489# 490# @return the version of all the hashes. Upon return, the '_version' 491# will be removed from all hashes. 492sub check_versions { 493 my $version = ''; 494 my $msg = ''; 495 foreach my $a (@_) { 496 my $name = $a->[0]; 497 my $h = $a->[1]; 498 die "Error: No version found" unless (exists $h->{'_version'}); 499 my $v = $h->{'_version'}; 500 delete $h->{'_version'}; 501 502 # append ".0" if necessary, to standardize to X.Y.Z 503 $v .= '.0' unless ($v =~ /\.\d+\./); 504 $v .= '.0' unless ($v =~ /\.\d+\./); 505 $msg .= "$name = $v\n"; 506 if ($version) { 507 die "Error: Mismatched Unicode versions\n$msg" 508 unless ($version eq $v); 509 } else { 510 $version = $v; 511 } 512 } 513 $version; 514} 515 516#---------------------------------------------------------------------- 517# Make sure the property names in PropertyValueAliases.txt match those 518# in PropertyAliases.txt. 519# 520# @param a hash ref from read_PropertyAliases. 521# @param a hash ref from read_PropertyValueAliases. 522sub check_PropertyValueAliases { 523 my ($pa, $va) = @_; 524 525 # make a reverse hash of short->long 526 my %rev; 527 for (keys %$pa) { $rev{$pa->{$_}} = $_; } 528 529 for my $prop (keys %$va) { 530 if (!exists $rev{$prop} && !isPseudoProperty($prop)) { 531 print "Warning: Property $prop from PropertyValueAliases not listed in PropertyAliases\n"; 532 } 533 } 534} 535 536#---------------------------------------------------------------------- 537# Merge blocks data into uchar.h enum data. In the 'blk' subhash all 538# code point values, as returned from read_uchar, are replaced by 539# block names, as read from Blocks.txt and returned by read_Blocks. 540# The match must be 1-to-1. If there is any failure of 1-to-1 541# mapping, an error is signaled. Upon return, the read_Blocks hash 542# is emptied of all contents, except for those that failed to match. 543# 544# The mapping in the 'blk' subhash, after this function returns, is 545# from uchar.h enum name, e.g. "UBLOCK_BASIC_LATIN", to Blocks.h 546# pseudo-name, e.g. "Basic Latin". 547# 548# @param a hash ref from read_uchar. 549# @param a hash ref from read_Blocks. 550sub merge_Blocks { 551 my ($h, $b) = @_; 552 553 die "Error: No blocks data in uchar.h" 554 unless (exists $h->{'blk'}); 555 my $blk = $h->{'blk'}; 556 for my $enum (keys %$blk) { 557 my $cp = $blk->{$enum}; 558 if ($cp && !exists $b->{$cp}) { 559 die "Error: No block found at $cp in Blocks.txt"; 560 } 561 # Convert code point to pseudo-name: 562 $blk->{$enum} = $b->{$cp}; 563 delete $b->{$cp}; 564 } 565 my $err = ''; 566 for my $cp (keys %$b) { 567 $err .= "Error: Block " . $b->{$cp} . " not listed in uchar.h\n"; 568 } 569 die $err if ($err); 570} 571 572#---------------------------------------------------------------------- 573# Merge property alias names into the uchar.h hash. The subhashes 574# under the keys _* (b(inary, e(numerated, s(tring, d(ouble) are 575# examined and the values of those subhashes are assumed to be long 576# names in PropertyAliases.txt. They are validated and replaced by 577# "<short>|<long>". Upon return, the read_PropertyAliases hash is 578# emptied of all contents, except for those that failed to match. 579# Unmatched names in PropertyAliases are listed as a warning but do 580# NOT cause the script to die. 581# 582# @param a hash ref from read_uchar. 583# @param a hash ref from read_PropertyAliases. 584# @param a hash mapping long names to property family (e.g., 'binary') 585sub merge_PropertyAliases { 586 my ($h, $pa, $fam) = @_; 587 588 for my $k (@TOP) { 589 die "Error: No properties data for $k in uchar.h" 590 unless (exists $h->{$k}); 591 } 592 593 for my $subh (map { $h->{$_} } @TOP) { 594 for my $enum (keys %$subh) { 595 my $long_name = $subh->{$enum}; 596 if (!exists $pa->{$long_name}) { 597 die "Error: Property $long_name not found (or used more than once)"; 598 } 599 600 my $value; 601 if($pa->{$long_name} =~ m|^n/a\d*$|) { 602 # replace an "n/a" short name with an empty name (nothing before "|"); 603 # don't remove it (don't remove the "|"): there must always be a long name, 604 # and if the short name is removed, then the long name becomes the 605 # short name and there is no long name left (unless there is another alias) 606 $value = "|" . $long_name; 607 } else { 608 $value = $pa->{$long_name} . "|" . $long_name; 609 } 610 if (exists $additional_property_aliases{$long_name}) { 611 $value .= "|" . $additional_property_aliases{$long_name}; 612 } 613 $subh->{$enum} = $value; 614 delete $pa->{$long_name}; 615 } 616 } 617 618 my @err; 619 for my $name (keys %$pa) { 620 $MISSING_FROM_UCHAR{$pa->{$name}} = 1; 621 if (exists $UNSUPPORTED{$name}) { 622 push @err, "Info: No enum for " . $fam->{$name} . " property $name in uchar.h"; 623 } elsif (!isIgnoredProperty($name)) { 624 push @err, "Warning: No enum for " . $fam->{$name} . " property $name in uchar.h"; 625 } 626 } 627 print join("\n", sort @err), "\n" if (@err); 628} 629 630#---------------------------------------------------------------------- 631# Return 1 if two names match ignoring whitespace, '-', and '_'. 632# Used to match names in Blocks.txt with those in PropertyValueAliases.txt 633# as of Unicode 4.0. 634sub matchesLoosely { 635 my ($a, $b) = @_; 636 $a =~ s/[\s\-_]//g; 637 $b =~ s/[\s\-_]//g; 638 $a =~ /^$b$/i; 639} 640 641#---------------------------------------------------------------------- 642# Merge PropertyValueAliases.txt data into the uchar.h hash. All 643# properties other than blk, _bp, and _ep are analyzed and mapped to 644# the names listed in PropertyValueAliases. They are then replaced 645# with a string of the form "<short>|<long>". The short or long name 646# may be missing. 647# 648# @param a hash ref from read_uchar. 649# @param a hash ref from read_PropertyValueAliases. 650sub merge_PropertyValueAliases { 651 my ($h, $va) = @_; 652 653 my %gcCount; 654 for my $prop (keys %$h) { 655 # _bp, _ep handled in merge_PropertyAliases 656 next if ($prop =~ /^_/); 657 658 # Special case: gcm 659 my $prop2 = ($prop eq 'gcm') ? 'gc' : $prop; 660 661 # find corresponding PropertyValueAliases data 662 die "Error: Can't find $prop in PropertyValueAliases.txt" 663 unless (exists $va->{$prop2}); 664 my $pva = $va->{$prop2}; 665 666 # match up data 667 my $hh = $h->{$prop}; 668 for my $enum (keys %$hh) { 669 670 my $name = $hh->{$enum}; 671 672 # look up both long and short & ignore case 673 my $n; 674 if (exists $pva->{$name}) { 675 $n = $name; 676 } else { 677 # iterate (slow) 678 for my $a (keys %$pva) { 679 # case-insensitive match 680 # & case-insensitive reverse match 681 if ($a =~ /^$name$/i || 682 $pva->{$a} =~ /^$name$/i) { 683 $n = $a; 684 last; 685 } 686 } 687 } 688 689 # For blocks, do a loose match from Blocks.txt pseudo-name 690 # to PropertyValueAliases long name. 691 if (!$n && $prop eq 'blk') { 692 for my $a (keys %$pva) { 693 # The block is only going to match the long name, 694 # but we check both for completeness. As of Unicode 695 # 4.0, blocks do not have short names. 696 if (matchesLoosely($name, $pva->{$a}) || 697 matchesLoosely($name, $a)) { 698 $n = $a; 699 last; 700 } 701 } 702 } 703 704 die "Error: Property value $prop:$name not found" unless ($n); 705 706 my $l = $n; 707 my $r = $pva->{$n}; 708 # convert |n/a\d*| to blank 709 $l = '' if ($l =~ m|^n/a\d*$|); 710 $r = '' if ($r =~ m|^n/a\d*$|); 711 712 $hh->{$enum} = "$l|$r"; 713 # Don't delete the 'gc' properties because we need to share 714 # them between 'gc' and 'gcm'. Count each use instead. 715 if ($prop2 eq 'gc') { 716 ++$gcCount{$n}; 717 } else { 718 delete $pva->{$n}; 719 } 720 } 721 } 722 723 # Merge the combining class values in manually 724 # Add the same values to the synthetic lccc and tccc properties 725 die "Error: No ccc data" 726 unless exists $va->{'ccc'}; 727 for my $ccc (keys %{$va->{'ccc'}}) { 728 die "Error: Can't overwrite ccc $ccc" 729 if (exists $h->{'ccc'}->{$ccc}); 730 $h->{'lccc'}->{$ccc} = 731 $h->{'tccc'}->{$ccc} = 732 $h->{'ccc'}->{$ccc} = $va->{'ccc'}->{$ccc}; 733 } 734 delete $va->{'ccc'}; 735 736 # Merge synthetic binary property values in manually. 737 # These are the "True" and "False" value aliases. 738 die "Error: No True/False value aliases" 739 unless exists $va->{'binprop'}; 740 for my $bp (keys %{$va->{'binprop'}}) { 741 $h->{'binprop'}->{$bp} = $va->{'binprop'}->{$bp}; 742 } 743 delete $va->{'binprop'}; 744 745 my $err = ''; 746 for my $prop (sort keys %$va) { 747 my $hh = $va->{$prop}; 748 for my $subkey (sort keys %$hh) { 749 # 'gc' props are shared with 'gcm'; make sure they were used 750 # once or twice. 751 if ($prop eq 'gc') { 752 my $n = $gcCount{$subkey}; 753 next if ($n >= 1 && $n <= 2); 754 } 755 $err .= "Warning: Enum for value $prop:$subkey not found in uchar.h\n" 756 unless exists $MISSING_FROM_UCHAR{$prop}; 757 } 758 } 759 print $err if ($err); 760} 761 762#---------------------------------------------------------------------- 763# Read the PropertyAliases.txt file. Return a hash that maps the long 764# name to the short name. The special key '_version' will map to the 765# Unicode version of the file. The special key '_family' holds a 766# subhash that maps long names to a family string, for descriptive 767# purposes. 768# 769# @param a filename for PropertyAliases.txt 770# @param reference to hash to receive data. Keys are long names. 771# Values are short names. 772sub read_PropertyAliases { 773 774 my $hash = shift; # result 775 776 my $filename = shift; 777 778 my $fam = {}; # map long names to family string 779 $fam = $hash->{'_family'} if (exists $hash->{'_family'}); 780 781 my $family; # binary, enumerated, etc. 782 783 my $in = new FileHandle($filename, 'r'); 784 die "Error: Cannot open $filename" if (!defined $in); 785 786 while (<$in>) { 787 788 # Read version (embedded in a comment) 789 if (/PropertyAliases-(\d+\.\d+\.\d+)/i) { 790 die "Error: Multiple versions in $filename" 791 if (exists $hash->{'_version'}); 792 $hash->{'_version'} = $1; 793 } 794 795 # Read family heading 796 if (/^\s*\#\s*(.+?)\s*Properties\s*$/) { 797 $family = $1; 798 } 799 800 # Ignore comments and blank lines 801 s/\#.*//; 802 next unless (/\S/); 803 804 if (/^\s*(.+?)\s*;/) { 805 my $short = $1; 806 my @fields = /;\s*([^\s;]+)/g; 807 if (@fields < 1) { 808 my $number = @fields; 809 die "Error: Wrong number of fields ($number) in $filename at $_"; 810 } 811 812 # Make "n/a" strings unique 813 if ($short eq 'n/a') { 814 $short .= sprintf("%03d", $propNA++); 815 } 816 my $long = $fields[0]; 817 if ($long eq 'n/a') { 818 $long .= sprintf("%03d", $propNA++); 819 } 820 821 # Add long name->short name to the hash=pa hash table 822 if (exists $hash->{$long}) { 823 die "Error: Duplicate property $long in $filename" 824 } 825 $hash->{$long} = $short; 826 $fam->{$long} = $family; 827 828 # Add the list of further aliases to the additional_property_aliases hash table, 829 # using the long property name as the key. 830 # For example: 831 # White_Space->space|outer_space 832 if (@fields > 1) { 833 my $value = pop @fields; 834 while (@fields > 1) { 835 $value .= "|" . pop @fields; 836 } 837 $additional_property_aliases{$long} = $value; 838 } 839 } else { 840 die "Error: Can't parse $_ in $filename"; 841 } 842 } 843 844 $in->close(); 845 846 $hash->{'_family'} = $fam; 847} 848 849#---------------------------------------------------------------------- 850# Read the PropertyValueAliases.txt file. Return a two level hash 851# that maps property_short_name:value_short_name:value_long_name. In 852# the case of the 'ccc' property, the short name is the numeric class 853# and the long name is "<short>|<long>". The special key '_version' 854# will map to the Unicode version of the file. 855# 856# @param a filename for PropertyValueAliases.txt 857# 858# @return a hash reference. 859sub read_PropertyValueAliases { 860 861 my $hash = shift; # result 862 863 my $filename = shift; 864 865 my $in = new FileHandle($filename, 'r'); 866 die "Error: Cannot open $filename" if (!defined $in); 867 868 while (<$in>) { 869 870 # Read version (embedded in a comment) 871 if (/PropertyValueAliases-(\d+\.\d+\.\d+)/i) { 872 die "Error: Multiple versions in $filename" 873 if (exists $hash->{'_version'}); 874 $hash->{'_version'} = $1; 875 } 876 877 # Ignore comments and blank lines 878 s/\#.*//; 879 next unless (/\S/); 880 881 if (/^\s*(.+?)\s*;/i) { 882 my $prop = $1; 883 my @fields = /;\s*([^\s;]+)/g; 884 die "Error: Wrong number of fields in $filename" 885 if (@fields < 2 || @fields > 5); 886 # Make "n/a" strings unique 887 $fields[0] .= sprintf("%03d", $valueNA++) if ($fields[0] eq 'n/a'); 888 # Squash extra fields together 889 while (@fields > 2) { 890 my $f = pop @fields; 891 $fields[$#fields] .= '|' . $f; 892 } 893 addDatum($hash, $prop, @fields); 894 } 895 896 else { 897 die "Error: Can't parse $_ in $filename"; 898 } 899 } 900 901 $in->close(); 902 903 # Script Copt=Qaac (Coptic) is a special case. 904 # Before the Copt code was defined, the private-use code Qaac was used. 905 # Starting with Unicode 4.1, PropertyValueAliases.txt contains 906 # Copt as the short name as well as Qaac as an alias. 907 # For use with older Unicode data files, we add here a Qaac->Coptic entry. 908 # This should not do anything for 4.1-and-later Unicode data files. 909 # See also UAX #24: Script Names http://www.unicode.org/unicode/reports/tr24/ 910 $hash->{'sc'}->{'Qaac'} = 'Coptic' 911 unless (exists $hash->{'sc'}->{'Qaac'} || exists $hash->{'sc'}->{'Copt'}); 912 913 # Add N|No|T|True and Y|Yes|F|False -- these are values we recognize for 914 # binary properties (until Unicode 5.0 NOT from PropertyValueAliases.txt). 915 # These are of the same form as the 'ccc' value aliases. 916 # Starting with Unicode 5.1, PropertyValueAliases.txt does have values 917 # for binary properties. 918 if (!exists $hash->{'binprop'}->{'0'}) { 919 if (exists $hash->{'Alpha'}->{'N'}) { 920 # Unicode 5.1 and later: Make the numeric value the key. 921 $hash->{'binprop'}->{'0'} = 'N|' . $hash->{'Alpha'}->{'N'}; 922 $hash->{'binprop'}->{'1'} = 'Y|' . $hash->{'Alpha'}->{'Y'}; 923 } elsif (exists $hash->{'Alpha'}) { 924 die "Error: Unrecognized short value name for binary property 'Alpha'\n"; 925 } else { 926 # Unicode 5.0 and earlier: Add manually. 927 $hash->{'binprop'}->{'0'} = 'N|No|F|False'; 928 $hash->{'binprop'}->{'1'} = 'Y|Yes|T|True'; 929 } 930 } 931} 932 933#---------------------------------------------------------------------- 934# Read the Blocks.txt file. Return a hash that maps the code point 935# range start to the block name. The special key '_version' will map 936# to the Unicode version of the file. 937# 938# As of Unicode 4.0, the names in the Blocks.txt are no longer the 939# proper names. The proper names are now listed in PropertyValueAliases. 940# They are similar but not identical. Furthermore, 4.0 introduces 941# a new block name, No_Block, which is listed only in PropertyValueAliases 942# and not in Blocks.txt. As a result, we handle blocks as follows: 943# 944# 1. Read Blocks.txt to map code point range start to quasi-block name. 945# 2. Add to Blocks.txt a synthetic No Block code point & name: 946# X -> No Block 947# 3. Map quasi-names from Blocks.txt (including No Block) to actual 948# names from PropertyValueAliases. This occurs in 949# merge_PropertyValueAliases. 950# 951# @param a filename for Blocks.txt 952# 953# @return a ref to a hash. Keys are code points, as text, e.g., 954# "1720". Values are pseudo-block names, e.g., "Hanunoo". 955sub read_Blocks { 956 957 my $filename = shift; 958 959 my $hash = {}; # result 960 961 my $in = new FileHandle($filename, 'r'); 962 die "Error: Cannot open $filename" if (!defined $in); 963 964 while (<$in>) { 965 966 # Read version (embedded in a comment) 967 if (/Blocks-(\d+\.\d+\.\d+)/i) { 968 die "Error: Multiple versions in $filename" 969 if (exists $hash->{'_version'}); 970 $hash->{'_version'} = $1; 971 } 972 973 # Ignore comments and blank lines 974 s/\#.*//; 975 next unless (/\S/); 976 977 if (/^([0-9a-f]+)\.\.[0-9a-f]+\s*;\s*(.+?)\s*$/i) { 978 die "Error: Duplicate range $1 in $filename" 979 if (exists $hash->{$1}); 980 $hash->{$1} = $2; 981 } 982 983 else { 984 die "Error: Can't parse $_ in $filename"; 985 } 986 } 987 988 $in->close(); 989 990 # Add pseudo-name for No Block 991 $hash->{'none'} = 'No Block'; 992 993 $hash; 994} 995 996#---------------------------------------------------------------------- 997# Read the uscript.h file and compile a mapping of Unicode symbols to 998# icu4c enum values. 999# 1000# @param a filename for uscript.h 1001# 1002# @return a ref to a hash. The keys of the hash are enum symbols from 1003# uscript.h, and the values are script names. 1004sub read_uscript { 1005 1006 my $filename = shift; 1007 1008 my $mode = ''; # state machine mode and submode 1009 my $submode = ''; 1010 1011 my $last = ''; # for line folding 1012 1013 my $hash = {}; # result 1014 my $key; # first-level key 1015 1016 my $in = new FileHandle($filename, 'r'); 1017 die "Error: Cannot open $filename" if (!defined $in); 1018 1019 while (<$in>) { 1020 # Fold continued lines together 1021 if (/^(.*)\\$/) { 1022 $last = $1; 1023 next; 1024 } elsif ($last) { 1025 $_ = $last . $_; 1026 $last = ''; 1027 } 1028 1029 # Exit all modes here 1030 if ($mode && $mode ne 'DEPRECATED') { 1031 if (/^\s*\}/) { 1032 $mode = ''; 1033 next; 1034 } 1035 } 1036 1037 # Handle individual modes 1038 1039 if ($mode eq 'UScriptCode') { 1040 if (m|^\s*(USCRIPT_\w+).+?/\*\s*(\w+)|) { 1041 my ($enum, $code) = ($1, $2); 1042 die "Error: Duplicate script $enum" 1043 if (exists $hash->{$enum}); 1044 $hash->{$enum} = $code; 1045 } 1046 } 1047 1048 elsif ($mode eq 'DEPRECATED') { 1049 if (/\s*\#ifdef/) { 1050 die "Error: Nested #ifdef"; 1051 } 1052 elsif (/\s*\#endif/) { 1053 $mode = ''; 1054 } 1055 } 1056 1057 elsif (!$mode) { 1058 if (/^\s*typedef\s+enum\s+(\w+)\s*\{/ || 1059 /^\s*typedef\s+enum\s+(\w+)\s*$/) { 1060 $mode = $1; 1061 #print "Parsing $mode\n"; 1062 } 1063 1064 elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) { 1065 $mode = 'DEPRECATED'; 1066 } 1067 } 1068 } 1069 1070 $in->close(); 1071 1072 $hash; 1073} 1074 1075#---------------------------------------------------------------------- 1076# Read the uchar.h file and compile a mapping of Unicode symbols to 1077# icu4c enum values. 1078# 1079# @param a filename for uchar.h 1080# 1081# @return a ref to a hash. The keys of the hash are '_bp' for binary 1082# properties, '_ep' for enumerated properties, '_dp'/'_sp'/'_mp' for 1083# double/string/mask properties, and 'gc', 'gcm', 'bc', 'blk', 1084# 'ea', 'dt', 'jt', 'jg', 'lb', or 'nt' for corresponding property 1085# value aliases. The values of the hash are subhashes. The subhashes 1086# have a key of the uchar.h enum symbol, and a value of the alias 1087# string (as listed in PropertyValueAliases.txt). NOTE: The alias 1088# string is whatever alias uchar.h lists. This may be either short or 1089# long, depending on the specific enum. NOTE: For blocks ('blk'), the 1090# value is a hex code point for the start of the associated block. 1091# NOTE: The special key _version will map to the Unicode version of 1092# the file. 1093sub read_uchar { 1094 1095 my $filename = shift; 1096 1097 my $mode = ''; # state machine mode and submode 1098 my $submode = ''; 1099 1100 my $last = ''; # for line folding 1101 1102 my $hash = {}; # result 1103 my $key; # first-level key 1104 1105 my $in = new FileHandle($filename, 'r'); 1106 die "Error: Cannot open $filename" if (!defined $in); 1107 1108 while (<$in>) { 1109 # Fold continued lines together 1110 if (/^(.*)\\$/) { 1111 $last .= $1; 1112 next; 1113 } elsif ($last) { 1114 $_ = $last . $_; 1115 $last = ''; 1116 } 1117 1118 # Exit all modes here 1119 if ($mode && $mode ne 'DEPRECATED') { 1120 if (/^\s*\}/) { 1121 $mode = ''; 1122 next; 1123 } 1124 } 1125 1126 # Handle individual modes 1127 1128 if ($mode eq 'UProperty') { 1129 if (/^\s*(UCHAR_\w+)\s*[,=]/ || /^\s+(UCHAR_\w+)\s*$/) { 1130 if ($submode) { 1131 addDatum($hash, $key, $1, $submode); 1132 $submode = ''; 1133 } else { 1134 #print "Warning: Ignoring $1\n"; 1135 } 1136 } 1137 1138 elsif (m|^\s*/\*\*\s*(\w+)\s+property\s+(\w+)|i) { 1139 die "Error: Unmatched tag $submode" if ($submode); 1140 die "Error: Unrecognized UProperty comment: $_" 1141 unless (exists $PROP_TYPE{$1}); 1142 $key = $PROP_TYPE{$1}; 1143 $submode = $2; 1144 } 1145 } 1146 1147 elsif ($mode eq 'UCharCategory') { 1148 if (/^\s*(U_\w+)\s*=/) { 1149 if ($submode) { 1150 addDatum($hash, 'gc', $1, $submode); 1151 $submode = ''; 1152 } else { 1153 #print "Warning: Ignoring $1\n"; 1154 } 1155 } 1156 1157 elsif (m|^\s*/\*\*\s*([A-Z][a-z])\s|) { 1158 die "Error: Unmatched tag $submode" if ($submode); 1159 $submode = $1; 1160 } 1161 } 1162 1163 elsif ($mode eq 'UCharDirection') { 1164 if (/^\s*(U_\w+)\s*[,=]/ || /^\s+(U_\w+)\s*$/) { 1165 if ($submode) { 1166 addDatum($hash, $key, $1, $submode); 1167 $submode = ''; 1168 } else { 1169 #print "Warning: Ignoring $1\n"; 1170 } 1171 } 1172 1173 elsif (m|/\*\*\s*([A-Z]+)\s|) { 1174 die "Error: Unmatched tag $submode" if ($submode); 1175 $key = 'bc'; 1176 $submode = $1; 1177 } 1178 } 1179 1180 elsif ($mode eq 'UBlockCode') { 1181 if (m|^\s*(UBLOCK_\w+).+?/\*\[(.+?)\]\*/|) { 1182 addDatum($hash, 'blk', $1, $2); 1183 } 1184 } 1185 1186 elsif ($mode eq 'UEastAsianWidth') { 1187 if (m|^\s*(U_EA_\w+).+?/\*\[(.+?)\]\*/|) { 1188 addDatum($hash, 'ea', $1, $2); 1189 } 1190 } 1191 1192 elsif ($mode eq 'UDecompositionType') { 1193 if (m|^\s*(U_DT_\w+).+?/\*\[(.+?)\]\*/|) { 1194 addDatum($hash, 'dt', $1, $2); 1195 } 1196 } 1197 1198 elsif ($mode eq 'UJoiningType') { 1199 if (m|^\s*(U_JT_\w+).+?/\*\[(.+?)\]\*/|) { 1200 addDatum($hash, 'jt', $1, $2); 1201 } 1202 } 1203 1204 elsif ($mode eq 'UJoiningGroup') { 1205 if (/^\s*(U_JG_(\w+))/) { 1206 addDatum($hash, 'jg', $1, $2) unless ($2 eq 'COUNT'); 1207 } 1208 } 1209 1210 elsif ($mode eq 'UGraphemeClusterBreak') { 1211 if (m|^\s*(U_GCB_\w+).+?/\*\[(.+?)\]\*/|) { 1212 addDatum($hash, 'GCB', $1, $2); 1213 } 1214 } 1215 1216 elsif ($mode eq 'UWordBreakValues') { 1217 if (m|^\s*(U_WB_\w+).+?/\*\[(.+?)\]\*/|) { 1218 addDatum($hash, 'WB', $1, $2); 1219 } 1220 } 1221 1222 elsif ($mode eq 'USentenceBreak') { 1223 if (m|^\s*(U_SB_\w+).+?/\*\[(.+?)\]\*/|) { 1224 addDatum($hash, 'SB', $1, $2); 1225 } 1226 } 1227 1228 elsif ($mode eq 'ULineBreak') { 1229 if (m|^\s*(U_LB_\w+).+?/\*\[(.+?)\]\*/|) { 1230 addDatum($hash, 'lb', $1, $2); 1231 } 1232 } 1233 1234 elsif ($mode eq 'UNumericType') { 1235 if (m|^\s*(U_NT_\w+).+?/\*\[(.+?)\]\*/|) { 1236 addDatum($hash, 'nt', $1, $2); 1237 } 1238 } 1239 1240 elsif ($mode eq 'UHangulSyllableType') { 1241 if (m|^\s*(U_HST_\w+).+?/\*\[(.+?)\]\*/|) { 1242 addDatum($hash, 'hst', $1, $2); 1243 } 1244 } 1245 1246 elsif ($mode eq 'DEPRECATED') { 1247 if (/\s*\#ifdef/) { 1248 die "Error: Nested #ifdef"; 1249 } 1250 elsif (/\s*\#endif/) { 1251 $mode = ''; 1252 } 1253 } 1254 1255 elsif (!$mode) { 1256 if (/^\s*\#define\s+(\w+)\s+(.+)/) { 1257 # #define $left $right 1258 my ($left, $right) = ($1, $2); 1259 1260 if ($left eq 'U_UNICODE_VERSION') { 1261 my $version = $right; 1262 $version = $1 if ($version =~ /^\"(.*)\"/); 1263 # print "Unicode version: ", $version, "\n"; 1264 die "Error: Multiple versions in $filename" 1265 if (defined $hash->{'_version'}); 1266 $hash->{'_version'} = $version; 1267 } 1268 1269 elsif ($left =~ /U_GC_(\w+?)_MASK/) { 1270 addDatum($hash, 'gcm', $left, $1); 1271 } 1272 } 1273 1274 elsif (/^\s*typedef\s+enum\s+(\w+)\s*\{/ || 1275 /^\s*typedef\s+enum\s+(\w+)\s*$/) { 1276 $mode = $1; 1277 #print "Parsing $mode\n"; 1278 } 1279 1280 elsif (/^\s*enum\s+(\w+)\s*\{/ || 1281 /^\s*enum\s+(\w+)\s*$/) { 1282 $mode = $1; 1283 #print "Parsing $mode\n"; 1284 } 1285 1286 elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) { 1287 $mode = 'DEPRECATED'; 1288 } 1289 } 1290 } 1291 1292 $in->close(); 1293 1294 # hardcode known values for the normalization quick check properties 1295 # see unorm.h for the UNormalizationCheckResult enum 1296 1297 addDatum($hash, 'NFC_QC', 'UNORM_NO', 'N'); 1298 addDatum($hash, 'NFC_QC', 'UNORM_YES', 'Y'); 1299 addDatum($hash, 'NFC_QC', 'UNORM_MAYBE', 'M'); 1300 1301 addDatum($hash, 'NFKC_QC', 'UNORM_NO', 'N'); 1302 addDatum($hash, 'NFKC_QC', 'UNORM_YES', 'Y'); 1303 addDatum($hash, 'NFKC_QC', 'UNORM_MAYBE', 'M'); 1304 1305 # no "maybe" values for NF[K]D 1306 1307 addDatum($hash, 'NFD_QC', 'UNORM_NO', 'N'); 1308 addDatum($hash, 'NFD_QC', 'UNORM_YES', 'Y'); 1309 1310 addDatum($hash, 'NFKD_QC', 'UNORM_NO', 'N'); 1311 addDatum($hash, 'NFKD_QC', 'UNORM_YES', 'Y'); 1312 1313 $hash; 1314} 1315 1316#---------------------------------------------------------------------- 1317# Add a new value to a two-level hash. That is, given a ref to 1318# a hash, two keys, and a value, add $hash->{$key1}->{$key2} = $value. 1319sub addDatum { 1320 my ($h, $k1, $k2, $v) = @_; 1321 if (exists $h->{$k1}->{$k2}) { 1322 die "Error: $k1:$k2 already set to " . 1323 $h->{$k1}->{$k2} . ", cannot set to " . $v; 1324 } 1325 $h->{$k1}->{$k2} = $v; 1326} 1327 1328#eof 1329