#!/usr/bin/perl
#  ********************************************************************
#  * COPYRIGHT:
#  * Copyright (c) 2002-2007, International Business Machines Corporation and
#  * others. All Rights Reserved.
#  ********************************************************************
#
#  regexcst.pl
#            Compile the regular expression parser state table data into initialized C data.
#            Usage:
#                   cd icu/source/i18n
#                   perl regexcst.pl < regexcst.txt > regexcst.h
#
#             The output file, regexcst.h, is included by some of the .cpp regex
#             implementation files.   This perl script is NOT run as part
#             of a normal ICU build.  It is run by hand when needed, and the
#             regexcst.h generated file is put back into cvs.
#
#             See regexcst.txt for a description of the input format for this script.
#
#  This script is derived from rbbicst.pl, which performs the same function
#  for the Rule Based Break Iterator Rule Parser.  Perhaps they could be
#  merged?
#

#
#  Bookkeeping shared by the whole script.
#
$line_num   = 0;        # The line number in the input file.
$num_states = 1;        # Always the state number for the line being compiled.
                        #   (Numbering of real states starts at 1.)

#  Register "pop" as an already-defined state name with the number 255.
#  A state labelled "pop" in the input is then reported as a duplicate
#  definition, and references to "pop" in the next-state field resolve to 255.
$states{'pop'} = 255;

#
#  Read the state table description from the input, one line at a time.
#
#  Every transition line becomes one numbered row; its pieces are stored in the
#  parallel arrays @state_line_num, @state_literal_chars, @state_char_class,
#  @state_flag, @state_dest_state, @state_push_state and @state_func_name, all
#  indexed by $num_states.  State labels are recorded in %states (name -> number)
#  and @stateNames (number -> name).
#
#  Diagnostics are written to STDERR rather than STDOUT, because STDOUT is the
#  generated header file.  Problems that do not stop parsing are counted in
#  $errors, which is checked (with a die) before any output is generated.
#
line_loop: while (<>) {
    chomp();
    $line_num++;

    # Split into whitespace separated fields, dropping # comments, which are any
    #  field beginning with a #, plus all fields that follow it on the line.
    @fields = ();
    foreach my $field (split()) {
        last if $field =~ /^#/;
        push @fields, $field;
    }

    # ignore blank lines, and those with no fields left after stripping comments..
    next line_loop if @fields == 0;

    #
    #  State Label:  handling.
    #    Does the first token end with a ":"?  If so, it's the name  of a state.
    #    Put in a hash, together with the current state number,
    #        so that we can later look up the number from the name.
    #
    if ($fields[0] =~ /:$/) {
        $state_name = $fields[0];
        $state_name =~ s/:$//;        # strip off the trailing colon from the state name.

        if ($states{$state_name} != 0) {
            print STDERR " rbbicst: at line $line_num duplicate definition of state $state_name\n";
            $errors++;
        }
        $states{$state_name}     = $num_states;
        $stateNames[$num_states] = $state_name;

        # if the label was the only thing on this line, go on to the next line,
        # otherwise assume that a state definition is on the same line and fall through.
        next line_loop if @fields == 1;

        shift @fields;                # shift off label field in preparation
                                      #  for handling the rest of the line.
    }

    #
    #  State Transition line.
    #   syntax is this,
    #       character   [n]  target-state  [^push-state]  [function-name]
    #   where
    #      [something]   is an optional something
    #      character     is either a single quoted character e.g.  '['
    #                       or a name of a character class, e.g. white_space
    #

    $state_line_num[$num_states] = $line_num;   # remember line number with each state
                                                #  so we can make better error messages later.
    #
    #  First field, character class or literal character for this transition.
    #
    if ($fields[0] =~ /^'(.)'$/) {
        # We've got a quoted literal character.  Capture just the character between
        #  the quotes, so that a quoted single quote (''') is kept intact.
        $state_literal_chars[$num_states] = $1;
    } else {
        # We've got the name of a character class.
        $state_char_class[$num_states] = $fields[0];
        if ($fields[0] =~ /[\W]/) {
            print STDERR " rbbicsts: at line $line_num, bad character literal or character class name.\n";
            print STDERR " scanning $fields[0]\n";
            exit(-1);
        }
    }
    shift @fields;

    #
    #  do the 'n' flag
    #
    $state_flag[$num_states] = "FALSE";
    if ($fields[0] eq "n") {
        $state_flag[$num_states] = "TRUE";
        shift @fields;
    }

    #
    #  do the destination state.
    #
    $state_dest_state[$num_states] = $fields[0];
    if ($fields[0] eq "") {
        print STDERR " rbbicsts: at line $line_num, destination state missing.\n";
        exit(-1);
    }
    shift @fields;

    #
    #  do the push state, if present.
    #
    if ($fields[0] =~ /^\^/) {
        $fields[0] =~ s/^\^//;
        $state_push_state[$num_states] = $fields[0];
        if ($fields[0] eq "" ) {
            print STDERR " rbbicsts: at line $line_num, expected state after ^ (no spaces).\n";
            exit(-1);
        }
        shift @fields;
    }

    #
    # Lastly, do the optional action name.
    #
    if ($fields[0] ne "") {
        $state_func_name[$num_states] = $fields[0];
        shift @fields;
    }

    #
    #  There should be no fields left on the line at this point.
    #
    if (@fields > 0) {
        print STDERR " rbbicsts: at line $line_num, unexpected extra stuff on input line.\n";
        print STDERR " scanning $fields[0]\n";
        $errors++;
    }
    $num_states++;
}

#
#  We've read in the whole file, now go back and output the
#  C source code for the state transition table.
#
#  We read all states first, before writing anything, so that the state numbers
#  for the destination states are all available to be written.
#

#
#  Make hashes for the names of the character classes and
#  for the names of the actions that appeared.
#  Rows without an action name get the action "doNOP".
#
for ($state=1; $state < $num_states; $state++) {
    if ($state_char_class[$state] ne "") {
        $charClasses{$state_char_class[$state]} = 1;
    }
    if ($state_func_name[$state] eq "") {
        $state_func_name[$state] = "doNOP";
    }
    $actions{$state_func_name[$state]} = 1;
}

#  Character class names that get fixed values and no kRuleSet_ constant.
my %reserved_set_value = (default => 255, quoted => 254, eof => 253);

#  All other character class names, in sorted order so that regenerating the
#  header from unchanged input always assigns the same numbers.
my @ordinary_sets = grep { !exists $reserved_set_value{$_} } sort keys %charClasses;

#
#  Check that all of the destination states have been defined
#  Errors go to STDERR; STDOUT is the generated header.
#
$states{"exit"} = 0;              # Predefined state name, terminates state machine.
for ($state=1; $state<$num_states; $state++) {
    if ($states{$state_dest_state[$state]} == 0 && $state_dest_state[$state] ne "exit") {
        print STDERR "Error at line $state_line_num[$state]: target state \"$state_dest_state[$state]\" is not defined.\n";
        $errors++;
    }
    if ($state_push_state[$state] ne "" && $states{$state_push_state[$state]} == 0) {
        print STDERR "Error at line $state_line_num[$state]: target state \"$state_push_state[$state]\" is not defined.\n";
        $errors++;
    }
}

#  State numbers are emitted into uint8_t fields, and 255 is reserved for "pop".
if ($num_states - 1 > 254) {
    print STDERR "Error: too many states (", $num_states - 1, "); state numbers above 254 cannot be represented.\n";
    $errors++;
}

#  Ordinary character classes are numbered 128-250, which leaves room for 123 of them.
if (@ordinary_sets > 123) {
    print STDERR "Error: too many character classes (", scalar(@ordinary_sets), "); only 128-250 are available.\n";
    $errors++;
}

die "regexcst.pl: $errors error(s) found, aborting.\n" if ($errors>0);

#
#  Emit the fixed preamble of the generated header.
#
#  NOTE(review): the include guard RBBIRPT_H looks inherited from rbbicst.pl;
#  confirm nothing depends on it before renaming.
#
print "//---------------------------------------------------------------------------------\n",
      "//\n",
      "// Generated Header File. Do not edit by hand.\n",
      "// This file contains the state table for the ICU Regular Expression Pattern Parser\n",
      "// It is generated by the Perl script \"regexcst.pl\" from\n",
      "// the rule parser state definitions file \"regexcst.txt\".\n",
      "//\n",
      "// Copyright (C) 2002-2007 International Business Machines Corporation \n",
      "// and others. All rights reserved. \n",
      "//\n",
      "//---------------------------------------------------------------------------------\n",
      "#ifndef RBBIRPT_H\n",
      "#define RBBIRPT_H\n",
      "\n",
      "U_NAMESPACE_BEGIN\n";

#
# Emit the constants for indices of Unicode Sets
#   Define one constant for each of the character classes encountered.
#   At the same time, store the index corresponding to the set name back into hash.
#
print "//\n",
      "// Character classes for regex pattern scanning.\n",
      "//\n";

#   Sets "default", "quoted", etc. get special handling.
#   They have no corresponding UnicodeSet object in the state machine,
#   but are handled by special case code.  So we emit no reference
#   to a UnicodeSet object to them here.
foreach my $reserved_name (keys %reserved_set_value) {
    if (exists $charClasses{$reserved_name}) {
        $charClasses{$reserved_name} = $reserved_set_value{$reserved_name};
    }
}

#   Normal character classes.  State Table values for Unicode char sets range from 128-250.
my $set_value = 128;
foreach my $set_name (@ordinary_sets) {
    print " static const uint8_t kRuleSet_$set_name = $set_value;\n";
    $charClasses{$set_name} = $set_value;
    $set_value++;
}
print "\n\n";

#
# Emit the enum for the actions to be performed.
#
#   One enumerator is written per action name in %actions, in sorted order so
#   that regenerating the header from unchanged input gives the same text,
#   followed by the terminating enumerator rbbiLastAction.
#
print "enum Regex_PatternParseAction {\n";
foreach my $action_name (sort keys %actions) {
    print " $action_name,\n";
}
print " rbbiLastAction};\n\n";

#
# Emit the struct definition for transition table elements.
#
print "//-------------------------------------------------------------------------------\n";
print "//\n";
print "// RegexTableEl represents the structure of a row in the transition table\n";
print "// for the pattern parser state machine.\n";
print "//-------------------------------------------------------------------------------\n";
print "struct RegexTableEl {\n";
print " Regex_PatternParseAction fAction;\n";
print " uint8_t fCharClass; // 0-127: an individual ASCII character\n";
print " // 128-255: character class index\n";
print " uint8_t fNextState; // 0-250: normal next-state numbers\n";
print " // 255: pop next-state from stack.\n";
print " uint8_t fPushState;\n";
print " UBool fNextChar;\n";
print "};\n\n";

#
# emit the state transition table
#
#   Each row is  {action, char-or-class, next-state, push-state, next-char flag}
#   followed by a C++ comment giving the row number.
#
print "static const struct RegexTableEl gRuleParseStateTable[] = {\n";
print " {doNOP, 0, 0, 0, TRUE}\n";    # State 0 is a dummy.  Real states start with index = 1.
for ($state=1; $state < $num_states; $state++) {
    print " , {$state_func_name[$state],";
    if ($state_literal_chars[$state] ne "") {
        $c = $state_literal_chars[$state];
        # use numeric value, so EBCDIC machines are ok.
        # The character itself is passed as an argument, not interpolated into
        #  the format, so that a literal '%' cannot be taken as a conversion.
        printf(" %d /* %s */,", ord($c), $c);
    } else {
        print " $charClasses{$state_char_class[$state]},";
    }
    print " $states{$state_dest_state[$state]},";

    # The push-state field is optional.  If omitted, fill field with a zero, which flags
    #   the state machine that there is no push state.
    if ($state_push_state[$state] eq "") {
        print "0, ";
    } else {
        print " $states{$state_push_state[$state]},";
    }
    print " $state_flag[$state]} ";

    # Put out a C++ comment showing the number (index) of this state row,
    #   and, if this is the first row of the table for this state, the state name.
    print " // $state ";
    if ($stateNames[$state] ne "") {
        print " $stateNames[$state]";
    }
    print "\n";
}
print " };\n";


#
# emit a mapping array from state numbers to state names.
#
#    This array is used for producing debugging output from the pattern parser.
#
#    One entry is written for every row number from 0 up to the last state:
#    the quoted state name for rows that carry a label, a literal 0 otherwise.
#    A final 0 terminates the array.
#
print "static const char * const RegexStateNames[] = {";
foreach my $row (0 .. $num_states - 1) {
    my $label = $stateNames[$row];
    my $entry;
    if ($label ne "") {
        $entry = " \"$label\",\n";
    } else {
        $entry = " 0,\n";
    }
    print $entry;
}
print " 0};\n\n";

# Close the namespace and the include guard opened at the top of the header.
print "U_NAMESPACE_END\n";
print "#endif\n";