#!/usr/bin/perl
# ********************************************************************
# * COPYRIGHT:
# * Copyright (c) 2002-2007, International Business Machines Corporation and
# * others. All Rights Reserved.
# ********************************************************************
#
# regexcst.pl
#     Compile the regular expression parser state table data into
#     initialized C data.
#
# Usage:
#     cd icu/source/i18n
#     perl regexcst.pl < regexcst.txt > regexcst.h
#
# The output file, regexcst.h, is included by some of the .cpp regex
# implementation files.  This perl script is NOT run as part of a normal
# ICU build.  It is run by hand when needed, and the regexcst.h generated
# file is put back into cvs.
#
# See regexcst.txt for a description of the input format for this script.
#
# This script is derived from rbbicst.pl, which performs the same function
# for the Rule Based Break Iterator Rule Parser.  Perhaps they could be
# merged?
#
# Diagnostics are written to STDERR (never STDOUT, which is normally
# redirected into the generated header).  Nothing is written to STDOUT
# unless the whole input was accepted.
#

use strict;
use warnings;

# Values with a fixed meaning in the generated uint8_t table fields.
my $POP_STATE            = 255;  # next-state value meaning "pop from stack"
my $EXIT_STATE           = 0;    # next-state value that ends the state machine
my $FIRST_CLASS_INDEX    = 128;  # first index handed out to a character class
my $LOWEST_SPECIAL_CLASS = 253;  # 253..255 are eof / quoted / default

# Character classes that are handled by special-case code in the state
# machine and therefore get a fixed index and no kRuleSet_ constant.
my %special_class_index = (
    default => 255,
    quoted  => 254,
    eof     => 253,
);

# Name -> state number.  "pop" and "exit" are predefined so that no state
# can be labelled with them and so that references to them resolve.
my %state_num_of = (
    pop  => $POP_STATE,
    exit => $EXIT_STATE,
);

my @state_names;      # state number -> label, only for the first row of a state
my @rows;             # state number -> hashref describing one table row
my $num_states = 1;   # always the state number for the line being compiled
my $line_num   = 0;   # the line number in the input file
my $errors     = 0;   # count of non-immediate errors seen so far

# Print a diagnostic to STDERR and stop with a failure status.
sub fatal {
    my (@message) = @_;
    print STDERR @message;
    exit 255;
}

#
# Pass 1: read the state table description.
#
LINE: while (my $line = <>) {
    chomp $line;
    $line_num++;
    my @fields = split ' ', $line;

    # Remove # comments, which are any fields beginning with a #, plus all
    # that follow on the line.
    for my $i (0 .. $#fields) {
        if ($fields[$i] =~ /\A#/) {
            splice @fields, $i;
            last;
        }
    }

    # Ignore blank lines, and those with no fields left after stripping
    # comments.
    next LINE if !@fields;

    #
    # State label handling.
    #   Does the first token end with a ":"?  If so, it's the name of a
    #   state.  Put it in a hash, together with the current state number,
    #   so that we can later look up the number from the name.
    #
    if ($fields[0] =~ /:\z/) {
        (my $state_name = shift @fields) =~ s/:\z//;

        if ($state_name eq '') {
            fatal("regexcst.pl: at line $line_num, empty state name.\n");
        }
        if (exists $state_num_of{$state_name}) {
            print STDERR "regexcst.pl: at line $line_num, "
                . "duplicate definition of state $state_name\n";
            $errors++;
        }
        else {
            $state_num_of{$state_name} = $num_states;
            $state_names[$num_states]  = $state_name;
        }

        # If the label was the only thing on this line, go on to the next
        # line; otherwise a state transition is on the same line.
        next LINE if !@fields;
    }

    #
    # State transition line.  Syntax is
    #     character [n] target-state [^push-state] [function-name]
    # where
    #     [something] is an optional something
    #     character   is either a single quoted character, e.g. '['
    #                 or the name of a character class, e.g. white_space
    #
    my %row = (
        line      => $line_num,   # remembered for later error messages
        next_char => 'FALSE',
    );

    # First field: character class or literal character for this transition.
    my $match = shift @fields;
    if ($match =~ /\A'(.)'\z/) {
        # A quoted literal character.  Take exactly the quoted character so
        # that a quote character itself (''') is not stripped away.
        if (ord($1) > 127) {
            fatal("regexcst.pl: at line $line_num, "
                . "literal character is not 7-bit ASCII.\n");
        }
        $row{literal} = $1;
    }
    elsif ($match =~ /\W/) {
        fatal(
            "regexcst.pl: at line $line_num, "
                . "bad character literal or character class name.\n",
            "    scanning $match\n",
        );
    }
    else {
        # The name of a character class.
        $row{char_class} = $match;
    }

    # The optional 'n' flag.
    if (@fields && $fields[0] eq 'n') {
        shift @fields;
        $row{next_char} = 'TRUE';
    }

    # The destination state.
    if (!@fields) {
        fatal("regexcst.pl: at line $line_num, destination state missing.\n");
    }
    $row{dest} = shift @fields;

    # The push state, if present.
    if (@fields && $fields[0] =~ /\A\^/) {
        (my $push_state = shift @fields) =~ s/\A\^//;
        if ($push_state eq '') {
            fatal("regexcst.pl: at line $line_num, "
                . "expected state after ^ (no spaces).\n");
        }
        $row{push} = $push_state;
    }

    # Lastly, the optional action name.  Rows without one do nothing.
    $row{action} = @fields ? shift @fields : 'doNOP';

    # There should be no fields left on the line at this point.
    if (@fields) {
        print STDERR "regexcst.pl: at line $line_num, "
            . "unexpected extra stuff on input line.\n",
            "    scanning $fields[0]\n";
        $errors++;
    }

    $rows[$num_states] = \%row;
    $num_states++;
}

#
# Pass 2: validate.  We read all states first, before writing anything, so
# that the state numbers for the destination states are all available.
#

# State numbers are stored in uint8_t fields and must not reach "pop".
if ($num_states - 1 >= $POP_STATE) {
    print STDERR "regexcst.pl: too many states (", $num_states - 1,
        "); state numbers must stay below $POP_STATE.\n";
    $errors++;
}

# A label after the last transition line names a row that does not exist.
if (defined $state_names[$num_states]) {
    print STDERR "regexcst.pl: state \"$state_names[$num_states]\" "
        . "has no transitions.\n";
    $errors++;
}

# Collect the names of the character classes and actions that appeared.
# doNOP is always present because the dummy row 0 of the table uses it.
my %class_index_of;
my %action_seen = (doNOP => 1);
for my $state (1 .. $num_states - 1) {
    my $row = $rows[$state];
    $class_index_of{ $row->{char_class} } = 1 if defined $row->{char_class};
    $action_seen{ $row->{action} } = 1;
}

# Check that all of the destination and push states have been defined.
for my $state (1 .. $num_states - 1) {
    my $row = $rows[$state];
    if (!exists $state_num_of{ $row->{dest} }) {
        print STDERR "Error at line $row->{line}: "
            . "target state \"$row->{dest}\" is not defined.\n";
        $errors++;
    }
    # A push state must be a real state; 0 in the table means "no push".
    if (defined $row->{push} && !$state_num_of{ $row->{push} }) {
        print STDERR "Error at line $row->{line}: "
            . "push state \"$row->{push}\" is not defined.\n";
        $errors++;
    }
}

# Assign an index to each character class.  Names are sorted so that the
# generated header is identical from run to run.
my @class_constants;    # ordinary classes, in index order
my $next_class_index = $FIRST_CLASS_INDEX;
for my $set_name (sort keys %class_index_of) {
    if (exists $special_class_index{$set_name}) {
        $class_index_of{$set_name} = $special_class_index{$set_name};
        next;
    }
    if ($next_class_index >= $LOWEST_SPECIAL_CLASS) {
        print STDERR "regexcst.pl: too many character classes; "
            . "indexes must stay below $LOWEST_SPECIAL_CLASS.\n";
        $errors++;
        last;
    }
    $class_index_of{$set_name} = $next_class_index++;
    push @class_constants, $set_name;
}

die "regexcst.pl: $errors error(s) found, no output generated.\n"
    if $errors > 0;

#
# Pass 3: emit the C source code for the state transition table.
#
print "//---------------------------------------------------------------------------------\n",
      "//\n",
      "// Generated Header File. Do not edit by hand.\n",
      "// This file contains the state table for the ICU Regular Expression Pattern Parser\n",
      "// It is generated by the Perl script \"regexcst.pl\" from\n",
      "// the rule parser state definitions file \"regexcst.txt\".\n",
      "//\n",
      "// Copyright (C) 2002-2007 International Business Machines Corporation \n",
      "// and others. All rights reserved. \n",
      "//\n",
      "//---------------------------------------------------------------------------------\n",
      "#ifndef RBBIRPT_H\n",
      "#define RBBIRPT_H\n",
      "\n",
      "U_NAMESPACE_BEGIN\n";

#
# Emit the constants for indices of Unicode Sets.
#   One constant is defined for each ordinary character class encountered.
#   Sets "default", "quoted" and "eof" have no corresponding UnicodeSet
#   object in the state machine, so no constant is emitted for them.
#
print "//\n",
      "// Character classes for regex pattern scanning.\n",
      "//\n";
for my $set_name (@class_constants) {
    print " static const uint8_t kRuleSet_$set_name = $class_index_of{$set_name};\n";
}
print "\n\n";

#
# Emit the enum for the actions to be performed.
#
print "enum Regex_PatternParseAction {\n";
for my $action (sort keys %action_seen) {
    print " $action,\n";
}
print " rbbiLastAction};\n\n";

#
# Emit the struct definition for transition table elements.
#
print "//-------------------------------------------------------------------------------\n",
      "//\n",
      "// RegexTableEl represents the structure of a row in the transition table\n",
      "// for the pattern parser state machine.\n",
      "//-------------------------------------------------------------------------------\n",
      "struct RegexTableEl {\n",
      " Regex_PatternParseAction fAction;\n",
      " uint8_t fCharClass; // 0-127: an individual ASCII character\n",
      " // 128-255: character class index\n",
      " uint8_t fNextState; // 0-250: normal next-state numbers\n",
      " // 255: pop next-state from stack.\n",
      " uint8_t fPushState;\n",
      " UBool fNextChar;\n",
      "};\n\n";

#
# Emit the state transition table.
#
print "static const struct RegexTableEl gRuleParseStateTable[] = {\n";
print " {doNOP, 0, 0, 0, TRUE}\n";    # State 0 is a dummy.  Real states start with index = 1.
for my $state (1 .. $num_states - 1) {
    my $row = $rows[$state];

    print ' , {', $row->{action}, ',';
    if (defined $row->{literal}) {
        # Use the numeric value, so EBCDIC machines are ok.  The character
        # is passed as an argument, never as part of the format string.
        printf ' %d /* %s */,', ord($row->{literal}), $row->{literal};
    }
    else {
        print ' ', $class_index_of{ $row->{char_class} }, ',';
    }
    print ' ', $state_num_of{ $row->{dest} }, ',';

    # The push-state field is optional.  If omitted, fill the field with a
    # zero, which flags the state machine that there is no push state.
    if (defined $row->{push}) {
        print ' ', $state_num_of{ $row->{push} }, ',';
    }
    else {
        print '0, ';
    }
    print ' ', $row->{next_char}, '} ';

    # Put out a C++ comment showing the number (index) of this state row,
    # and, if this is the first row of the table for this state, its name.
    print " // $state ";
    print " $state_names[$state]" if defined $state_names[$state];
    print "\n";
}
print " };\n";

#
# Emit a mapping array from state numbers to state names.
#   This array is used for producing debugging output from the pattern
#   parser.
#
print "static const char * const RegexStateNames[] = {";
for my $state (0 .. $num_states - 1) {
    if (defined $state_names[$state]) {
        print " \"$state_names[$state]\",\n";
    }
    else {
        print " 0,\n";
    }
}
print " 0};\n\n";

print "U_NAMESPACE_END\n";
print "#endif\n";