#!/usr/bin/perl
#  ********************************************************************
#  * COPYRIGHT:
#  * Copyright (c) 2002-2015, International Business Machines Corporation and
#  * others. All Rights Reserved.
#  ********************************************************************
#
#  regexcst.pl
#            Compile the regular expression parser state table data into initialized C data.
#            Usage:
#                   cd icu/source/i18n
#                   perl regexcst.pl < regexcst.txt > regexcst.h
#
#             The output file, regexcst.h, is included by some of the .cpp regex
#             implementation files.   This perl script is NOT run as part
#             of a normal ICU build.  It is run by hand when needed, and the
#             regexcst.h generated file is put back into cvs.
#
#             See regexcst.txt for a description of the input format for this script.
#
#             This script is derived from rbbicst.pl, which performs the same function
#             for the Rule Based Break Iterator Rule Parser.  Perhaps they could be
#             merged?
#
#  Overview of operation:
#     1. Read every input line, recording state labels and one table row per
#        transition line.
#     2. Collect the character class names and action names that were used, and
#        check that every referenced state has been defined.
#     3. Write the generated C header to STDOUT.
#
#  All diagnostics are written to STDERR, because STDOUT is normally redirected
#  into the generated header and must contain nothing but C source.
#

use strict;
use warnings;

my %states = ("pop" => 255);   # State name -> state number.
                               #   "pop" is predefined.  This prevents any state from being
                               #   labelled with "pop", and resolves references to "pop"
                               #   in the next state field.
my @state_names;               # State number -> label, for rows that begin a labelled state.
my @rows = (undef);            # One hash ref per transition row, indexed by state number.
                               #   Row 0 is a dummy; real states start with index 1.
my $line_num = 0;              # The line number in the input file.

LINE: while (my $line = <>) {
    chomp $line;
    $line_num++;
    my @fields = split ' ', $line;
    my $num_states = scalar @rows;     # Always the state number for the line being compiled.

    # Remove # comments, which are any fields beginning with a #, plus all
    # that follow on the line.
    for my $i (0 .. $#fields) {
        if ($fields[$i] =~ /^#/) {
            $#fields = $i - 1;
            last;
        }
    }

    # Ignore blank lines, and those with no fields left after stripping comments.
    next LINE unless @fields;

    #
    #  State Label:  handling.
    #    Does the first token end with a ":"?  If so, it's the name of a state.
    #    Put it in a hash, together with the current state number,
    #    so that we can later look up the number from the name.
    #
    if ($fields[0] =~ /:$/) {
        my $state_name = shift @fields;
        $state_name =~ s/:$//;         # strip off the trailing colon from the state name.

        if (exists $states{$state_name}) {
            print STDERR " rbbicst: at line $line_num duplicate definition of state $state_name\n";
        }
        $states{$state_name}      = $num_states;
        $state_names[$num_states] = $state_name;

        # If the label was the only thing on this line, go on to the next line,
        # otherwise assume that a state definition is on the same line and fall through.
        next LINE unless @fields;
    }

    #
    #  State Transition line.
    #   syntax is this,
    #       character   [n]  target-state  [^push-state]  [function-name]
    #   where
    #      [something]   is an optional something
    #      character     is either a single quoted character e.g. '['
    #                       or a name of a character class, e.g. white_space
    #
    my %row = (
        line => $line_num,     # remember line number with each state
                               #  so we can make better error messages later.
        flag => "FALSE",
    );

    #
    #  First field, character class or literal character for this transition.
    #
    my $char_field = shift @fields;
    if ($char_field =~ /^'(.)'$/) {
        # We've got a quoted literal character.  Capture it directly, so that
        # a quoted single quote (''') survives.
        $row{literal} = $1;
    } else {
        # We've got the name of a character class.
        if ($char_field =~ /\W/) {
            print STDERR " rbbicsts: at line $line_num, bad character literal or character class name.\n";
            print STDERR " scanning $char_field\n";
            exit(-1);
        }
        $row{char_class} = $char_field;
    }

    #
    #  do the 'n' flag
    #
    if (@fields && $fields[0] eq "n") {
        $row{flag} = "TRUE";
        shift @fields;
    }

    #
    #  do the destination state.
    #
    if (!@fields) {
        print STDERR " rbbicsts: at line $line_num, destination state missing.\n";
        exit(-1);
    }
    $row{dest} = shift @fields;

    #
    #  do the push state, if present.
    #
    if (@fields && $fields[0] =~ /^\^/) {
        my $push_state = shift @fields;
        $push_state =~ s/^\^//;
        if ($push_state eq "") {
            print STDERR " rbbicsts: at line $line_num, expected state after ^ (no spaces).\n";
            exit(-1);
        }
        $row{push} = $push_state;
    }

    #
    #  Lastly, do the optional action name.
    #
    if (@fields) {
        $row{action} = shift @fields;
    }

    #
    #  There should be no fields left on the line at this point.
    #
    if (@fields) {
        print STDERR " rbbicsts: at line $line_num, unexpected extra stuff on input line.\n";
        print STDERR " scanning $fields[0]\n";
    }

    push @rows, \%row;
}

#
# We've read in the whole file, now go back and output the
#  C source code for the state transition table.
#
# We read all states first, before writing anything, so that the state numbers
# for the destination states are all available to be written.
#

#
# Make hashes for the names of the character classes and
#      for the names of the actions that appeared.
# Rows with no explicit action get the default action "doNOP".
#
my %charClasses;
my %actions;
for my $row (@rows[1 .. $#rows]) {
    if (defined $row->{char_class}) {
        $charClasses{$row->{char_class}} = 1;
    }
    if (!defined $row->{action}) {
        $row->{action} = "doNOP";
    }
    $actions{$row->{action}} = 1;
}

#
# Check that all of the destination states have been defined
#
$states{"exit"} = 0;           # Predefined state name, terminates state machine.
my $errors = 0;
for my $row (@rows[1 .. $#rows]) {
    my $dest = $row->{dest};
    # A state number of zero (or no entry at all) means "not defined".
    # "exit" is the only name that legitimately maps to zero.
    if ($dest ne "exit" && !$states{$dest}) {
        print STDERR "Error at line $row->{line}: target state \"$dest\" is not defined.\n";
        $errors++;
    }
    my $push_state = $row->{push};
    if (defined $push_state && !$states{$push_state}) {
        print STDERR "Error at line $row->{line}: target state \"$push_state\" is not defined.\n";
        $errors++;
    }
}

die "regexcst.pl: $errors undefined state reference(s); no output generated.\n" if ($errors > 0);

print "//---------------------------------------------------------------------------------\n";
print "//\n";
print "// Generated Header File. Do not edit by hand.\n";
print "// This file contains the state table for the ICU Regular Expression Pattern Parser\n";
print "// It is generated by the Perl script \"regexcst.pl\" from\n";
print "// the rule parser state definitions file \"regexcst.txt\".\n";
print "//\n";
print "// Copyright (C) 2002-2015 International Business Machines Corporation \n";
print "// and others. All rights reserved. \n";
print "//\n";
print "//---------------------------------------------------------------------------------\n";
print "#ifndef RBBIRPT_H\n";
print "#define RBBIRPT_H\n";
print "\n";
print "U_NAMESPACE_BEGIN\n";

#
# Emit the constants for indices of Unicode Sets
#   Define one constant for each of the character classes encountered.
#   At the same time, store the index corresponding to the set name back into hash.
#
# The names are sorted so that the generated header is identical from run to run;
# Perl's hash iteration order is not stable.
#
print "//\n";
print "// Character classes for regex pattern scanning.\n";
print "//\n";
my $set_index = 128;           # State Table values for Unicode char sets range from 128-250.
                               # Sets "default", "quoted", etc. get special handling.
                               #  They have no corresponding UnicodeSet object in the state machine,
                               #    but are handled by special case code.  So we emit no reference
                               #    to a UnicodeSet object to them here.
my %special_class_value = (
    "default" => 255,
    "quoted"  => 254,
    "eof"     => 253,
);
foreach my $setName (sort keys %charClasses) {
    if (exists $special_class_value{$setName}) {
        $charClasses{$setName} = $special_class_value{$setName};
    } else {
        # Normal character class.  Fill in array with a ptr to the corresponding UnicodeSet in the state machine.
        print " static const uint8_t kRuleSet_$setName = $set_index;\n";
        $charClasses{$setName} = $set_index;
        $set_index++;
    }
}
print "\n\n";

#
# Emit the enum for the actions to be performed.
#   Sorted, for the same reproducibility reason as the character classes.
#
print "enum Regex_PatternParseAction {\n";
foreach my $act (sort keys %actions) {
    print " $act,\n";
}
print " rbbiLastAction};\n\n";

#
# Emit the struct definition for transition table elements.
#
print "//-------------------------------------------------------------------------------\n";
print "//\n";
print "// RegexTableEl represents the structure of a row in the transition table\n";
print "// for the pattern parser state machine.\n";
print "//-------------------------------------------------------------------------------\n";
print "struct RegexTableEl {\n";
print " Regex_PatternParseAction fAction;\n";
print " uint8_t fCharClass; // 0-127: an individual ASCII character\n";
print " // 128-255: character class index\n";
print " uint8_t fNextState; // 0-250: normal next-state numbers\n";
print " // 255: pop next-state from stack.\n";
print " uint8_t fPushState;\n";
print " UBool fNextChar;\n";
print "};\n\n";

#
# emit the state transition table
#
print "static const struct RegexTableEl gRuleParseStateTable[] = {\n";
print " {doNOP, 0, 0, 0, TRUE}\n";     # State 0 is a dummy.  Real states start with index = 1.
for my $state (1 .. $#rows) {
    my $row = $rows[$state];

    print " , {" . $row->{action} . ",";
    if (defined $row->{literal}) {
        my $c = $row->{literal};
        # Use the numeric value, so EBCDIC machines are ok.  The character is
        # passed as an argument rather than interpolated into the format, so
        # that a literal '%' cannot be mistaken for a conversion.
        printf(" %d /* %s */,", ord($c), $c);
    } else {
        print " " . $charClasses{$row->{char_class}} . ",";
    }
    print " " . $states{$row->{dest}} . ",";

    # The push-state field is optional.  If omitted, fill field with a zero, which flags
    #   the state machine that there is no push state.
    if (defined $row->{push}) {
        print " " . $states{$row->{push}} . ",";
    } else {
        print "0, ";
    }
    print " " . $row->{flag} . "} ";

    # Put out a C++ comment showing the number (index) of this state row,
    #   and, if this is the first row of the table for this state, the state name.
    print " // $state ";
    my $label = $state_names[$state];
    if (defined $label && $label ne "") {
        print " $label";
    }
    print "\n";
}
print " };\n";


#
# emit a mapping array from state numbers to state names.
#
#    This array is used for producing debugging output from the pattern parser.
#
print "static const char * const RegexStateNames[] = {";
for my $state (0 .. $#rows) {
    my $label = $state_names[$state];
    if (defined $label && $label ne "") {
        print " \"$label\",\n";
    } else {
        print " 0,\n";
    }
}
print " 0};\n\n";

print "U_NAMESPACE_END\n";
print "#endif\n";