1#!/usr/bin/perl 2 3# 4#//===----------------------------------------------------------------------===// 5#// 6#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 7#// See https://llvm.org/LICENSE.txt for license information. 8#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 9#// 10#//===----------------------------------------------------------------------===// 11# 12 13use strict; 14use warnings; 15 16use File::Glob ":glob"; 17use Encode qw{ encode }; 18 19use FindBin; 20use lib "$FindBin::Bin/lib"; 21 22use tools; 23 24our $VERSION = "0.04"; 25my $escape = qr{%}; 26my $placeholder = qr{(\d)\$(s|l?[du])}; 27my $target_os; 28 29my $sections = 30 { 31 meta => { short => "prp" }, # "prp" stands for "property". 32 strings => { short => "str" }, 33 formats => { short => "fmt" }, 34 messages => { short => "msg" }, 35 hints => { short => "hnt" }, 36 }; 37my @sections = qw{ meta strings formats messages hints }; 38# Assign section properties: long name, set number, base number. 39map( $sections->{ $sections[ $_ ] }->{ long } = $sections[ $_ ], ( 0 .. @sections - 1 ) ); 40map( $sections->{ $sections[ $_ ] }->{ set } = ( $_ + 1 ), ( 0 .. @sections - 1 ) ); 41map( $sections->{ $sections[ $_ ] }->{ base } = ( ( $_ + 1 ) << 16 ), ( 0 .. @sections - 1 ) ); 42 43# Properties of Meta section. 44my @properties = qw{ Language Country LangId Version Revision }; 45 46 47sub _generate_comment($$$) { 48 49 my ( $data, $open, $close ) = @_; 50 my $bulk = 51 $open . " Do not edit this file! " . $close . "\n" . 52 $open . " The file was generated from " . get_file( $data->{ "%meta" }->{ source } ) . 53 " by " . $tool . " on " . localtime() . ". " . $close . "\n"; 54 return $bulk; 55 56}; # sub _generate_comment 57 58 59sub msg2sgn($) { 60 61 # Convert message string to signature. Signature is a list of placeholders in sorted order. 62 # For example, signature of "%1$s value \"%2$s\" is invalid." is "%1$s %2$s". 63 64 my ( $msg ) = @_; 65 my @placeholders; 66 pos( $msg ) = 0; 67 while ( $msg =~ m{\G.*?$escape$placeholder}g ) { 68 $placeholders[ $1 - 1 ] = "%$1\$$2"; 69 }; # while 70 for ( my $i = 1; $i <= @placeholders; ++ $i ) { 71 if ( not defined( $placeholders[ $i - 1 ] ) ) { 72 $placeholders[ $i - 1 ] = "%$i\$-"; 73 }; # if 74 }; # for $i 75 return join( " ", @placeholders ); 76 77}; # sub msg2sgn 78 79 80sub msg2src($) { 81 82 # Convert message string to a C string constant. 83 84 my ( $msg ) = @_; 85 if ( $target_os eq "win" ) { 86 $msg =~ s{$escape$placeholder}{\%$1!$2!}g; 87 }; # if 88 return $msg; 89 90}; # sub msg2src 91 92 93my $special = 94 { 95 "n" => "\n", 96 "t" => "\t", 97 }; 98 99sub msg2mc($) { 100 my ( $msg ) = @_; 101 $msg = msg2src( $msg ); # Get windows style placeholders. 102 $msg =~ s{\\(.)}{ exists( $special->{ $1 } ) ? $special->{ $1 } : $1 }ge; 103 return $msg; 104}; # sub msg2mc 105 106 107 108sub parse_message($) { 109 110 my ( $msg ) = @_; 111 pos( $msg ) = 0; 112 for ( ; ; ) { 113 if ( $msg !~ m{\G.*?$escape}gc ) { 114 last; 115 } 116 if ( $msg !~ m{\G$placeholder}gc ) { 117 return "Bad %-sequence near \"%" . substr( $msg, pos( $msg ), 7 ) . "\""; 118 }; # if 119 }; # forever 120 return undef; 121 122}; # sub parse_message 123 124 125sub parse_source($) { 126 127 my ( $name ) = @_; 128 129 my @bulk = read_file( $name, -layer => ":utf8" ); 130 my $data = {}; 131 132 my $line; 133 my $n = 0; # Line number. 134 my $obsolete = 0; # Counter of obsolete entries. 135 my $last_idx; 136 my %idents; 137 my $section; 138 139 my $error = 140 sub { 141 my ( $n, $line, $msg ) = @_; 142 runtime_error( "Error parsing $name line $n: " . "$msg:\n" . " $line" ); 143 }; # sub 144 145 foreach $line ( @bulk ) { 146 ++ $n; 147 # Skip empty lines and comments. 148 if ( $line =~ m{\A\s*(\n|#)} ) { 149 $last_idx = undef; 150 next; 151 }; # if 152 # Parse section header. 153 if ( $line =~ m{\A-\*-\s*([A-Z_]*)\s*-\*-\s*\n\z}i ) { 154 $section = ( lc( $1 ) ); 155 if ( not grep( $section eq $_, @sections ) ) { 156 $error->( $n, $line, "Unknown section \"$section\" specified" ); 157 }; # if 158 if ( exists( $data->{ $section } ) ) { 159 $error->( $n, $line, "Multiple sections of the same type specified" ); 160 }; # if 161 %idents = (); # Clean list of known message identifiers. 162 next; 163 }; # if 164 if ( not defined( $section ) ) { 165 $error->( $n, $line, "Section heading expected" ); 166 }; # if 167 # Parse section body. 168 if ( $section eq "meta" ) { 169 if ( $line =~ m{\A([A-Z_][A-Z_0-9]*)\s+"(.*)"\s*?\n?\z}i ) { 170 # Parse meta properties (such as Language, Country, and LangId). 171 my ( $property, $value ) = ( $1, $2 ); 172 if ( not grep( $_ eq $property , @properties ) ) { 173 $error->( $n, $line, "Unknown property \"$property\" specified" ); 174 }; # if 175 if ( exists( $data->{ "%meta" }->{ $property } ) ) { 176 $error->( $n, $line, "Property \"$property\" has already been specified" ); 177 }; # if 178 $data->{ "%meta" }->{ $property } = $value; 179 $last_idx = undef; 180 next; 181 }; # if 182 $error->( $n, $line, "Property line expected" ); 183 }; # if 184 # Parse message. 185 if ( $line =~ m{\A([A-Z_][A-Z_0-9]*)\s+"(.*)"\s*?\n?\z}i ) { 186 my ( $ident, $message ) = ( $1, $2 ); 187 if ( $ident eq "OBSOLETE" ) { 188 # If id is "OBSOLETE", add a unique suffix. It provides convenient way to mark 189 # obsolete messages. 190 ++ $obsolete; 191 $ident .= $obsolete; 192 }; # if 193 if ( exists( $idents{ $ident } ) ) { 194 $error->( $n, $line, "Identifier \"$ident\" is redefined" ); 195 }; # if 196 # Check %-sequences. 197 my $err = parse_message( $message ); 198 if ( $err ) { 199 $error->( $n, $line, $err ); 200 }; # if 201 # Save message. 202 push( @{ $data->{ $section } }, [ $ident, $message ] ); 203 $idents{ $ident } = 1; 204 $last_idx = @{ $data->{ $section } } - 1; 205 next; 206 }; # if 207 # Parse continuation line. 208 if ( $line =~ m{\A\s*"(.*)"\s*\z} ) { 209 my $message = $1; 210 if ( not defined( $last_idx ) ) { 211 $error->( $n, $line, "Unexpected continuation line" ); 212 }; # if 213 # Check %-sequences. 214 my $err = parse_message( $message ); 215 if ( $err ) { 216 $error->( $n, $line, $err ); 217 }; # if 218 # Save continuation. 219 $data->{ $section }->[ $last_idx ]->[ 1 ] .= $message; 220 next; 221 }; # if 222 $error->( $n, $line, "Message definition expected" ); 223 }; # foreach 224 $data->{ "%meta" }->{ source } = $name; 225 foreach my $section ( @sections ) { 226 if ( not exists( $data->{ $section } ) ) { 227 $data->{ $section } = []; 228 }; # if 229 }; # foreach $section 230 231 foreach my $property ( @properties ) { 232 if ( not defined( $data->{ "%meta" }->{ $property } ) ) { 233 runtime_error( 234 "Error parsing $name: " . 235 "Required \"$property\" property is not specified" 236 ); 237 }; # if 238 push( @{ $data->{ meta } }, [ $property, $data->{ "%meta" }->{ $property } ] ); 239 }; # foreach 240 241 return $data; 242 243}; # sub parse_source 244 245 246sub generate_enum($$$) { 247 248 my ( $data, $file, $prefix ) = @_; 249 my $bulk = ""; 250 251 $bulk = 252 _generate_comment( $data, "//", "//" ) . 253 "\n" . 254 "enum ${prefix}_id {\n\n" . 255 " // A special id for absence of message.\n" . 256 " ${prefix}_null = 0,\n\n"; 257 258 foreach my $section ( @sections ) { 259 my $props = $sections->{ $section }; # Section properties. 260 my $short = $props->{ short }; # Short section name, frequently used. 261 $bulk .= 262 " // Set #$props->{ set }, $props->{ long }.\n" . 263 " ${prefix}_${short}_first = $props->{ base },\n"; 264 foreach my $item ( @{ $data->{ $section } } ) { 265 my ( $ident, undef ) = @$item; 266 $bulk .= " ${prefix}_${short}_${ident},\n"; 267 }; # foreach 268 $bulk .= " ${prefix}_${short}_last,\n\n"; 269 }; # foreach $type 270 $bulk .= " ${prefix}_xxx_lastest\n\n"; 271 272 $bulk .= 273 "}; // enum ${prefix}_id\n" . 274 "\n" . 275 "typedef enum ${prefix}_id ${prefix}_id_t;\n" . 276 "\n"; 277 278 $bulk .= 279 "\n" . 280 "// end of file //\n"; 281 282 write_file( $file, \$bulk ); 283 284}; # sub generate_enum 285 286 287sub generate_signature($$) { 288 289 my ( $data, $file ) = @_; 290 my $bulk = ""; 291 292 $bulk .= "// message catalog signature file //\n\n"; 293 294 foreach my $section ( @sections ) { 295 my $props = $sections->{ $section }; # Section properties. 296 my $short = $props->{ short }; # Short section name, frequently used. 297 $bulk .= "-*- " . uc( $props->{ long } ) . "-*-\n\n"; 298 foreach my $item ( @{ $data->{ $section } } ) { 299 my ( $ident, $msg ) = @$item; 300 $bulk .= sprintf( "%-40s %s\n", $ident, msg2sgn( $msg ) ); 301 }; # foreach 302 $bulk .= "\n"; 303 }; # foreach $type 304 305 $bulk .= "// end of file //\n"; 306 307 write_file( $file, \$bulk ); 308 309}; # sub generate_signature 310 311 312sub generate_default($$$) { 313 314 my ( $data, $file, $prefix ) = @_; 315 my $bulk = ""; 316 317 $bulk .= 318 _generate_comment( $data, "//", "//" ) . 319 "\n"; 320 321 foreach my $section ( @sections ) { 322 $bulk .= 323 "static char const *\n" . 324 "__${prefix}_default_${section}" . "[] =\n" . 325 " {\n" . 326 " NULL,\n"; 327 foreach my $item ( @{ $data->{ $section } } ) { 328 my ( undef, $msg ) = @$item; 329 $bulk .= " \"" . msg2src( $msg ) . "\",\n"; 330 }; # while 331 $bulk .= 332 " NULL\n" . 333 " };\n" . 334 "\n"; 335 }; # foreach $type 336 337 $bulk .= 338 "struct kmp_i18n_section {\n" . 339 " int size;\n" . 340 " char const ** str;\n" . 341 "}; // struct kmp_i18n_section\n" . 342 "typedef struct kmp_i18n_section kmp_i18n_section_t;\n" . 343 "\n" . 344 "static kmp_i18n_section_t\n" . 345 "__${prefix}_sections[] =\n" . 346 " {\n" . 347 " { 0, NULL },\n"; 348 foreach my $section ( @sections ) { 349 $bulk .= 350 " { " . @{ $data->{ $section } } . ", __${prefix}_default_${section} },\n"; 351 }; # foreach $type 352 $bulk .= 353 " { 0, NULL }\n" . 354 " };\n" . 355 "\n"; 356 357 $bulk .= 358 "struct kmp_i18n_table {\n" . 359 " int size;\n" . 360 " kmp_i18n_section_t * sect;\n" . 361 "}; // struct kmp_i18n_table\n" . 362 "typedef struct kmp_i18n_table kmp_i18n_table_t;\n" . 363 "\n" . 364 "static kmp_i18n_table_t __kmp_i18n_default_table =\n" . 365 " {\n" . 366 " " . @sections . ",\n" . 367 " __kmp_i18n_sections\n" . 368 " };\n" . 369 "\n" . 370 "// end of file //\n"; 371 372 write_file( $file, \$bulk ); 373 374}; # sub generate_default 375 376 377sub generate_message_unix($$) { 378 379 my ( $data, $file ) = @_; 380 my $bulk = ""; 381 382 $bulk .= 383 _generate_comment( $data, "\$", "\$" ) . 384 "\n" . 385 "\$quote \"\n\n"; 386 387 foreach my $section ( @sections ) { 388 $bulk .= 389 "\$ " . ( "-" x 78 ) . "\n\$ $section\n\$ " . ( "-" x 78 ) . "\n\n" . 390 "\$set $sections->{ $section }->{ set }\n" . 391 "\n"; 392 my $n = 0; 393 foreach my $item ( @{ $data->{ $section } } ) { 394 my ( undef, $msg ) = @$item; 395 ++ $n; 396 $bulk .= "$n \"" . msg2src( $msg ) . "\"\n"; 397 }; # foreach 398 $bulk .= "\n"; 399 }; # foreach $type 400 401 $bulk .= 402 "\n" . 403 "\$ end of file \$\n"; 404 405 write_file( $file, \$bulk, -layer => ":utf8" ); 406 407}; # sub generate_message_linux 408 409 410sub generate_message_windows($$) { 411 412 my ( $data, $file ) = @_; 413 my $bulk = ""; 414 my $language = $data->{ "%meta" }->{ Language }; 415 my $langid = $data->{ "%meta" }->{ LangId }; 416 417 $bulk .= 418 _generate_comment( $data, ";", ";" ) . 419 "\n" . 420 "LanguageNames = ($language=$langid:msg_$langid)\n" . 421 "\n"; 422 423 $bulk .= 424 "FacilityNames=(\n"; 425 foreach my $section ( @sections ) { 426 my $props = $sections->{ $section }; # Section properties. 427 $bulk .= 428 " $props->{ short }=" . $props->{ set } ."\n"; 429 }; # foreach $section 430 $bulk .= 431 ")\n\n"; 432 433 foreach my $section ( @sections ) { 434 my $short = $sections->{ $section }->{ short }; 435 my $n = 0; 436 foreach my $item ( @{ $data->{ $section } } ) { 437 my ( undef, $msg ) = @$item; 438 ++ $n; 439 $bulk .= 440 "MessageId=$n\n" . 441 "Facility=$short\n" . 442 "Language=$language\n" . 443 msg2mc( $msg ) . "\n.\n\n"; 444 }; # foreach $item 445 }; # foreach $section 446 447 $bulk .= 448 "\n" . 449 "; end of file ;\n"; 450 451 $bulk = encode( "UTF-16LE", $bulk ); # Convert text to UTF-16LE used in Windows* OS. 452 write_file( $file, \$bulk, -binary => 1 ); 453 454}; # sub generate_message_windows 455 456 457# 458# Parse command line. 459# 460 461my $input_file; 462my $enum_file; 463my $signature_file; 464my $default_file; 465my $message_file; 466my $id; 467my $prefix = ""; 468get_options( 469 "os=s" => \$target_os, 470 "enum-file=s" => \$enum_file, 471 "signature-file=s" => \$signature_file, 472 "default-file=s" => \$default_file, 473 "message-file=s" => \$message_file, 474 "id|lang-id" => \$id, 475 "prefix=s" => \$prefix, 476); 477if ( @ARGV == 0 ) { 478 cmdline_error( "No source file specified -- nothing to do" ); 479}; # if 480if ( @ARGV > 1 ) { 481 cmdline_error( "Too many source files specified" ); 482}; # if 483$input_file = $ARGV[ 0 ]; 484 485 486my $generate_message; 487if ( $target_os =~ m{\A(?:lin|mac)\z} ) { 488 $generate_message = \&generate_message_unix; 489} elsif ( $target_os eq "win" ) { 490 $generate_message = \&generate_message_windows; 491} else { 492 runtime_error( "OS \"$target_os\" is not supported" ); 493}; # if 494 495 496# 497# Do the work. 498# 499 500my $data = parse_source( $input_file ); 501if ( defined( $id ) ) { 502 print( $data->{ "%meta" }->{ LangId }, "\n" ); 503}; # if 504if ( defined( $enum_file ) ) { 505 generate_enum( $data, $enum_file, $prefix ); 506}; # if 507if ( defined( $signature_file ) ) { 508 generate_signature( $data, $signature_file ); 509}; # if 510if ( defined( $default_file ) ) { 511 generate_default( $data, $default_file, $prefix ); 512}; # if 513if ( defined( $message_file ) ) { 514 $generate_message->( $data, $message_file ); 515}; # if 516 517exit( 0 ); 518 519__END__ 520 521=pod 522 523=head1 NAME 524 525B<message-converter.pl> -- Convert message catalog source file into another text forms. 526 527=head1 SYNOPSIS 528 529B<message-converter.pl> I<option>... <file> 530 531=head1 OPTIONS 532 533=over 534 535=item B<--enum-file=>I<file> 536 537Generate enum file named I<file>. 538 539=item B<--default-file=>I<file> 540 541Generate default messages file named I<file>. 542 543=item B<--lang-id> 544 545Print language identifier of the message catalog source file. 546 547=item B<--message-file=>I<file> 548 549Generate message file. 550 551=item B<--signature-file=>I<file> 552 553Generate signature file. 554 555Signatures are used for checking compatibility. For example, to check a primary 556catalog and its translation to another language, signatures of both catalogs should be generated 557and compared. If signatures are identical, catalogs are compatible. 558 559=item B<--prefix=>I<prefix> 560 561Prefix to be used for all C identifiers (type and variable names) in enum and default messages 562files. 563 564=item B<--os=>I<str> 565 566Specify OS name the message formats to be converted for. If not specified explicitly, value of 567LIBOMP_OS environment variable is used. If LIBOMP_OS is not defined, host OS is detected. 568 569Depending on OS, B<message-converter.pl> converts message formats to GNU style or MS style. 570 571=item Standard Options 572 573=over 574 575=item B<--doc> 576 577=item B<--manual> 578 579Print full documentation and exit. 580 581=item B<--help> 582 583Print short help message and exit. 584 585=item B<--version> 586 587Print version string and exit. 588 589=back 590 591=back 592 593=head1 ARGUMENTS 594 595=over 596 597=item I<file> 598 599A name of input file. 600 601=back 602 603=head1 DESCRIPTION 604 605=head2 Message Catalog File Format 606 607It is plain text file in UTF-8 encoding. Empty lines and lines beginning with sharp sign (C<#>) are 608ignored. EBNF syntax of content: 609 610 catalog = { section }; 611 section = header body; 612 header = "-*- " section-id " -*-" "\n"; 613 body = { message }; 614 message = message-id string "\n" { string "\n" }; 615 section-id = identifier; 616 message-id = "OBSOLETE" | identifier; 617 identifier = letter { letter | digit | "_" }; 618 string = """ { character } """; 619 620Identifier starts with letter, with following letters, digits, and underscores. Identifiers are 621case-sensitive. Setion identifiers are fixed: C<META>, C<STRINGS>, C<FORMATS>, C<MESSAGES> and 622C<HINTS>. Message identifiers must be unique within section. Special C<OBSOLETE> pseudo-identifier 623may be used many times. 624 625String is a C string literal which must not cross line boundaries. 626Long messages may occupy multiple lines, a string per line. 627 628Message may include printf-like GNU-style placeholders for arguments: C<%I<n>$I<t>>, 629where I<n> is argument number (C<1>, C<2>, ...), 630I<t> -- argument type, C<s> (string) or C<d> (32-bit integer). 631 632See also comments in F<i18n/en_US.txt>. 633 634=head2 Output Files 635 636This script can generate 3 different text files from single source: 637 638=over 639 640=item Enum file. 641 642Enum file is a C include file, containing definitions of message identifiers, e. g.: 643 644 enum kmp_i18n_id { 645 646 // Set #1, meta. 647 kmp_i18n_prp_first = 65536, 648 kmp_i18n_prp_Language, 649 kmp_i18n_prp_Country, 650 kmp_i18n_prp_LangId, 651 kmp_i18n_prp_Version, 652 kmp_i18n_prp_Revision, 653 kmp_i18n_prp_last, 654 655 // Set #2, strings. 656 kmp_i18n_str_first = 131072, 657 kmp_i18n_str_Error, 658 kmp_i18n_str_UnknownFile, 659 kmp_i18n_str_NotANumber, 660 ... 661 662 // Set #3, formats. 663 ... 664 665 kmp_i18n_xxx_lastest 666 667 }; // enum kmp_i18n_id 668 669 typedef enum kmp_i18n_id kmp_i18n_id_t; 670 671=item Default messages file. 672 673Default messages file is a C include file containing default messages to be embedded into 674application (and used if external message catalog does not exist or could not be open): 675 676 static char const * 677 __kmp_i18n_default_meta[] = 678 { 679 NULL, 680 "English", 681 "USA", 682 "1033", 683 "2", 684 "20090806", 685 NULL 686 }; 687 688 static char const * 689 __kmp_i18n_default_strings[] = 690 { 691 "Error", 692 "(unknown file)", 693 "not a number", 694 ... 695 NULL 696 }; 697 698 ... 699 700=item Message file. 701 702Message file is an input for message compiler, F<gencat> on Linux* OS and OS X*, or F<mc.exe> on 703Windows* OS. 704 705Here is the example of Linux* OS message file: 706 707 $quote " 708 1 "Japanese" 709 2 "Japan" 710 3 "1041" 711 4 "2" 712 5 "Based on English message catalog revision 20090806" 713 ... 714 715Example of Windows* OS message file: 716 717 LanguageNames = (Japanese=10041:msg_1041) 718 719 FacilityNames = ( 720 prp=1 721 str=2 722 fmt=3 723 ... 724 ) 725 726 MessageId=1 727 Facility=prp 728 Language=Japanese 729 Japanese 730 . 731 732 ... 733 734=item Signature. 735 736Signature is a processed source file: comments stripped, strings deleted, but placeholders kept and 737sorted. 738 739 -*- FORMATS-*- 740 741 Info %1$d %2$s 742 Warning %1$d %2$s 743 Fatal %1$d %2$s 744 SysErr %1$d %2$s 745 Hint %1$- %2$s 746 Pragma %1$s %2$s %3$s %4$s 747 748The purpose of signatures -- compare two message source files for compatibility. If signatures of 749two message sources are the same, binary message catalogs will be compatible. 750 751=back 752 753=head1 EXAMPLES 754 755Generate include file containing message identifiers: 756 757 $ message-converter.pl --enum-file=kmp_i18n_id.inc en_US.txt 758 759Generate include file contating default messages: 760 761 $ message-converter.pl --default-file=kmp_i18n_default.inc en_US.txt 762 763Generate input file for message compiler, Linux* OS example: 764 765 $ message-converter.pl --message-file=ru_RU.UTF-8.msg ru_RU.txt 766 767Generate input file for message compiler, Windows* OS example: 768 769 > message-converter.pl --message-file=ru_RU.UTF-8.mc ru_RU.txt 770 771=cut 772 773# end of file # 774 775