1#!/usr/bin/perl
2
3#
4#//===----------------------------------------------------------------------===//
5#//
6#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
7#// See https://llvm.org/LICENSE.txt for license information.
8#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
9#//
10#//===----------------------------------------------------------------------===//
11#
12
13use strict;
14use warnings;
15
16use File::Glob ":glob";
17use Encode qw{ encode };
18
19use FindBin;
20use lib "$FindBin::Bin/lib";
21
22use tools;
23
24our $VERSION = "0.04";
25my $escape      = qr{%};
26my $placeholder = qr{(\d)\$(s|l?[du])};
27my $target_os;
28
29my $sections =
30    {
31        meta     => { short => "prp" }, # "prp" stands for "property".
32        strings  => { short => "str" },
33        formats  => { short => "fmt" },
34        messages => { short => "msg" },
35        hints    => { short => "hnt" },
36    };
37my @sections = qw{ meta strings formats messages hints };
38# Assign section properties: long name, set number, base number.
39map( $sections->{ $sections[ $_ ] }->{ long } = $sections[ $_ ],      ( 0 .. @sections - 1 ) );
40map( $sections->{ $sections[ $_ ] }->{ set  } = ( $_ + 1 ),           ( 0 .. @sections - 1 ) );
41map( $sections->{ $sections[ $_ ] }->{ base } = ( ( $_ + 1 ) << 16 ), ( 0 .. @sections - 1 ) );
42
43# Properties of Meta section.
44my @properties = qw{ Language Country LangId Version Revision };
45
46
47sub _generate_comment($$$) {
48
49    my ( $data, $open, $close ) = @_;
50    my $bulk =
51        $open . " Do not edit this file! " . $close . "\n" .
52        $open . " The file was generated from " . get_file( $data->{ "%meta" }->{ source } ) .
53            " by " . $tool . " on " . localtime() . ". " . $close . "\n";
54    return $bulk;
55
56}; # sub _generate_comment
57
58
59sub msg2sgn($) {
60
61    # Convert message string to signature. Signature is a list of placeholders in sorted order.
62    # For example, signature of "%1$s value \"%2$s\" is invalid." is "%1$s %2$s".
63
64    my ( $msg ) = @_;
65    my @placeholders;
66    pos( $msg ) = 0;
67    while ( $msg =~ m{\G.*?$escape$placeholder}g ) {
68        $placeholders[ $1 - 1 ] = "%$1\$$2";
69    }; # while
70    for ( my $i = 1; $i <= @placeholders; ++ $i ) {
71        if ( not defined( $placeholders[ $i - 1 ] ) ) {
72            $placeholders[ $i - 1 ] = "%$i\$-";
73        }; # if
74    }; # for $i
75    return join( " ", @placeholders );
76
77}; # sub msg2sgn
78
79
80sub msg2src($) {
81
82    # Convert message string to a C string constant.
83
84    my ( $msg ) = @_;
85    if ( $target_os eq "win" ) {
86        $msg =~ s{$escape$placeholder}{\%$1!$2!}g;
87    }; # if
88    return $msg;
89
90}; # sub msg2src
91
92
93my $special =
94    {
95        "n" => "\n",
96        "t" => "\t",
97    };
98
99sub msg2mc($) {
100    my ( $msg ) = @_;
101    $msg = msg2src( $msg ); # Get windows style placeholders.
102    $msg =~ s{\\(.)}{ exists( $special->{ $1 } ) ? $special->{ $1 } : $1 }ge;
103    return $msg;
104}; # sub msg2mc
105
106
107
108sub parse_message($) {
109
110    my ( $msg ) = @_;
111    pos( $msg ) = 0;
112    for ( ; ; ) {
113        if ( $msg !~ m{\G.*?$escape}gc ) {
114            last;
115        }
116        if ( $msg !~ m{\G$placeholder}gc ) {
117            return "Bad %-sequence near \"%" . substr( $msg, pos( $msg ), 7 ) . "\"";
118        }; # if
119    }; # forever
120    return undef;
121
122}; # sub parse_message
123
124
125sub parse_source($) {
126
127    my ( $name ) = @_;
128
129    my @bulk = read_file( $name, -layer => ":utf8" );
130    my $data = {};
131
132    my $line;
133    my $n = 0;         # Line number.
134    my $obsolete = 0;  # Counter of obsolete entries.
135    my $last_idx;
136    my %idents;
137    my $section;
138
139    my $error =
140        sub {
141            my ( $n, $line, $msg ) = @_;
142            runtime_error( "Error parsing $name line $n: " . "$msg:\n" . "    $line" );
143        }; # sub
144
145    foreach $line ( @bulk ) {
146        ++ $n;
147        # Skip empty lines and comments.
148        if ( $line =~ m{\A\s*(\n|#)} ) {
149            $last_idx = undef;
150            next;
151        }; # if
152        # Parse section header.
153        if ( $line =~ m{\A-\*-\s*([A-Z_]*)\s*-\*-\s*\n\z}i ) {
154            $section = ( lc( $1 ) );
155            if ( not grep( $section eq $_, @sections ) ) {
156                $error->( $n, $line, "Unknown section \"$section\" specified" );
157            }; # if
158            if ( exists( $data->{ $section } ) ) {
159                $error->( $n, $line, "Multiple sections of the same type specified" );
160            }; # if
161            %idents = ();     # Clean list of known message identifiers.
162            next;
163        }; # if
164        if ( not defined( $section ) ) {
165            $error->( $n, $line, "Section heading expected" );
166        }; # if
167        # Parse section body.
168        if ( $section eq "meta" ) {
169            if ( $line =~ m{\A([A-Z_][A-Z_0-9]*)\s+"(.*)"\s*?\n?\z}i ) {
170                # Parse meta properties (such as Language, Country, and LangId).
171                my ( $property, $value ) = ( $1, $2 );
172                if ( not grep( $_ eq $property , @properties ) ) {
173                    $error->( $n, $line, "Unknown property \"$property\" specified" );
174                }; # if
175                if ( exists( $data->{ "%meta" }->{ $property } ) ) {
176                    $error->( $n, $line, "Property \"$property\" has already been specified" );
177                }; # if
178                $data->{ "%meta" }->{ $property } = $value;
179                $last_idx = undef;
180                next;
181            }; # if
182            $error->( $n, $line, "Property line expected" );
183        }; # if
184        # Parse message.
185        if ( $line =~ m{\A([A-Z_][A-Z_0-9]*)\s+"(.*)"\s*?\n?\z}i ) {
186            my ( $ident, $message ) = ( $1, $2 );
187            if ( $ident eq "OBSOLETE" ) {
188                # If id is "OBSOLETE", add a unique suffix. It provides convenient way to mark
189                # obsolete messages.
190                ++ $obsolete;
191                $ident .= $obsolete;
192            }; # if
193            if ( exists( $idents{ $ident } ) ) {
194                $error->( $n, $line, "Identifier \"$ident\" is redefined" );
195            }; # if
196            # Check %-sequences.
197            my $err = parse_message( $message );
198            if ( $err ) {
199                $error->( $n, $line, $err );
200            }; # if
201            # Save message.
202            push( @{ $data->{ $section } }, [ $ident, $message ] );
203            $idents{ $ident } = 1;
204            $last_idx = @{ $data->{ $section } } - 1;
205            next;
206        }; # if
207        # Parse continuation line.
208        if ( $line =~ m{\A\s*"(.*)"\s*\z} ) {
209            my $message = $1;
210            if ( not defined( $last_idx )  ) {
211                $error->( $n, $line, "Unexpected continuation line" );
212            }; # if
213            # Check %-sequences.
214            my $err = parse_message( $message );
215            if ( $err ) {
216                $error->( $n, $line, $err );
217            }; # if
218            # Save continuation.
219            $data->{ $section }->[ $last_idx ]->[ 1 ] .= $message;
220            next;
221        }; # if
222        $error->( $n, $line, "Message definition expected" );
223    }; # foreach
224    $data->{ "%meta" }->{ source } = $name;
225    foreach my $section ( @sections ) {
226        if ( not exists( $data->{ $section } ) ) {
227            $data->{ $section } = [];
228        }; # if
229    }; # foreach $section
230
231    foreach my $property ( @properties ) {
232        if ( not defined( $data->{ "%meta" }->{ $property } ) ) {
233            runtime_error(
234                "Error parsing $name: " .
235                    "Required \"$property\" property is not specified"
236            );
237        }; # if
238        push( @{ $data->{ meta } }, [ $property, $data->{ "%meta" }->{ $property } ] );
239    }; # foreach
240
241    return $data;
242
243}; # sub parse_source
244
245
246sub generate_enum($$$) {
247
248    my ( $data, $file, $prefix ) = @_;
249    my $bulk = "";
250
251    $bulk =
252        _generate_comment( $data, "//", "//" ) .
253        "\n" .
254        "enum ${prefix}_id {\n\n" .
255        "    // A special id for absence of message.\n" .
256        "    ${prefix}_null = 0,\n\n";
257
258    foreach my $section ( @sections ) {
259        my $props = $sections->{ $section };    # Section properties.
260        my $short = $props->{ short };          # Short section name, frequently used.
261        $bulk .=
262            "    // Set #$props->{ set }, $props->{ long }.\n" .
263            "    ${prefix}_${short}_first = $props->{ base },\n";
264        foreach my $item ( @{ $data->{ $section } } ) {
265            my ( $ident, undef ) = @$item;
266            $bulk .= "    ${prefix}_${short}_${ident},\n";
267        }; # foreach
268        $bulk .= "    ${prefix}_${short}_last,\n\n";
269    }; # foreach $type
270    $bulk .= "    ${prefix}_xxx_lastest\n\n";
271
272    $bulk .=
273        "}; // enum ${prefix}_id\n" .
274        "\n" .
275        "typedef enum ${prefix}_id  ${prefix}_id_t;\n" .
276        "\n";
277
278    $bulk .=
279        "\n" .
280        "// end of file //\n";
281
282    write_file( $file, \$bulk );
283
284}; # sub generate_enum
285
286
287sub generate_signature($$) {
288
289    my ( $data, $file ) = @_;
290    my $bulk = "";
291
292    $bulk .= "// message catalog signature file //\n\n";
293
294    foreach my $section ( @sections ) {
295        my $props = $sections->{ $section };    # Section properties.
296        my $short = $props->{ short };          # Short section name, frequently used.
297        $bulk .= "-*- " . uc( $props->{ long } ) . "-*-\n\n";
298        foreach my $item ( @{ $data->{ $section } } ) {
299            my ( $ident, $msg ) = @$item;
300            $bulk .= sprintf( "%-40s %s\n", $ident, msg2sgn( $msg ) );
301        }; # foreach
302        $bulk .= "\n";
303    }; # foreach $type
304
305    $bulk .= "// end of file //\n";
306
307    write_file( $file, \$bulk );
308
309}; # sub generate_signature
310
311
312sub generate_default($$$) {
313
314    my ( $data, $file, $prefix ) = @_;
315    my $bulk = "";
316
317    $bulk .=
318        _generate_comment( $data, "//", "//" ) .
319        "\n";
320
321    foreach my $section ( @sections ) {
322        $bulk .=
323            "static char const *\n" .
324            "__${prefix}_default_${section}" . "[] =\n" .
325            "    {\n" .
326            "        NULL,\n";
327        foreach my $item ( @{ $data->{ $section } } ) {
328            my ( undef, $msg ) = @$item;
329            $bulk .= "        \"" . msg2src( $msg ) . "\",\n";
330        }; # while
331        $bulk .=
332            "        NULL\n" .
333            "    };\n" .
334            "\n";
335    }; # foreach $type
336
337    $bulk .=
338        "struct kmp_i18n_section {\n" .
339        "    int           size;\n" .
340        "    char const ** str;\n" .
341        "}; // struct kmp_i18n_section\n" .
342        "typedef struct kmp_i18n_section  kmp_i18n_section_t;\n" .
343        "\n" .
344        "static kmp_i18n_section_t\n" .
345        "__${prefix}_sections[] =\n" .
346        "    {\n" .
347        "        { 0, NULL },\n";
348    foreach my $section ( @sections ) {
349        $bulk .=
350            "        { " . @{ $data->{ $section } } . ", __${prefix}_default_${section} },\n";
351    }; # foreach $type
352    $bulk .=
353        "        { 0, NULL }\n" .
354        "    };\n" .
355        "\n";
356
357    $bulk .=
358        "struct kmp_i18n_table {\n" .
359        "    int                   size;\n" .
360        "    kmp_i18n_section_t *  sect;\n" .
361        "}; // struct kmp_i18n_table\n" .
362        "typedef struct kmp_i18n_table  kmp_i18n_table_t;\n" .
363        "\n" .
364        "static kmp_i18n_table_t __kmp_i18n_default_table =\n" .
365        "    {\n" .
366        "        " . @sections . ",\n" .
367        "        __kmp_i18n_sections\n" .
368        "    };\n" .
369        "\n" .
370        "// end of file //\n";
371
372    write_file( $file, \$bulk );
373
374}; # sub generate_default
375
376
377sub generate_message_unix($$) {
378
379    my ( $data, $file ) = @_;
380    my $bulk     = "";
381
382    $bulk .=
383        _generate_comment( $data, "\$", "\$" ) .
384        "\n" .
385        "\$quote \"\n\n";
386
387    foreach my $section ( @sections ) {
388        $bulk .=
389            "\$ " . ( "-" x 78 ) . "\n\$ $section\n\$ " . ( "-" x 78 ) . "\n\n" .
390            "\$set $sections->{ $section }->{ set }\n" .
391            "\n";
392        my $n = 0;
393        foreach my $item ( @{ $data->{ $section } } ) {
394            my ( undef, $msg ) = @$item;
395            ++ $n;
396            $bulk .= "$n \"" . msg2src( $msg ) . "\"\n";
397        }; # foreach
398        $bulk .= "\n";
399    }; # foreach $type
400
401    $bulk .=
402        "\n" .
403        "\$ end of file \$\n";
404
405    write_file( $file, \$bulk, -layer => ":utf8" );
406
407}; # sub generate_message_linux
408
409
410sub generate_message_windows($$) {
411
412    my ( $data, $file ) = @_;
413    my $bulk = "";
414    my $language = $data->{ "%meta" }->{ Language };
415    my $langid   = $data->{ "%meta" }->{ LangId };
416
417    $bulk .=
418        _generate_comment( $data, ";", ";" ) .
419        "\n" .
420        "LanguageNames = ($language=$langid:msg_$langid)\n" .
421        "\n";
422
423    $bulk .=
424        "FacilityNames=(\n";
425    foreach my $section ( @sections ) {
426        my $props = $sections->{ $section };    # Section properties.
427        $bulk .=
428            " $props->{ short }=" . $props->{ set } ."\n";
429    }; # foreach $section
430    $bulk .=
431        ")\n\n";
432
433    foreach my $section ( @sections ) {
434        my $short = $sections->{ $section }->{ short };
435        my $n = 0;
436        foreach my $item ( @{ $data->{ $section } } ) {
437            my ( undef, $msg ) = @$item;
438            ++ $n;
439            $bulk .=
440                "MessageId=$n\n" .
441                "Facility=$short\n" .
442                "Language=$language\n" .
443                msg2mc( $msg ) . "\n.\n\n";
444        }; # foreach $item
445    }; # foreach $section
446
447    $bulk .=
448        "\n" .
449        "; end of file ;\n";
450
451    $bulk = encode( "UTF-16LE", $bulk ); # Convert text to UTF-16LE used in Windows* OS.
452    write_file( $file, \$bulk, -binary => 1 );
453
454}; # sub generate_message_windows
455
456
457#
458# Parse command line.
459#
460
461my $input_file;
462my $enum_file;
463my $signature_file;
464my $default_file;
465my $message_file;
466my $id;
467my $prefix = "";
468get_options(
469    "os=s"             => \$target_os,
470    "enum-file=s"      => \$enum_file,
471    "signature-file=s" => \$signature_file,
472    "default-file=s"   => \$default_file,
473    "message-file=s"   => \$message_file,
474    "id|lang-id"       => \$id,
475    "prefix=s"	       => \$prefix,
476);
477if ( @ARGV == 0 ) {
478    cmdline_error( "No source file specified -- nothing to do" );
479}; # if
480if ( @ARGV > 1 ) {
481    cmdline_error( "Too many source files specified" );
482}; # if
483$input_file = $ARGV[ 0 ];
484
485
486my $generate_message;
487if ( $target_os =~ m{\A(?:lin|mac)\z} ) {
488    $generate_message = \&generate_message_unix;
489} elsif ( $target_os eq "win" ) {
490    $generate_message = \&generate_message_windows;
491} else {
492    runtime_error( "OS \"$target_os\" is not supported" );
493}; # if
494
495
496#
497# Do the work.
498#
499
500my $data = parse_source( $input_file );
501if ( defined( $id ) ) {
502    print( $data->{ "%meta" }->{ LangId }, "\n" );
503}; # if
504if ( defined( $enum_file ) ) {
505    generate_enum( $data, $enum_file, $prefix );
506}; # if
507if ( defined( $signature_file ) ) {
508    generate_signature( $data, $signature_file );
509}; # if
510if ( defined( $default_file ) ) {
511    generate_default( $data, $default_file, $prefix );
512}; # if
513if ( defined( $message_file ) ) {
514    $generate_message->( $data, $message_file );
515}; # if
516
517exit( 0 );
518
519__END__
520
521=pod
522
523=head1 NAME
524
525B<message-converter.pl> -- Convert message catalog source file into another text forms.
526
527=head1 SYNOPSIS
528
529B<message-converter.pl> I<option>... <file>
530
531=head1 OPTIONS
532
533=over
534
535=item B<--enum-file=>I<file>
536
537Generate enum file named I<file>.
538
539=item B<--default-file=>I<file>
540
541Generate default messages file named I<file>.
542
543=item B<--lang-id>
544
545Print language identifier of the message catalog source file.
546
547=item B<--message-file=>I<file>
548
549Generate message file.
550
551=item B<--signature-file=>I<file>
552
553Generate signature file.
554
555Signatures are used for checking compatibility. For example, to check a primary
556catalog and its translation to another language, signatures of both catalogs should be generated
557and compared. If signatures are identical, catalogs are compatible.
558
559=item B<--prefix=>I<prefix>
560
561Prefix to be used for all C identifiers (type and variable names) in enum and default messages
562files.
563
564=item B<--os=>I<str>
565
566Specify OS name the message formats to be converted for. If not specified explicitly, value of
567LIBOMP_OS environment variable is used. If LIBOMP_OS is not defined, host OS is detected.
568
569Depending on OS, B<message-converter.pl> converts message formats to GNU style or MS style.
570
571=item Standard Options
572
573=over
574
575=item B<--doc>
576
577=item B<--manual>
578
579Print full documentation and exit.
580
581=item B<--help>
582
583Print short help message and exit.
584
585=item B<--version>
586
587Print version string and exit.
588
589=back
590
591=back
592
593=head1 ARGUMENTS
594
595=over
596
597=item I<file>
598
599A name of input file.
600
601=back
602
603=head1 DESCRIPTION
604
605=head2 Message Catalog File Format
606
607It is plain text file in UTF-8 encoding. Empty lines and lines beginning with sharp sign (C<#>) are
608ignored. EBNF syntax of content:
609
610    catalog    = { section };
611    section    = header body;
612    header     = "-*- " section-id " -*-" "\n";
613    body       = { message };
614    message    = message-id string "\n" { string "\n" };
615    section-id = identifier;
616    message-id = "OBSOLETE" | identifier;
617    identifier = letter { letter | digit | "_" };
618    string     = """ { character } """;
619
620Identifier starts with letter, with following letters, digits, and underscores. Identifiers are
621case-sensitive. Setion identifiers are fixed: C<META>, C<STRINGS>, C<FORMATS>, C<MESSAGES> and
622C<HINTS>. Message identifiers must be unique within section. Special C<OBSOLETE> pseudo-identifier
623may be used many times.
624
625String is a C string literal which must not cross line boundaries.
626Long messages may occupy multiple lines, a string per line.
627
628Message may include printf-like GNU-style placeholders for arguments: C<%I<n>$I<t>>,
629where I<n> is argument number (C<1>, C<2>, ...),
630I<t> -- argument type, C<s> (string) or C<d> (32-bit integer).
631
632See also comments in F<i18n/en_US.txt>.
633
634=head2 Output Files
635
636This script can generate 3 different text files from single source:
637
638=over
639
640=item Enum file.
641
642Enum file is a C include file, containing definitions of message identifiers, e. g.:
643
644    enum kmp_i18n_id {
645
646        // Set #1, meta.
647        kmp_i18n_prp_first = 65536,
648        kmp_i18n_prp_Language,
649        kmp_i18n_prp_Country,
650        kmp_i18n_prp_LangId,
651        kmp_i18n_prp_Version,
652        kmp_i18n_prp_Revision,
653        kmp_i18n_prp_last,
654
655        // Set #2, strings.
656        kmp_i18n_str_first = 131072,
657        kmp_i18n_str_Error,
658        kmp_i18n_str_UnknownFile,
659        kmp_i18n_str_NotANumber,
660        ...
661
662        // Set #3, formats.
663        ...
664
665        kmp_i18n_xxx_lastest
666
667    }; // enum kmp_i18n_id
668
669    typedef enum kmp_i18n_id  kmp_i18n_id_t;
670
671=item Default messages file.
672
673Default messages file is a C include file containing default messages to be embedded into
674application (and used if external message catalog does not exist or could not be open):
675
676    static char const *
677    __kmp_i18n_default_meta[] =
678        {
679            NULL,
680            "English",
681            "USA",
682            "1033",
683            "2",
684            "20090806",
685            NULL
686        };
687
688    static char const *
689    __kmp_i18n_default_strings[] =
690        {
691            "Error",
692            "(unknown file)",
693            "not a number",
694            ...
695            NULL
696        };
697
698    ...
699
700=item Message file.
701
702Message file is an input for message compiler, F<gencat> on Linux* OS and OS X*, or F<mc.exe> on
703Windows* OS.
704
705Here is the example of Linux* OS message file:
706
707    $quote "
708    1 "Japanese"
709    2 "Japan"
710    3 "1041"
711    4 "2"
712    5 "Based on English message catalog revision 20090806"
713    ...
714
715Example of Windows* OS message file:
716
717    LanguageNames = (Japanese=10041:msg_1041)
718
719    FacilityNames = (
720     prp=1
721     str=2
722     fmt=3
723     ...
724    )
725
726    MessageId=1
727    Facility=prp
728    Language=Japanese
729    Japanese
730    .
731
732    ...
733
734=item Signature.
735
736Signature is a processed source file: comments stripped, strings deleted, but placeholders kept and
737sorted.
738
739    -*- FORMATS-*-
740
741    Info                                     %1$d %2$s
742    Warning                                  %1$d %2$s
743    Fatal                                    %1$d %2$s
744    SysErr                                   %1$d %2$s
745    Hint                                     %1$- %2$s
746    Pragma                                   %1$s %2$s %3$s %4$s
747
748The purpose of signatures -- compare two message source files for compatibility. If signatures of
749two message sources are the same, binary message catalogs will be compatible.
750
751=back
752
753=head1 EXAMPLES
754
755Generate include file containing message identifiers:
756
757    $ message-converter.pl --enum-file=kmp_i18n_id.inc en_US.txt
758
759Generate include file contating default messages:
760
761    $ message-converter.pl --default-file=kmp_i18n_default.inc en_US.txt
762
763Generate input file for message compiler, Linux* OS example:
764
765    $ message-converter.pl --message-file=ru_RU.UTF-8.msg ru_RU.txt
766
767Generate input file for message compiler, Windows* OS example:
768
769    > message-converter.pl --message-file=ru_RU.UTF-8.mc ru_RU.txt
770
771=cut
772
773# end of file #
774
775