runtime/tools/message-converter.pl

#!/usr/bin/perl

#
#//===----------------------------------------------------------------------===//
#//
#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
#// See https://llvm.org/LICENSE.txt for license information.
#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#//
#//===----------------------------------------------------------------------===//
#

use strict;
use warnings;

use File::Glob ":glob";
use Encode qw{ encode };

use FindBin;
use lib "$FindBin::Bin/lib";

use tools;

our $VERSION = "0.04";
my $escape      = qr{%};
my $placeholder = qr{(\d)\$(s|l?[du])};
my $target_os;

my $sections =
    {
        meta     => { short => "prp" }, # "prp" stands for "property".
        strings  => { short => "str" },
        formats  => { short => "fmt" },
        messages => { short => "msg" },
        hints    => { short => "hnt" },
    };
my @sections = qw{ meta strings formats messages hints };
# Assign section properties: long name, set number, base number.
map( $sections->{ $sections[ $_ ] }->{ long } = $sections[ $_ ],      ( 0 .. @sections - 1 ) );
map( $sections->{ $sections[ $_ ] }->{ set  } = ( $_ + 1 ),           ( 0 .. @sections - 1 ) );
map( $sections->{ $sections[ $_ ] }->{ base } = ( ( $_ + 1 ) << 16 ), ( 0 .. @sections - 1 ) );

# Properties of Meta section.
my @properties = qw{ Language Country LangId Version Revision };


sub _generate_comment($$$) {

    my ( $data, $open, $close ) = @_;
    my $bulk =
        $open . " Do not edit this file! " . $close . "\n" .
        $open . " The file was generated from " . get_file( $data->{ "%meta" }->{ source } ) .
            " by " . $tool . " on " . localtime() . ". " . $close . "\n";
    return $bulk;

}; # sub _generate_comment


sub msg2sgn($) {

    # Convert message string to signature. Signature is a list of placeholders in sorted order.
    # For example, signature of "%1$s value \"%2$s\" is invalid." is "%1$s %2$s".

    my ( $msg ) = @_;
    my @placeholders;
    pos( $msg ) = 0;
    while ( $msg =~ m{\G.*?$escape$placeholder}g ) {
        $placeholders[ $1 - 1 ] = "%$1\$$2";
    }; # while
    for ( my $i = 1; $i <= @placeholders; ++ $i ) {
        if ( not defined( $placeholders[ $i - 1 ] ) ) {
            $placeholders[ $i - 1 ] = "%$i\$-";
        }; # if
    }; # for $i
    return join( " ", @placeholders );

}; # sub msg2sgn


sub msg2src($) {

    # Convert message string to a C string constant.

    my ( $msg ) = @_;
    if ( $target_os eq "win" ) {
        $msg =~ s{$escape$placeholder}{\%$1!$2!}g;
    }; # if
    return $msg;

}; # sub msg2src


my $special =
    {
        "n" => "\n",
        "t" => "\t",
    };

sub msg2mc($) {
    my ( $msg ) = @_;
    $msg = msg2src( $msg ); # Get windows style placeholders.
    $msg =~ s{\\(.)}{ exists( $special->{ $1 } ) ? $special->{ $1 } : $1 }ge;
    return $msg;
}; # sub msg2mc


sub parse_message($) {

    my ( $msg ) = @_;
    pos( $msg ) = 0;
    for ( ; ; ) {
        if ( $msg !~ m{\G.*?$escape}gc ) {
            last;
        }
        if ( $msg !~ m{\G$placeholder}gc ) {
            return "Bad %-sequence near \"%" . substr( $msg, pos( $msg ), 7 ) . "\"";
        }; # if
    }; # forever
    return undef;

}; # sub parse_message


sub parse_source($) {

    my ( $name ) = @_;

    my @bulk = read_file( $name, -layer => ":utf8" );
    my $data = {};

    my $line;
    my $n = 0;         # Line number.
    my $obsolete = 0;  # Counter of obsolete entries.
    my $last_idx;
    my %idents;
    my $section;

    my $error =
        sub {
            my ( $n, $line, $msg ) = @_;
            runtime_error( "Error parsing $name line $n: " . "$msg:\n" . "    $line" );
        }; # sub

    foreach $line ( @bulk ) {
        ++ $n;
        # Skip empty lines and comments.
        if ( $line =~ m{\A\s*(\n|#)} ) {
            $last_idx = undef;
            next;
        }; # if
        # Parse section header.
        if ( $line =~ m{\A-\*-\s*([A-Z_]*)\s*-\*-\s*\n\z}i ) {
            $section = ( lc( $1 ) );
            if ( not grep( $section eq $_, @sections ) ) {
                $error->( $n, $line, "Unknown section \"$section\" specified" );
            }; # if
            if ( exists( $data->{ $section } ) ) {
                $error->( $n, $line, "Multiple sections of the same type specified" );
            }; # if
            %idents = ();     # Clean list of known message identifiers.
            next;
        }; # if
        if ( not defined( $section ) ) {
            $error->( $n, $line, "Section heading expected" );
        }; # if
        # Parse section body.
        if ( $section eq "meta" ) {
            if ( $line =~ m{\A([A-Z_][A-Z_0-9]*)\s+"(.*)"\s*?\n?\z}i ) {
                # Parse meta properties (such as Language, Country, and LangId).
                my ( $property, $value ) = ( $1, $2 );
                if ( not grep( $_ eq $property , @properties ) ) {
                    $error->( $n, $line, "Unknown property \"$property\" specified" );
                }; # if
                if ( exists( $data->{ "%meta" }->{ $property } ) ) {
                    $error->( $n, $line, "Property \"$property\" has already been specified" );
                }; # if
                $data->{ "%meta" }->{ $property } = $value;
                $last_idx = undef;
                next;
            }; # if
            $error->( $n, $line, "Property line expected" );
        }; # if
        # Parse message.
        if ( $line =~ m{\A([A-Z_][A-Z_0-9]*)\s+"(.*)"\s*?\n?\z}i ) {
            my ( $ident, $message ) = ( $1, $2 );
            if ( $ident eq "OBSOLETE" ) {
                # If id is "OBSOLETE", add a unique suffix. It provides convenient way to mark
                # obsolete messages.
                ++ $obsolete;
                $ident .= $obsolete;
            }; # if
            if ( exists( $idents{ $ident } ) ) {
                $error->( $n, $line, "Identifier \"$ident\" is redefined" );
            }; # if
            # Check %-sequences.
            my $err = parse_message( $message );
            if ( $err ) {
                $error->( $n, $line, $err );
            }; # if
            # Save message.
            push( @{ $data->{ $section } }, [ $ident, $message ] );
            $idents{ $ident } = 1;
            $last_idx = @{ $data->{ $section } } - 1;
            next;
        }; # if
        # Parse continuation line.
        if ( $line =~ m{\A\s*"(.*)"\s*\z} ) {
            my $message = $1;
            if ( not defined( $last_idx )  ) {
                $error->( $n, $line, "Unexpected continuation line" );
            }; # if
            # Check %-sequences.
            my $err = parse_message( $message );
            if ( $err ) {
                $error->( $n, $line, $err );
            }; # if
            # Save continuation.
            $data->{ $section }->[ $last_idx ]->[ 1 ] .= $message;
            next;
        }; # if
        $error->( $n, $line, "Message definition expected" );
    }; # foreach
    $data->{ "%meta" }->{ source } = $name;
    foreach my $section ( @sections ) {
        if ( not exists( $data->{ $section } ) ) {
            $data->{ $section } = [];
        }; # if
    }; # foreach $section

    foreach my $property ( @properties ) {
        if ( not defined( $data->{ "%meta" }->{ $property } ) ) {
            runtime_error(
                "Error parsing $name: " .
                    "Required \"$property\" property is not specified"
            );
        }; # if
        push( @{ $data->{ meta } }, [ $property, $data->{ "%meta" }->{ $property } ] );
    }; # foreach

    return $data;

}; # sub parse_source


sub generate_enum($$$) {

    my ( $data, $file, $prefix ) = @_;
    my $bulk = "";

    $bulk =
        _generate_comment( $data, "//", "//" ) .
        "\n" .
        "enum ${prefix}_id {\n\n" .
        "    // A special id for absence of message.\n" .
        "    ${prefix}_null = 0,\n\n";

    foreach my $section ( @sections ) {
        my $props = $sections->{ $section };    # Section properties.
        my $short = $props->{ short };          # Short section name, frequently used.
        $bulk .=
            "    // Set #$props->{ set }, $props->{ long }.\n" .
            "    ${prefix}_${short}_first = $props->{ base },\n";
        foreach my $item ( @{ $data->{ $section } } ) {
            my ( $ident, undef ) = @$item;
            $bulk .= "    ${prefix}_${short}_${ident},\n";
        }; # foreach
        $bulk .= "    ${prefix}_${short}_last,\n\n";
    }; # foreach $type
    $bulk .= "    ${prefix}_xxx_lastest\n\n";

    $bulk .=
        "}; // enum ${prefix}_id\n" .
        "\n" .
        "typedef enum ${prefix}_id  ${prefix}_id_t;\n" .
        "\n";

    $bulk .=
        "\n" .
        "// end of file //\n";

    write_file( $file, \$bulk );

}; # sub generate_enum


sub generate_signature($$) {

    my ( $data, $file ) = @_;
    my $bulk = "";

    $bulk .= "// message catalog signature file //\n\n";

    foreach my $section ( @sections ) {
        my $props = $sections->{ $section };    # Section properties.
        my $short = $props->{ short };          # Short section name, frequently used.
        $bulk .= "-*- " . uc( $props->{ long } ) . "-*-\n\n";
        foreach my $item ( @{ $data->{ $section } } ) {
            my ( $ident, $msg ) = @$item;
            $bulk .= sprintf( "%-40s %s\n", $ident, msg2sgn( $msg ) );
        }; # foreach
        $bulk .= "\n";
    }; # foreach $type

    $bulk .= "// end of file //\n";

    write_file( $file, \$bulk );

}; # sub generate_signature


sub generate_default($$$) {

    my ( $data, $file, $prefix ) = @_;
    my $bulk = "";

    $bulk .=
        _generate_comment( $data, "//", "//" ) .
        "\n";

    foreach my $section ( @sections ) {
        $bulk .=
            "static char const *\n" .
            "__${prefix}_default_${section}" . "[] =\n" .
            "    {\n" .
            "        NULL,\n";
        foreach my $item ( @{ $data->{ $section } } ) {
            my ( undef, $msg ) = @$item;
            $bulk .= "        \"" . msg2src( $msg ) . "\",\n";
        }; # while
        $bulk .=
            "        NULL\n" .
            "    };\n" .
            "\n";
    }; # foreach $type

    $bulk .=
        "struct kmp_i18n_section {\n" .
        "    int           size;\n" .
        "    char const ** str;\n" .
        "}; // struct kmp_i18n_section\n" .
        "typedef struct kmp_i18n_section  kmp_i18n_section_t;\n" .
        "\n" .
        "static kmp_i18n_section_t\n" .
        "__${prefix}_sections[] =\n" .
        "    {\n" .
        "        { 0, NULL },\n";
    foreach my $section ( @sections ) {
        $bulk .=
            "        { " . @{ $data->{ $section } } . ", __${prefix}_default_${section} },\n";
    }; # foreach $type
    $bulk .=
        "        { 0, NULL }\n" .
        "    };\n" .
        "\n";

    $bulk .=
        "struct kmp_i18n_table {\n" .
        "    int                   size;\n" .
        "    kmp_i18n_section_t *  sect;\n" .
        "}; // struct kmp_i18n_table\n" .
        "typedef struct kmp_i18n_table  kmp_i18n_table_t;\n" .
        "\n" .
        "static kmp_i18n_table_t __kmp_i18n_default_table =\n" .
        "    {\n" .
        "        " . @sections . ",\n" .
        "        __kmp_i18n_sections\n" .
        "    };\n" .
        "\n" .
        "// end of file //\n";

    write_file( $file, \$bulk );

}; # sub generate_default


sub generate_message_unix($$) {

    my ( $data, $file ) = @_;
    my $bulk     = "";

    $bulk .=
        _generate_comment( $data, "\$", "\$" ) .
        "\n" .
        "\$quote \"\n\n";

    foreach my $section ( @sections ) {
        $bulk .=
            "\$ " . ( "-" x 78 ) . "\n\$ $section\n\$ " . ( "-" x 78 ) . "\n\n" .
            "\$set $sections->{ $section }->{ set }\n" .
            "\n";
        my $n = 0;
        foreach my $item ( @{ $data->{ $section } } ) {
            my ( undef, $msg ) = @$item;
            ++ $n;
            $bulk .= "$n \"" . msg2src( $msg ) . "\"\n";
        }; # foreach
        $bulk .= "\n";
    }; # foreach $type

    $bulk .=
        "\n" .
        "\$ end of file \$\n";

    write_file( $file, \$bulk, -layer => ":utf8" );

}; # sub generate_message_linux


sub generate_message_windows($$) {

    my ( $data, $file ) = @_;
    my $bulk = "";
    my $language = $data->{ "%meta" }->{ Language };
    my $langid   = $data->{ "%meta" }->{ LangId };

    $bulk .=
        _generate_comment( $data, ";", ";" ) .
        "\n" .
        "LanguageNames = ($language=$langid:msg_$langid)\n" .
        "\n";

    $bulk .=
        "FacilityNames=(\n";
    foreach my $section ( @sections ) {
        my $props = $sections->{ $section };    # Section properties.
        $bulk .=
            " $props->{ short }=" . $props->{ set } ."\n";
    }; # foreach $section
    $bulk .=
        ")\n\n";

    foreach my $section ( @sections ) {
        my $short = $sections->{ $section }->{ short };
        my $n = 0;
        foreach my $item ( @{ $data->{ $section } } ) {
            my ( undef, $msg ) = @$item;
            ++ $n;
            $bulk .=
                "MessageId=$n\n" .
                "Facility=$short\n" .
                "Language=$language\n" .
                msg2mc( $msg ) . "\n.\n\n";
        }; # foreach $item
    }; # foreach $section

    $bulk .=
        "\n" .
        "; end of file ;\n";

    $bulk = encode( "UTF-16LE", $bulk ); # Convert text to UTF-16LE used in Windows* OS.
    write_file( $file, \$bulk, -binary => 1 );

}; # sub generate_message_windows


#
# Parse command line.
#

my $input_file;
my $enum_file;
my $signature_file;
my $default_file;
my $message_file;
my $id;
my $prefix = "";
get_options(
    "os=s"             => \$target_os,
    "enum-file=s"      => \$enum_file,
    "signature-file=s" => \$signature_file,
    "default-file=s"   => \$default_file,
    "message-file=s"   => \$message_file,
    "id|lang-id"       => \$id,
    "prefix=s"	       => \$prefix,
);
if ( @ARGV == 0 ) {
    cmdline_error( "No source file specified -- nothing to do" );
}; # if
if ( @ARGV > 1 ) {
    cmdline_error( "Too many source files specified" );
}; # if
$input_file = $ARGV[ 0 ];


my $generate_message;
if ( $target_os =~ m{\A(?:lin|mac)\z} ) {
    $generate_message = \&generate_message_unix;
} elsif ( $target_os eq "win" ) {
    $generate_message = \&generate_message_windows;
} else {
    runtime_error( "OS \"$target_os\" is not supported" );
}; # if


#
# Do the work.
#

my $data = parse_source( $input_file );
if ( defined( $id ) ) {
    print( $data->{ "%meta" }->{ LangId }, "\n" );
}; # if
if ( defined( $enum_file ) ) {
    generate_enum( $data, $enum_file, $prefix );
}; # if
if ( defined( $signature_file ) ) {
    generate_signature( $data, $signature_file );
}; # if
if ( defined( $default_file ) ) {
    generate_default( $data, $default_file, $prefix );
}; # if
if ( defined( $message_file ) ) {
    $generate_message->( $data, $message_file );
}; # if

exit( 0 );

__END__

=pod

=head1 NAME

B<message-converter.pl> -- Convert message catalog source file into another text forms.

=head1 SYNOPSIS

B<message-converter.pl> I<option>... <file>

=head1 OPTIONS

=over

=item B<--enum-file=>I<file>

Generate enum file named I<file>.

=item B<--default-file=>I<file>

Generate default messages file named I<file>.

=item B<--lang-id>

Print language identifier of the message catalog source file.

=item B<--message-file=>I<file>

Generate message file.

=item B<--signature-file=>I<file>

Generate signature file.

Signatures are used for checking compatibility. For example, to check a primary
catalog and its translation to another language, signatures of both catalogs should be generated
and compared. If signatures are identical, catalogs are compatible.

=item B<--prefix=>I<prefix>

Prefix to be used for all C identifiers (type and variable names) in enum and default messages
files.

=item B<--os=>I<str>

Specify OS name the message formats to be converted for. If not specified explicitly, value of
LIBOMP_OS environment variable is used. If LIBOMP_OS is not defined, host OS is detected.

Depending on OS, B<message-converter.pl> converts message formats to GNU style or MS style.

=item Standard Options

=over

=item B<--doc>

=item B<--manual>

Print full documentation and exit.

=item B<--help>

Print short help message and exit.

=item B<--version>

Print version string and exit.

=back

=back

=head1 ARGUMENTS

=over

=item I<file>

A name of input file.

=back

=head1 DESCRIPTION

=head2 Message Catalog File Format

It is plain text file in UTF-8 encoding. Empty lines and lines beginning with sharp sign (C<#>) are
ignored. EBNF syntax of content:

    catalog    = { section };
    section    = header body;
    header     = "-*- " section-id " -*-" "\n";
    body       = { message };
    message    = message-id string "\n" { string "\n" };
    section-id = identifier;
    message-id = "OBSOLETE" | identifier;
    identifier = letter { letter | digit | "_" };
    string     = """ { character } """;

Identifier starts with letter, with following letters, digits, and underscores. Identifiers are
case-sensitive. Setion identifiers are fixed: C<META>, C<STRINGS>, C<FORMATS>, C<MESSAGES> and
C<HINTS>. Message identifiers must be unique within section. Special C<OBSOLETE> pseudo-identifier
may be used many times.

String is a C string literal which must not cross line boundaries.
Long messages may occupy multiple lines, a string per line.

Message may include printf-like GNU-style placeholders for arguments: C<%I<n>$I<t>>,
where I<n> is argument number (C<1>, C<2>, ...),
I<t> -- argument type, C<s> (string) or C<d> (32-bit integer).

See also comments in F<i18n/en_US.txt>.

=head2 Output Files

This script can generate 3 different text files from single source:

=over

=item Enum file.

Enum file is a C include file, containing definitions of message identifiers, e. g.:

    enum kmp_i18n_id {

        // Set #1, meta.
        kmp_i18n_prp_first = 65536,
        kmp_i18n_prp_Language,
        kmp_i18n_prp_Country,
        kmp_i18n_prp_LangId,
        kmp_i18n_prp_Version,
        kmp_i18n_prp_Revision,
        kmp_i18n_prp_last,

        // Set #2, strings.
        kmp_i18n_str_first = 131072,
        kmp_i18n_str_Error,
        kmp_i18n_str_UnknownFile,
        kmp_i18n_str_NotANumber,
        ...

        // Set #3, formats.
        ...

        kmp_i18n_xxx_lastest

    }; // enum kmp_i18n_id

    typedef enum kmp_i18n_id  kmp_i18n_id_t;

=item Default messages file.

Default messages file is a C include file containing default messages to be embedded into
application (and used if external message catalog does not exist or could not be open):

    static char const *
    __kmp_i18n_default_meta[] =
        {
            NULL,
            "English",
            "USA",
            "1033",
            "2",
            "20090806",
            NULL
        };

    static char const *
    __kmp_i18n_default_strings[] =
        {
            "Error",
            "(unknown file)",
            "not a number",
            ...
            NULL
        };

    ...

=item Message file.

Message file is an input for message compiler, F<gencat> on Linux* OS and OS X*, or F<mc.exe> on
Windows* OS.

Here is the example of Linux* OS message file:

    $quote "
    1 "Japanese"
    2 "Japan"
    3 "1041"
    4 "2"
    5 "Based on English message catalog revision 20090806"
    ...

Example of Windows* OS message file:

    LanguageNames = (Japanese=10041:msg_1041)

    FacilityNames = (
     prp=1
     str=2
     fmt=3
     ...
    )

    MessageId=1
    Facility=prp
    Language=Japanese
    Japanese
    .

    ...

=item Signature.

Signature is a processed source file: comments stripped, strings deleted, but placeholders kept and
sorted.

    -*- FORMATS-*-

    Info                                     %1$d %2$s
    Warning                                  %1$d %2$s
    Fatal                                    %1$d %2$s
    SysErr                                   %1$d %2$s
    Hint                                     %1$- %2$s
    Pragma                                   %1$s %2$s %3$s %4$s

The purpose of signatures -- compare two message source files for compatibility. If signatures of
two message sources are the same, binary message catalogs will be compatible.

=back

=head1 EXAMPLES

Generate include file containing message identifiers:

    $ message-converter.pl --enum-file=kmp_i18n_id.inc en_US.txt

Generate include file contating default messages:

    $ message-converter.pl --default-file=kmp_i18n_default.inc en_US.txt

Generate input file for message compiler, Linux* OS example:

    $ message-converter.pl --message-file=ru_RU.UTF-8.msg ru_RU.txt

Generate input file for message compiler, Windows* OS example:

    > message-converter.pl --message-file=ru_RU.UTF-8.mc ru_RU.txt

=cut

# end of file #