1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 // Feature modeling language (fml) parser.
18 //
19 // BNF grammar for fml:
20 //
21 // <feature model> ::= { <feature extractor> }
22 //
23 // <feature extractor> ::= <extractor spec> |
24 //                         <extractor spec> '.' <feature extractor> |
25 //                         <extractor spec> '{' { <feature extractor> } '}'
26 //
27 // <extractor spec> ::= <extractor type>
28 //                      [ '(' <parameter list> ')' ]
29 //                      [ ':' <extractor name> ]
30 //
// <parameter list> ::= ( <parameter> | <argument> ) { ',' <parameter> }
32 //
33 // <parameter> ::= <parameter name> '=' <parameter value>
34 //
35 // <extractor type> ::= NAME
36 // <extractor name> ::= NAME | STRING
37 // <argument> ::= NUMBER
38 // <parameter name> ::= NAME
39 // <parameter value> ::= NUMBER | STRING | NAME
40 
41 #ifndef LIBTEXTCLASSIFIER_COMMON_FML_PARSER_H_
42 #define LIBTEXTCLASSIFIER_COMMON_FML_PARSER_H_
43 
44 #include <string>
45 #include <vector>
46 
47 #include "common/feature-descriptors.h"
48 #include "util/base/logging.h"
49 
50 namespace libtextclassifier {
51 namespace nlp_core {
52 
53 class FMLParser {
54  public:
55   // Parses fml specification into feature extractor descriptor.
56   // Returns true on success, false on error (e.g., syntax errors).
57   bool Parse(const std::string &source, FeatureExtractorDescriptor *result);
58 
59  private:
60   // Initializes the parser with the source text.
61   // Returns true on success, false on syntax error.
62   bool Initialize(const std::string &source);
63 
64   // Outputs an error message, with context info, and sets error_ to true.
65   void ReportError(const std::string &error_message);
66 
67   // Moves to the next input character.
68   void Next();
69 
70   // Moves to the next input item.  Sets item_text_ and item_type_ accordingly.
71   // Returns true on success, false on syntax error.
72   bool NextItem();
73 
74   // Parses a feature descriptor.
75   // Returns true on success, false on syntax error.
76   bool ParseFeature(FeatureFunctionDescriptor *result);
77 
78   // Parses a parameter specification.
79   // Returns true on success, false on syntax error.
80   bool ParseParameter(FeatureFunctionDescriptor *result);
81 
82   // Returns true if end of source input has been reached.
eos()83   bool eos() const { return current_ >= source_.end(); }
84 
85   // Returns current character.  Other methods should access the current
86   // character through this method (instead of using *current_ directly): this
87   // method performs extra safety checks.
88   //
89   // In case of an unsafe access, returns '\0'.
CurrentChar()90   char CurrentChar() const {
91     if ((current_ >= source_.begin()) && (current_ < source_.end())) {
92       return *current_;
93     } else {
94       TC_LOG(ERROR) << "Unsafe char read";
95       return '\0';
96     }
97   }
98 
99   // Item types.
100   enum ItemTypes {
101     END = 0,
102     NAME = -1,
103     NUMBER = -2,
104     STRING = -3,
105   };
106 
107   // Source text.
108   std::string source_;
109 
110   // Current input position.
111   std::string::iterator current_;
112 
113   // Line number for current input position.
114   int line_number_;
115 
116   // Start position for current item.
117   std::string::iterator item_start_;
118 
119   // Start position for current line.
120   std::string::iterator line_start_;
121 
122   // Line number for current item.
123   int item_line_number_;
124 
125   // Item type for current item. If this is positive it is interpreted as a
126   // character. If it is negative it is interpreted as an item type.
127   int item_type_;
128 
129   // Text for current item.
130   std::string item_text_;
131 };
132 
133 // Converts a FeatureFunctionDescriptor into an FML spec (reverse of parsing).
134 void ToFML(const FeatureFunctionDescriptor &function, std::string *output);
135 
136 // Like ToFML, but doesn't go into the nested functions.  Instead, it generates
137 // a string that starts with the name of the feature extraction function and
138 // next, in-between parentheses, the parameters, separated by comma.
139 // Intuitively, the constructed string is the prefix of ToFML, before the "{"
140 // that starts the nested features.
141 void ToFMLFunction(const FeatureFunctionDescriptor &function,
142                    std::string *output);
143 
144 }  // namespace nlp_core
145 }  // namespace libtextclassifier
146 
147 #endif  // LIBTEXTCLASSIFIER_COMMON_FML_PARSER_H_
148