1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  *******************************************************************************
5  *   Copyright (C) 2003-2014, International Business Machines
6  *   Corporation and others.  All Rights Reserved.
7  *******************************************************************************
8  *
9  * File prscmnts.cpp
10  *
11  * Modification History:
12  *
13  *   Date          Name        Description
14  *   08/22/2003    ram         Creation.
15  *******************************************************************************
16  */
17 
18 // Safer use of UnicodeString.
19 #ifndef UNISTR_FROM_CHAR_EXPLICIT
20 #   define UNISTR_FROM_CHAR_EXPLICIT explicit
21 #endif
22 
23 // Less important, but still a good idea.
24 #ifndef UNISTR_FROM_STRING_EXPLICIT
25 #   define UNISTR_FROM_STRING_EXPLICIT explicit
26 #endif
27 
#include "unicode/localpointer.h"
#include "unicode/regex.h"
#include "unicode/unistr.h"
#include "unicode/parseerr.h"
#include "prscmnts.h"
#include <stdio.h>
#include <stdlib.h>
34 
35 U_NAMESPACE_USE
36 
37 #if UCONFIG_NO_REGULAR_EXPRESSIONS==0 /* donot compile when RegularExpressions not available */
38 
39 #define MAX_SPLIT_STRINGS 20
40 
41 const char *patternStrings[UPC_LIMIT]={
42     "^translate\\s*(.*)",
43     "^note\\s*(.*)"
44 };
45 
46 U_CFUNC int32_t
removeText(UChar * source,int32_t srcLen,UnicodeString patString,uint32_t options,UnicodeString replaceText,UErrorCode * status)47 removeText(UChar *source, int32_t srcLen,
48            UnicodeString patString,uint32_t options,
49            UnicodeString replaceText, UErrorCode *status){
50 
51     if(status == NULL || U_FAILURE(*status)){
52         return 0;
53     }
54 
55     UnicodeString src(source, srcLen);
56 
57     RegexMatcher    myMatcher(patString, src, options, *status);
58     if(U_FAILURE(*status)){
59         return 0;
60     }
61     UnicodeString dest;
62 
63 
64     dest = myMatcher.replaceAll(replaceText,*status);
65 
66 
67     return dest.extract(source, srcLen, *status);
68 
69 }
70 U_CFUNC int32_t
trim(UChar * src,int32_t srcLen,UErrorCode * status)71 trim(UChar *src, int32_t srcLen, UErrorCode *status){
72      srcLen = removeText(src, srcLen, UnicodeString("^[ \\r\\n]+ "), 0, UnicodeString(), status); // remove leading new lines
73      srcLen = removeText(src, srcLen, UnicodeString("^\\s+"), 0, UnicodeString(), status); // remove leading spaces
74      srcLen = removeText(src, srcLen, UnicodeString("\\s+$"), 0, UnicodeString(), status); // remvoe trailing spcaes
75      return srcLen;
76 }
77 
78 U_CFUNC int32_t
removeCmtText(UChar * source,int32_t srcLen,UErrorCode * status)79 removeCmtText(UChar* source, int32_t srcLen, UErrorCode* status){
80     srcLen = trim(source, srcLen, status);
81     UnicodeString patString("^\\s*?\\*\\s*?");  // remove pattern like " * " at the begining of the line
82     srcLen = removeText(source, srcLen, patString, UREGEX_MULTILINE, UnicodeString(), status);
83     return removeText(source, srcLen, UnicodeString("[ \\r\\n]+"), 0, UnicodeString(" "), status);// remove new lines;
84 }
85 
86 U_CFUNC int32_t
getText(const UChar * source,int32_t srcLen,UChar ** dest,int32_t destCapacity,UnicodeString patternString,UErrorCode * status)87 getText(const UChar* source, int32_t srcLen,
88         UChar** dest, int32_t destCapacity,
89         UnicodeString patternString,
90         UErrorCode* status){
91 
92     if(status == NULL || U_FAILURE(*status)){
93         return 0;
94     }
95 
96     UnicodeString     stringArray[MAX_SPLIT_STRINGS];
97     RegexPattern      *pattern = RegexPattern::compile(UnicodeString("@"), 0, *status);
98     UnicodeString src (source,srcLen);
99 
100     if (U_FAILURE(*status)) {
101         return 0;
102     }
103     pattern->split(src, stringArray, MAX_SPLIT_STRINGS, *status);
104 
105     RegexMatcher matcher(patternString, UREGEX_DOTALL, *status);
106     if (U_FAILURE(*status)) {
107         return 0;
108     }
109     for(int32_t i=0; i<MAX_SPLIT_STRINGS; i++){
110         matcher.reset(stringArray[i]);
111         if(matcher.lookingAt(*status)){
112             UnicodeString out = matcher.group(1, *status);
113 
114             return out.extract(*dest, destCapacity,*status);
115         }
116     }
117     return 0;
118 }
119 
120 
121 #define AT_SIGN  0x0040
122 
123 U_CFUNC int32_t
getDescription(const UChar * source,int32_t srcLen,UChar ** dest,int32_t destCapacity,UErrorCode * status)124 getDescription( const UChar* source, int32_t srcLen,
125                 UChar** dest, int32_t destCapacity,
126                 UErrorCode* status){
127     if(status == NULL || U_FAILURE(*status)){
128         return 0;
129     }
130 
131     UnicodeString     stringArray[MAX_SPLIT_STRINGS];
132     RegexPattern      *pattern = RegexPattern::compile(UnicodeString("@"), UREGEX_MULTILINE, *status);
133     UnicodeString src(source, srcLen);
134 
135     if (U_FAILURE(*status)) {
136         return 0;
137     }
138     pattern->split(src, stringArray,MAX_SPLIT_STRINGS , *status);
139 
140     if(stringArray[0].indexOf((UChar)AT_SIGN)==-1){
141         int32_t destLen =  stringArray[0].extract(*dest, destCapacity, *status);
142         return trim(*dest, destLen, status);
143     }
144     return 0;
145 }
146 
147 U_CFUNC int32_t
getCount(const UChar * source,int32_t srcLen,UParseCommentsOption option,UErrorCode * status)148 getCount(const UChar* source, int32_t srcLen,
149          UParseCommentsOption option, UErrorCode *status){
150 
151     if(status == NULL || U_FAILURE(*status)){
152         return 0;
153     }
154 
155     UnicodeString     stringArray[MAX_SPLIT_STRINGS];
156     RegexPattern      *pattern = RegexPattern::compile(UnicodeString("@"), UREGEX_MULTILINE, *status);
157     UnicodeString src (source, srcLen);
158 
159 
160     if (U_FAILURE(*status)) {
161         return 0;
162     }
163     int32_t retLen = pattern->split(src, stringArray, MAX_SPLIT_STRINGS, *status);
164 
165     UnicodeString patternString(patternStrings[option]);
166     RegexMatcher matcher(patternString, UREGEX_DOTALL, *status);
167     if (U_FAILURE(*status)) {
168         return 0;
169     }
170     int32_t count = 0;
171     for(int32_t i=0; i<retLen; i++){
172         matcher.reset(stringArray[i]);
173         if(matcher.lookingAt(*status)){
174             count++;
175         }
176     }
177     if(option == UPC_TRANSLATE && count > 1){
178         fprintf(stderr, "Multiple @translate tags cannot be supported.\n");
179         exit(U_UNSUPPORTED_ERROR);
180     }
181     return count;
182 }
183 
184 U_CFUNC int32_t
getAt(const UChar * source,int32_t srcLen,UChar ** dest,int32_t destCapacity,int32_t index,UParseCommentsOption option,UErrorCode * status)185 getAt(const UChar* source, int32_t srcLen,
186         UChar** dest, int32_t destCapacity,
187         int32_t index,
188         UParseCommentsOption option,
189         UErrorCode* status){
190 
191     if(status == NULL || U_FAILURE(*status)){
192         return 0;
193     }
194 
195     UnicodeString     stringArray[MAX_SPLIT_STRINGS];
196     RegexPattern      *pattern = RegexPattern::compile(UnicodeString("@"), UREGEX_MULTILINE, *status);
197     UnicodeString src (source, srcLen);
198 
199 
200     if (U_FAILURE(*status)) {
201         return 0;
202     }
203     int32_t retLen = pattern->split(src, stringArray, MAX_SPLIT_STRINGS, *status);
204 
205     UnicodeString patternString(patternStrings[option]);
206     RegexMatcher matcher(patternString, UREGEX_DOTALL, *status);
207     if (U_FAILURE(*status)) {
208         return 0;
209     }
210     int32_t count = 0;
211     for(int32_t i=0; i<retLen; i++){
212         matcher.reset(stringArray[i]);
213         if(matcher.lookingAt(*status)){
214             if(count == index){
215                 UnicodeString out = matcher.group(1, *status);
216                 return out.extract(*dest, destCapacity,*status);
217             }
218             count++;
219 
220         }
221     }
222     return 0;
223 
224 }
225 
226 U_CFUNC int32_t
getTranslate(const UChar * source,int32_t srcLen,UChar ** dest,int32_t destCapacity,UErrorCode * status)227 getTranslate( const UChar* source, int32_t srcLen,
228               UChar** dest, int32_t destCapacity,
229               UErrorCode* status){
230     UnicodeString     notePatternString("^translate\\s*?(.*)");
231 
232     int32_t destLen = getText(source, srcLen, dest, destCapacity, notePatternString, status);
233     return trim(*dest, destLen, status);
234 }
235 
236 U_CFUNC int32_t
getNote(const UChar * source,int32_t srcLen,UChar ** dest,int32_t destCapacity,UErrorCode * status)237 getNote(const UChar* source, int32_t srcLen,
238         UChar** dest, int32_t destCapacity,
239         UErrorCode* status){
240 
241     UnicodeString     notePatternString("^note\\s*?(.*)");
242     int32_t destLen =  getText(source, srcLen, dest, destCapacity, notePatternString, status);
243     return trim(*dest, destLen, status);
244 
245 }
246 
247 #endif /* UCONFIG_NO_REGULAR_EXPRESSIONS */
248 
249