1 /**************************************************************************
2 *
3 *   Copyright (C) 2002-2010, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 *
6 ***************************************************************************
7 */
8 
9 //
10 //   ugrep  - an ICU sample program illustrating the use of ICU Regular Expressions.
11 //
12 //            The use of the ICU Regex API all occurs within the main()
//            function.  The rest of the code deals with opening files,
14 //            encoding conversions, printing results, etc.
15 //
16 //            This is not a full-featured grep program.  The command line options
17 //            have been kept to a minimum to avoid complicating the sample code.
18 //
19 
20 
21 
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 
26 #include "unicode/utypes.h"
27 #include "unicode/ustring.h"
28 #include "unicode/regex.h"
29 #include "unicode/ucnv.h"
30 #include "unicode/uclean.h"
31 
32 
33 //
34 //  The following variables contain paramters that may be set from the command line.
35 //
36 const char *pattern = NULL;     // The regular expression
37 int        firstFileNum;        //  argv index of the first file name
38 UBool      displayFileName = FALSE;
39 UBool      displayLineNum  = FALSE;
40 
41 
42 //
43 //  Info regarding the file currently being processed
44 //
45 const char *fileName;
46 int         fileLen;              // Length, in UTF-16 Code Units.
47 
48 UChar      *ucharBuf = 0;         // Buffer, holds converted file.  (Simple minded program, always reads
49                                   //   the whole file at once.
50 
51 char       *charBuf = 0;          // Buffer, for original, unconverted file data.
52 
53 
54 //
55 //  Info regarding the line currently being processed
56 //
57 int      lineStart;     // Index of first char of the current line in the file buffer
58 int      lineEnd;       // Index of char following the new line sequence for the current line
59 int      lineNum;
60 
61 //
62 //  Converter, used on output to convert Unicode data back to char *
63 //             so that it will display in non-Unicode terminal windows.
64 //
65 UConverter  *outConverter = 0;
66 
67 //
68 //  Function forward declarations
69 //
70 void processOptions(int argc, const char **argv);
71 void nextLine(int start);
72 void printMatch();
73 void printUsage();
74 void readFile(const char *name);
75 
76 
77 
78 //------------------------------------------------------------------------------------------
79 //
80 //   main          for ugrep
81 //
82 //           Structurally, all use of the ICU Regular Expression API is in main(),
83 //           and all of the supporting stuff necessary to make a running program, but
84 //           not directly related to regular expressions, is factored out into these other
85 //           functions.
86 //
87 //------------------------------------------------------------------------------------------
main(int argc,const char ** argv)88 int main(int argc, const char** argv) {
89     UBool     matchFound = FALSE;
90 
91     //
92     //  Process the commmand line options.
93     //
94     processOptions(argc, argv);
95 
96     //
97     // Create a RegexPattern object from the user supplied pattern string.
98     //
99     UErrorCode status = U_ZERO_ERROR;   // All ICU operations report success or failure
100                                         //   in a status variable.
101 
102     UParseError    parseErr;            // In the event of a syntax error in the regex pattern,
103                                         //   this struct will contain the position of the
104                                         //   error.
105 
106     RegexPattern  *rePat = RegexPattern::compile(pattern, parseErr, status);
107                                         // Note that C++ is doing an automatic conversion
108                                         //  of the (char *) pattern to a temporary
109                                         //  UnicodeString object.
110     if (U_FAILURE(status)) {
111         fprintf(stderr, "ugrep:  error in pattern: \"%s\" at position %d\n",
112             u_errorName(status), parseErr.offset);
113         exit(-1);
114     }
115 
116     //
117     // Create a RegexMatcher from the newly created pattern.
118     //
119     UnicodeString empty;
120     RegexMatcher *matcher = rePat->matcher(empty, status);
121     if (U_FAILURE(status)) {
122         fprintf(stderr, "ugrep:  error in creating RegexMatcher: \"%s\"\n",
123             u_errorName(status));
124         exit(-1);
125     }
126 
127     //
128     // Loop, processing each of the input files.
129     //
130     for (int fileNum=firstFileNum; fileNum < argc; fileNum++) {
131         readFile(argv[fileNum]);
132 
133         //
134         //  Loop through the lines of a file, trying to match the regex pattern on each.
135         //
136         for (nextLine(0); lineStart<fileLen; nextLine(lineEnd)) {
137             UnicodeString s(FALSE, ucharBuf+lineStart, lineEnd-lineStart);
138             matcher->reset(s);
139             if (matcher->find()) {
140                 matchFound = TRUE;
141                 printMatch();
142             }
143         }
144     }
145 
146     //
147     //  Clean up
148     //
149     delete matcher;
150     delete rePat;
151     free(ucharBuf);
152     free(charBuf);
153     ucnv_close(outConverter);
154 
155     u_cleanup();       // shut down ICU, release any cached data it owns.
156 
157     return matchFound? 0: 1;
158 }
159 
160 
161 
162 //------------------------------------------------------------------------------------------
163 //
//   processOptions     Run through the command line options, and set
//                      the global variables accordingly.
//
//                      exit without returning if an error occurred and
//                      ugrep should not proceed further.
169 //
170 //------------------------------------------------------------------------------------------
processOptions(int argc,const char ** argv)171 void processOptions(int argc, const char **argv) {
172     int            optInd;
173     UBool          doUsage   = FALSE;
174     UBool          doVersion = FALSE;
175     const char    *arg;
176 
177 
178     for(optInd = 1; optInd < argc; ++optInd) {
179         arg = argv[optInd];
180 
181         /* version info */
182         if(strcmp(arg, "-V") == 0 || strcmp(arg, "--version") == 0) {
183             doVersion = TRUE;
184         }
185         /* usage info */
186         else if(strcmp(arg, "--help") == 0) {
187             doUsage = TRUE;
188         }
189         else if(strcmp(arg, "-n") == 0 || strcmp(arg, "--line-number") == 0) {
190             displayLineNum = TRUE;
191         }
192         /* POSIX.1 says all arguments after -- are not options */
193         else if(strcmp(arg, "--") == 0) {
194             /* skip the -- */
195             ++optInd;
196             break;
197         }
198         /* unrecognized option */
199         else if(strncmp(arg, "-", strlen("-")) == 0) {
200             printf("ugrep: invalid option -- %s\n", arg+1);
201             doUsage = TRUE;
202         }
203         /* done with options */
204         else {
205             break;
206         }
207     }
208 
209     if (doUsage) {
210         printUsage();
211         exit(0);
212     }
213 
214     if (doVersion) {
215         printf("ugrep version 0.01\n");
216         if (optInd == argc) {
217             exit(0);
218         }
219     }
220 
221     int  remainingArgs = argc-optInd;     // pattern file ...
222     if (remainingArgs < 2) {
223         fprintf(stderr, "ugrep:  files or pattern are missing.\n");
224         printUsage();
225         exit(1);
226     }
227 
228     if (remainingArgs > 2) {
229         // More than one file to be processed.   Display file names with match output.
230         displayFileName = TRUE;
231     }
232 
233     pattern      = argv[optInd];
234     firstFileNum = optInd+1;
235 }
236 
237 //------------------------------------------------------------------------------------------
238 //
239 //   printUsage
240 //
241 //------------------------------------------------------------------------------------------
void printUsage() {
    // Print the command line summary to stdout and return.
    //
    // This function deliberately does NOT call exit(): each caller follows it
    // with its own exit() so that the process status reflects why the usage
    // text was shown (0 for an explicit --help, non-zero for a malformed
    // command line).  Exiting here would make every usage error report success.
    printf("ugrep [options] pattern file...\n"
        "     -V or --version     display version information\n"
        "     --help              display this help and exit\n"
        "     --                  stop further option processing\n"
        "-n,  --line-number       Prefix each line of output with the line number within its input file.\n"
        );
}
251 
252 //------------------------------------------------------------------------------------------
253 //
254 //    readFile          Read a file into memory, and convert it to Unicode.
255 //
256 //                      Since this is just a demo program, take the simple minded approach
257 //                      of always reading the whole file at once.  No intelligent buffering
258 //                      is done.
259 //
260 //------------------------------------------------------------------------------------------
readFile(const char * name)261 void readFile(const char *name) {
262 
263     //
264     //  Initialize global file variables
265     //
266     fileName = name;
267     fileLen  = 0;      // zero length prevents processing in case of errors.
268 
269 
270     //
271     //  Open the file and determine its size.
272     //
273     FILE *file = fopen(name, "rb");
274     if (file == 0 ) {
275         fprintf(stderr, "ugrep: Could not open file \"%s\"\n", fileName);
276         return;
277     }
278     fseek(file, 0, SEEK_END);
279     int rawFileLen = ftell(file);
280     fseek(file, 0, SEEK_SET);
281 
282 
283     //
284     //   Read in the file
285     //
286     charBuf    = (char *)realloc(charBuf, rawFileLen+1);   // Need error checking...
287     int t = fread(charBuf, 1, rawFileLen, file);
288     if (t != rawFileLen)  {
289         fprintf(stderr, "Error reading file \"%s\"\n", fileName);
290         fclose(file);
291         return;
292     }
293     charBuf[rawFileLen]=0;
294     fclose(file);
295 
296     //
297     // Look for a Unicode Signature (BOM) in the data
298     //
299     int32_t        signatureLength;
300     const char *   charDataStart = charBuf;
301     UErrorCode     status        = U_ZERO_ERROR;
302     const char*    encoding      = ucnv_detectUnicodeSignature(
303                            charDataStart, rawFileLen, &signatureLength, &status);
304     if (U_FAILURE(status)) {
305         fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n",
306             u_errorName(status));
307         return;
308     }
309     if(encoding!=NULL ){
310         charDataStart  += signatureLength;
311         rawFileLen     -= signatureLength;
312     }
313 
314     //
315     // Open a converter to take the file to UTF-16
316     //
317     UConverter* conv;
318     conv = ucnv_open(encoding, &status);
319     if (U_FAILURE(status)) {
320         fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status));
321         return;
322     }
323 
324     //
325     // Convert the file data to UChar.
326     //  Preflight first to determine required buffer size.
327     //
328     uint32_t destCap = ucnv_toUChars(conv,
329                        NULL,           //  dest,
330                        0,              //  destCapacity,
331                        charDataStart,
332                        rawFileLen,
333                        &status);
334     if (status != U_BUFFER_OVERFLOW_ERROR) {
335         fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
336         return;
337     };
338 
339     status = U_ZERO_ERROR;
340     ucharBuf = (UChar *)realloc(ucharBuf, (destCap+1) * sizeof(UChar));
341     ucnv_toUChars(conv,
342         ucharBuf,           //  dest,
343         destCap+1,
344         charDataStart,
345         rawFileLen,
346         &status);
347     if (U_FAILURE(status)) {
348         fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
349         return;
350     };
351     ucnv_close(conv);
352 
353     //
354     //  Successful conversion.  Set the global size variables so that
355     //     the rest of the processing will proceed for this file.
356     //
357     fileLen = destCap;
358 }
359 
360 
361 
362 
363 
364 //------------------------------------------------------------------------------------------
365 //
366 //   nextLine           Advance the line index variables, starting at the
367 //                      specified position in the input file buffer, by
//                      scanning forward until the next end-of-line.
369 //
370 //                      Need to take into account all of the possible Unicode
371 //                      line ending sequences.
372 //
373 //------------------------------------------------------------------------------------------
nextLine(int startPos)374 void nextLine(int  startPos) {
375     if (startPos == 0) {
376         lineNum = 0;
377     } else {
378         lineNum++;
379     }
380     lineStart = lineEnd = startPos;
381 
382     for (;;) {
383         if (lineEnd >= fileLen) {
384             return;
385         }
386         UChar c = ucharBuf[lineEnd];
387         lineEnd++;
388         if (c == 0x0a   ||       // Line Feed
389             c == 0x0c   ||       // Form Feed
390             c == 0x0d   ||       // Carriage Return
391             c == 0x85   ||       // Next Line
392             c == 0x2028 ||       // Line Separator
393             c == 0x2029)         // Paragraph separator
394         {
395             break;
396         }
397     }
398 
399     // Check for CR/LF sequence, and advance over the LF if we're in the middle of one.
400     if (lineEnd < fileLen           &&
401         ucharBuf[lineEnd-1] == 0x0d &&
402         ucharBuf[lineEnd]   == 0x0a)
403     {
404         lineEnd++;
405     }
406 }
407 
408 
409 //------------------------------------------------------------------------------------------
410 //
411 //   printMatch         Called when a matching line has been located.
412 //                      Print out the line from the file with the match, after
413 //                         converting it back to the default code page.
414 //
415 //------------------------------------------------------------------------------------------
printMatch()416 void printMatch() {
417     char                buf[2000];
418     UErrorCode         status       = U_ZERO_ERROR;
419 
420     // If we haven't already created a converter for output, do it now.
421     if (outConverter == 0) {
422         outConverter = ucnv_open(NULL, &status);
423         if (U_FAILURE(status)) {
424             fprintf(stderr, "ugrep:  Error opening default converter: \"%s\"\n",
425                 u_errorName(status));
426             exit(-1);
427         }
428     };
429 
430     // Convert the line to be printed back to the default 8 bit code page.
431     //   If the line is too long for our buffer, just truncate it.
432     ucnv_fromUChars(outConverter,
433                     buf,                   // destination buffer for conversion
434                     sizeof(buf),           // capacity of destination buffer
435                     &ucharBuf[lineStart],   // Input to conversion
436                     lineEnd-lineStart,     // number of UChars to convert
437                     &status);
438     buf[sizeof(buf)-1] = 0;                // Add null for use in case of too long lines.
439                                            // The converter null-terminates its output unless
440                                            //   the buffer completely fills.
441 
442     if (displayFileName) {
443         printf("%s:", fileName);
444     }
445     if (displayLineNum) {
446         printf("%d:", lineNum);
447     }
448     printf("%s", buf);
449 }
450 
451