1 /*
2 *******************************************************************************
3 *   Copyright (C) 2011-2014, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 *   file name:  ppucd.cpp
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 *   created on: 2011dec11
12 *   created by: Markus W. Scherer
13 */
14 
15 #include "unicode/utypes.h"
16 #include "unicode/uchar.h"
17 #include "charstr.h"
18 #include "cstring.h"
19 #include "ppucd.h"
20 #include "uassert.h"
21 #include "uparse.h"
22 
23 #include <stdio.h>
24 #include <string.h>
25 
26 U_NAMESPACE_BEGIN
27 
~PropertyNames()28 PropertyNames::~PropertyNames() {}
29 
30 int32_t
getPropertyEnum(const char * name) const31 PropertyNames::getPropertyEnum(const char *name) const {
32     return u_getPropertyEnum(name);
33 }
34 
35 int32_t
getPropertyValueEnum(int32_t property,const char * name) const36 PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const {
37     return u_getPropertyValueEnum((UProperty)property, name);
38 }
39 
UniProps()40 UniProps::UniProps()
41         : start(U_SENTINEL), end(U_SENTINEL),
42           bmg(U_SENTINEL), bpb(U_SENTINEL),
43           scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL),
44           digitValue(-1), numericValue(NULL),
45           name(NULL), nameAlias(NULL) {
46     memset(binProps, 0, sizeof(binProps));
47     memset(intProps, 0, sizeof(intProps));
48     memset(age, 0, 4);
49 }
50 
~UniProps()51 UniProps::~UniProps() {}
52 
53 const int32_t PreparsedUCD::kNumLineBuffers;
54 
PreparsedUCD(const char * filename,UErrorCode & errorCode)55 PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode)
56         : icuPnames(new PropertyNames()), pnames(icuPnames),
57           file(NULL),
58           defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0),
59           lineNumber(0),
60           lineType(NO_LINE),
61           fieldLimit(NULL), lineLimit(NULL) {
62     if(U_FAILURE(errorCode)) { return; }
63 
64     if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
65         filename=NULL;
66         file=stdin;
67     } else {
68         file=fopen(filename, "r");
69     }
70     if(file==NULL) {
71         perror("error opening preparsed UCD");
72         fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\"");
73         errorCode=U_FILE_ACCESS_ERROR;
74         return;
75     }
76 
77     memset(ucdVersion, 0, 4);
78     lines[0][0]=0;
79 }
80 
~PreparsedUCD()81 PreparsedUCD::~PreparsedUCD() {
82     if(file!=stdin) {
83         fclose(file);
84     }
85     delete icuPnames;
86 }
87 
// Line type keywords (first field of a line), in the same order as the LineType values.
// The two leading NULL slots belong to line types without a keyword;
// readLine() starts matching at index EMPTY_LINE+1.
static const char *const lineTypeStrings[]={
    NULL,
    NULL,
    "ucd",
    "property",
    "binary",
    "value",
    "defaults",
    "block",
    "cp",
    "algnamesrange"
};
101 
102 PreparsedUCD::LineType
readLine(UErrorCode & errorCode)103 PreparsedUCD::readLine(UErrorCode &errorCode) {
104     if(U_FAILURE(errorCode)) { return NO_LINE; }
105     // Select the next available line buffer.
106     while(!isLineBufferAvailable(lineIndex)) {
107         ++lineIndex;
108         if (lineIndex == kNumLineBuffers) {
109             lineIndex = 0;
110         }
111     }
112     char *line=lines[lineIndex];
113     *line=0;
114     lineLimit=fieldLimit=line;
115     lineType=NO_LINE;
116     char *result=fgets(line, sizeof(lines[0]), file);
117     if(result==NULL) {
118         if(ferror(file)) {
119             perror("error reading preparsed UCD");
120             fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber);
121             errorCode=U_FILE_ACCESS_ERROR;
122         }
123         return NO_LINE;
124     }
125     ++lineNumber;
126     if(*line=='#') {
127         fieldLimit=strchr(line, 0);
128         return lineType=EMPTY_LINE;
129     }
130     // Remove trailing /r/n.
131     char c;
132     char *limit=strchr(line, 0);
133     while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; }
134     // Remove trailing white space.
135     while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; }
136     *limit=0;
137     lineLimit=limit;
138     if(line==limit) {
139         fieldLimit=limit;
140         return lineType=EMPTY_LINE;
141     }
142     // Split by ';'.
143     char *semi=line;
144     while((semi=strchr(semi, ';'))!=NULL) { *semi++=0; }
145     fieldLimit=strchr(line, 0);
146     // Determine the line type.
147     int32_t type;
148     for(type=EMPTY_LINE+1;; ++type) {
149         if(type==LINE_TYPE_COUNT) {
150             fprintf(stderr,
151                     "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n",
152                     line, (long)lineNumber);
153             errorCode=U_PARSE_ERROR;
154             return NO_LINE;
155         }
156         if(0==strcmp(line, lineTypeStrings[type])) {
157             break;
158         }
159     }
160     lineType=(LineType)type;
161     if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) {
162         u_versionFromString(ucdVersion, fieldLimit+1);
163     }
164     return lineType;
165 }
166 
167 const char *
firstField()168 PreparsedUCD::firstField() {
169     char *field=lines[lineIndex];
170     fieldLimit=strchr(field, 0);
171     return field;
172 }
173 
174 const char *
nextField()175 PreparsedUCD::nextField() {
176     if(fieldLimit==lineLimit) { return NULL; }
177     char *field=fieldLimit+1;
178     fieldLimit=strchr(field, 0);
179     return field;
180 }
181 
182 const UniProps *
getProps(UnicodeSet & newValues,UErrorCode & errorCode)183 PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) {
184     if(U_FAILURE(errorCode)) { return NULL; }
185     newValues.clear();
186     if(!lineHasPropertyValues()) {
187         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
188         return NULL;
189     }
190     firstField();
191     const char *field=nextField();
192     if(field==NULL) {
193         // No range field after the type.
194         fprintf(stderr,
195                 "error in preparsed UCD: missing default/block/cp range field "
196                 "(no second field) on line %ld\n",
197                 (long)lineNumber);
198         errorCode=U_PARSE_ERROR;
199         return NULL;
200     }
201     UChar32 start, end;
202     if(!parseCodePointRange(field, start, end, errorCode)) { return NULL; }
203     UniProps *props;
204     switch(lineType) {
205     case DEFAULTS_LINE:
206         if(defaultLineIndex>=0) {
207             fprintf(stderr,
208                     "error in preparsed UCD: second line with default properties on line %ld\n",
209                     (long)lineNumber);
210             errorCode=U_PARSE_ERROR;
211             return NULL;
212         }
213         if(start!=0 || end!=0x10ffff) {
214             fprintf(stderr,
215                     "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
216                     field, (long)lineNumber);
217             errorCode=U_PARSE_ERROR;
218             return NULL;
219         }
220         props=&defaultProps;
221         defaultLineIndex=lineIndex;
222         break;
223     case BLOCK_LINE:
224         blockProps=defaultProps;  // Block inherits default properties.
225         props=&blockProps;
226         blockLineIndex=lineIndex;
227         break;
228     case CP_LINE:
229         if(blockProps.start<=start && end<=blockProps.end) {
230             // Code point range fully inside the last block inherits the block properties.
231             cpProps=blockProps;
232         } else if(start>blockProps.end || end<blockProps.start) {
233             // Code point range fully outside the last block inherits the default properties.
234             cpProps=defaultProps;
235         } else {
236             // Code point range partially overlapping with the last block is illegal.
237             fprintf(stderr,
238                     "error in preparsed UCD: cp range %s on line %ld only "
239                     "partially overlaps with block range %04lX..%04lX\n",
240                     field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end);
241             errorCode=U_PARSE_ERROR;
242             return NULL;
243         }
244         props=&cpProps;
245         break;
246     default:
247         // Will not occur because of the range check above.
248         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
249         return NULL;
250     }
251     props->start=start;
252     props->end=end;
253     while((field=nextField())!=NULL) {
254         if(!parseProperty(*props, field, newValues, errorCode)) { return NULL; }
255     }
256     return props;
257 }
258 
259 static const struct {
260     const char *name;
261     int32_t prop;
262 } ppucdProperties[]={
263     { "Name_Alias", PPUCD_NAME_ALIAS },
264     { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS },
265     { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING }
266 };
267 
268 // Returns TRUE for "ok to continue parsing fields".
269 UBool
parseProperty(UniProps & props,const char * field,UnicodeSet & newValues,UErrorCode & errorCode)270 PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
271                             UErrorCode &errorCode) {
272     CharString pBuffer;
273     const char *p=field;
274     const char *v=strchr(p, '=');
275     int binaryValue;
276     if(*p=='-') {
277         if(v!=NULL) {
278             fprintf(stderr,
279                     "error in preparsed UCD: mix of binary-property-no and "
280                     "enum-property syntax '%s' on line %ld\n",
281                     field, (long)lineNumber);
282             errorCode=U_PARSE_ERROR;
283             return FALSE;
284         }
285         binaryValue=0;
286         ++p;
287     } else if(v==NULL) {
288         binaryValue=1;
289     } else {
290         binaryValue=-1;
291         // Copy out the property name rather than modifying the field (writing a NUL).
292         pBuffer.append(p, (int32_t)(v-p), errorCode);
293         p=pBuffer.data();
294         ++v;
295     }
296     int32_t prop=pnames->getPropertyEnum(p);
297     if(prop<0) {
298         for(int32_t i=0;; ++i) {
299             if(i==UPRV_LENGTHOF(ppucdProperties)) {
300                 // Ignore unknown property names.
301                 return TRUE;
302             }
303             if(0==uprv_stricmp(p, ppucdProperties[i].name)) {
304                 prop=ppucdProperties[i].prop;
305                 U_ASSERT(prop>=0);
306                 break;
307             }
308         }
309     }
310     if(prop<UCHAR_BINARY_LIMIT) {
311         if(binaryValue>=0) {
312             props.binProps[prop]=(UBool)binaryValue;
313         } else {
314             // No binary value for a binary property.
315             fprintf(stderr,
316                     "error in preparsed UCD: enum-property syntax '%s' "
317                     "for binary property on line %ld\n",
318                     field, (long)lineNumber);
319             errorCode=U_PARSE_ERROR;
320         }
321     } else if(binaryValue>=0) {
322         // Binary value for a non-binary property.
323         fprintf(stderr,
324                 "error in preparsed UCD: binary-property syntax '%s' "
325                 "for non-binary property on line %ld\n",
326                 field, (long)lineNumber);
327         errorCode=U_PARSE_ERROR;
328     } else if (prop < UCHAR_INT_START) {
329         fprintf(stderr,
330                 "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n",
331                 prop, (long)lineNumber);
332         errorCode=U_PARSE_ERROR;
333     } else if(prop<UCHAR_INT_LIMIT) {
334         int32_t value=pnames->getPropertyValueEnum(prop, v);
335         if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) {
336             // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
337             char *end;
338             unsigned long ccc=uprv_strtoul(v, &end, 10);
339             if(v<end && *end==0 && ccc<=254) {
340                 value=(int32_t)ccc;
341             }
342         }
343         if(value==UCHAR_INVALID_CODE) {
344             fprintf(stderr,
345                     "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
346                     field, (long)lineNumber);
347             errorCode=U_PARSE_ERROR;
348         } else {
349             props.intProps[prop-UCHAR_INT_START]=value;
350         }
351     } else if(*v=='<') {
352         // Do not parse default values like <code point>, just set null values.
353         switch(prop) {
354         case UCHAR_BIDI_MIRRORING_GLYPH:
355             props.bmg=U_SENTINEL;
356             break;
357         case UCHAR_BIDI_PAIRED_BRACKET:
358             props.bpb=U_SENTINEL;
359             break;
360         case UCHAR_SIMPLE_CASE_FOLDING:
361             props.scf=U_SENTINEL;
362             break;
363         case UCHAR_SIMPLE_LOWERCASE_MAPPING:
364             props.slc=U_SENTINEL;
365             break;
366         case UCHAR_SIMPLE_TITLECASE_MAPPING:
367             props.stc=U_SENTINEL;
368             break;
369         case UCHAR_SIMPLE_UPPERCASE_MAPPING:
370             props.suc=U_SENTINEL;
371             break;
372         case UCHAR_CASE_FOLDING:
373             props.cf.remove();
374             break;
375         case UCHAR_LOWERCASE_MAPPING:
376             props.lc.remove();
377             break;
378         case UCHAR_TITLECASE_MAPPING:
379             props.tc.remove();
380             break;
381         case UCHAR_UPPERCASE_MAPPING:
382             props.uc.remove();
383             break;
384         case UCHAR_SCRIPT_EXTENSIONS:
385             props.scx.clear();
386             break;
387         default:
388             fprintf(stderr,
389                     "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
390                     field, (long)lineNumber);
391             errorCode=U_PARSE_ERROR;
392         }
393     } else {
394         char c;
395         switch(prop) {
396         case UCHAR_NUMERIC_VALUE:
397             props.numericValue=v;
398             c=*v;
399             if('0'<=c && c<='9' && v[1]==0) {
400                 props.digitValue=c-'0';
401             } else {
402                 props.digitValue=-1;
403             }
404             break;
405         case UCHAR_NAME:
406             props.name=v;
407             break;
408         case UCHAR_AGE:
409             u_versionFromString(props.age, v);  // Writes 0.0.0.0 if v is not numeric.
410             break;
411         case UCHAR_BIDI_MIRRORING_GLYPH:
412             props.bmg=parseCodePoint(v, errorCode);
413             break;
414         case UCHAR_BIDI_PAIRED_BRACKET:
415             props.bpb=parseCodePoint(v, errorCode);
416             break;
417         case UCHAR_SIMPLE_CASE_FOLDING:
418             props.scf=parseCodePoint(v, errorCode);
419             break;
420         case UCHAR_SIMPLE_LOWERCASE_MAPPING:
421             props.slc=parseCodePoint(v, errorCode);
422             break;
423         case UCHAR_SIMPLE_TITLECASE_MAPPING:
424             props.stc=parseCodePoint(v, errorCode);
425             break;
426         case UCHAR_SIMPLE_UPPERCASE_MAPPING:
427             props.suc=parseCodePoint(v, errorCode);
428             break;
429         case UCHAR_CASE_FOLDING:
430             parseString(v, props.cf, errorCode);
431             break;
432         case UCHAR_LOWERCASE_MAPPING:
433             parseString(v, props.lc, errorCode);
434             break;
435         case UCHAR_TITLECASE_MAPPING:
436             parseString(v, props.tc, errorCode);
437             break;
438         case UCHAR_UPPERCASE_MAPPING:
439             parseString(v, props.uc, errorCode);
440             break;
441         case PPUCD_NAME_ALIAS:
442             props.nameAlias=v;
443             break;
444         case PPUCD_CONDITIONAL_CASE_MAPPINGS:
445         case PPUCD_TURKIC_CASE_FOLDING:
446             // No need to parse their values: They are hardcoded in the runtime library.
447             break;
448         case UCHAR_SCRIPT_EXTENSIONS:
449             parseScriptExtensions(v, props.scx, errorCode);
450             break;
451         default:
452             // Ignore unhandled properties.
453             return TRUE;
454         }
455     }
456     if(U_SUCCESS(errorCode)) {
457         newValues.add((UChar32)prop);
458         return TRUE;
459     } else {
460         return FALSE;
461     }
462 }
463 
464 UBool
getRangeForAlgNames(UChar32 & start,UChar32 & end,UErrorCode & errorCode)465 PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
466     if(U_FAILURE(errorCode)) { return FALSE; }
467     if(lineType!=ALG_NAMES_RANGE_LINE) {
468         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
469         return FALSE;
470     }
471     firstField();
472     const char *field=nextField();
473     if(field==NULL) {
474         // No range field after the type.
475         fprintf(stderr,
476                 "error in preparsed UCD: missing algnamesrange range field "
477                 "(no second field) on line %ld\n",
478                 (long)lineNumber);
479         errorCode=U_PARSE_ERROR;
480         return FALSE;
481     }
482     return parseCodePointRange(field, start, end, errorCode);
483 }
484 
485 UChar32
parseCodePoint(const char * s,UErrorCode & errorCode)486 PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) {
487     char *end;
488     uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16);
489     if(end<=s || *end!=0 || value>=0x110000) {
490         fprintf(stderr,
491                 "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
492                 s, (long)lineNumber);
493         errorCode=U_PARSE_ERROR;
494         return U_SENTINEL;
495     }
496     return (UChar32)value;
497 }
498 
499 UBool
parseCodePointRange(const char * s,UChar32 & start,UChar32 & end,UErrorCode & errorCode)500 PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
501     uint32_t st, e;
502     u_parseCodePointRange(s, &st, &e, &errorCode);
503     if(U_FAILURE(errorCode)) {
504         fprintf(stderr,
505                 "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n",
506                 s, (long)lineNumber);
507         return FALSE;
508     }
509     start=(UChar32)st;
510     end=(UChar32)e;
511     return TRUE;
512 }
513 
514 void
parseString(const char * s,UnicodeString & uni,UErrorCode & errorCode)515 PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) {
516     UChar *buffer=uni.getBuffer(-1);
517     int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
518     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
519         errorCode=U_ZERO_ERROR;
520         uni.releaseBuffer(0);
521         buffer=uni.getBuffer(length);
522         length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
523     }
524     uni.releaseBuffer(length);
525     if(U_FAILURE(errorCode)) {
526         fprintf(stderr,
527                 "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n",
528                 s, (long)lineNumber);
529     }
530 }
531 
532 void
parseScriptExtensions(const char * s,UnicodeSet & scx,UErrorCode & errorCode)533 PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
534     if(U_FAILURE(errorCode)) { return; }
535     scx.clear();
536     CharString scString;
537     for(;;) {
538         const char *scs;
539         const char *scLimit=strchr(s, ' ');
540         if(scLimit!=NULL) {
541             scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data();
542             if(U_FAILURE(errorCode)) { return; }
543         } else {
544             scs=s;
545         }
546         int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs);
547         if(script==UCHAR_INVALID_CODE) {
548             fprintf(stderr,
549                     "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
550                     scs, (long)lineNumber);
551             errorCode=U_PARSE_ERROR;
552             return;
553         } else if(scx.contains(script)) {
554             fprintf(stderr,
555                     "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
556                     scs, (long)lineNumber);
557             errorCode=U_PARSE_ERROR;
558             return;
559         } else {
560             scx.add(script);
561         }
562         if(scLimit!=NULL) {
563             s=scLimit+1;
564         } else {
565             break;
566         }
567     }
568     if(scx.isEmpty()) {
569         fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
570         errorCode=U_PARSE_ERROR;
571     }
572 }
573 
574 U_NAMESPACE_END
575