1 /*
2 *******************************************************************************
3 * Copyright (C) 2011-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: ppucd.cpp
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2011dec11
12 * created by: Markus W. Scherer
13 */
14
15 #include "unicode/utypes.h"
16 #include "unicode/uchar.h"
17 #include "charstr.h"
18 #include "cstring.h"
19 #include "ppucd.h"
20 #include "uassert.h"
21 #include "uparse.h"
22
23 #include <stdio.h>
24 #include <string.h>
25
26 U_NAMESPACE_BEGIN
27
~PropertyNames()28 PropertyNames::~PropertyNames() {}
29
30 int32_t
getPropertyEnum(const char * name) const31 PropertyNames::getPropertyEnum(const char *name) const {
32 return u_getPropertyEnum(name);
33 }
34
35 int32_t
getPropertyValueEnum(int32_t property,const char * name) const36 PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const {
37 return u_getPropertyValueEnum((UProperty)property, name);
38 }
39
UniProps()40 UniProps::UniProps()
41 : start(U_SENTINEL), end(U_SENTINEL),
42 bmg(U_SENTINEL), bpb(U_SENTINEL),
43 scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL),
44 digitValue(-1), numericValue(NULL),
45 name(NULL), nameAlias(NULL) {
46 memset(binProps, 0, sizeof(binProps));
47 memset(intProps, 0, sizeof(intProps));
48 memset(age, 0, 4);
49 }
50
~UniProps()51 UniProps::~UniProps() {}
52
53 const int32_t PreparsedUCD::kNumLineBuffers;
54
PreparsedUCD(const char * filename,UErrorCode & errorCode)55 PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode)
56 : icuPnames(new PropertyNames()), pnames(icuPnames),
57 file(NULL),
58 defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0),
59 lineNumber(0),
60 lineType(NO_LINE),
61 fieldLimit(NULL), lineLimit(NULL) {
62 if(U_FAILURE(errorCode)) { return; }
63
64 if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
65 filename=NULL;
66 file=stdin;
67 } else {
68 file=fopen(filename, "r");
69 }
70 if(file==NULL) {
71 perror("error opening preparsed UCD");
72 fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\"");
73 errorCode=U_FILE_ACCESS_ERROR;
74 return;
75 }
76
77 memset(ucdVersion, 0, 4);
78 lines[0][0]=0;
79 }
80
~PreparsedUCD()81 PreparsedUCD::~PreparsedUCD() {
82 if(file!=stdin) {
83 fclose(file);
84 }
85 delete icuPnames;
86 }
87
// Keywords that start each kind of line (the first field of the line).
// Same order as the LineType values.
// The two leading NULL entries belong to line types without a keyword;
// readLine() starts matching at EMPTY_LINE+1 and never dereferences them.
// Both the strings and the array itself are read-only.
static const char *const lineTypeStrings[]={
    NULL,
    NULL,
    "ucd",
    "property",
    "binary",
    "value",
    "defaults",
    "block",
    "cp",
    "algnamesrange"
};
101
102 PreparsedUCD::LineType
readLine(UErrorCode & errorCode)103 PreparsedUCD::readLine(UErrorCode &errorCode) {
104 if(U_FAILURE(errorCode)) { return NO_LINE; }
105 // Select the next available line buffer.
106 while(!isLineBufferAvailable(lineIndex)) {
107 ++lineIndex;
108 if (lineIndex == kNumLineBuffers) {
109 lineIndex = 0;
110 }
111 }
112 char *line=lines[lineIndex];
113 *line=0;
114 lineLimit=fieldLimit=line;
115 lineType=NO_LINE;
116 char *result=fgets(line, sizeof(lines[0]), file);
117 if(result==NULL) {
118 if(ferror(file)) {
119 perror("error reading preparsed UCD");
120 fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber);
121 errorCode=U_FILE_ACCESS_ERROR;
122 }
123 return NO_LINE;
124 }
125 ++lineNumber;
126 if(*line=='#') {
127 fieldLimit=strchr(line, 0);
128 return lineType=EMPTY_LINE;
129 }
130 // Remove trailing /r/n.
131 char c;
132 char *limit=strchr(line, 0);
133 while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; }
134 // Remove trailing white space.
135 while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; }
136 *limit=0;
137 lineLimit=limit;
138 if(line==limit) {
139 fieldLimit=limit;
140 return lineType=EMPTY_LINE;
141 }
142 // Split by ';'.
143 char *semi=line;
144 while((semi=strchr(semi, ';'))!=NULL) { *semi++=0; }
145 fieldLimit=strchr(line, 0);
146 // Determine the line type.
147 int32_t type;
148 for(type=EMPTY_LINE+1;; ++type) {
149 if(type==LINE_TYPE_COUNT) {
150 fprintf(stderr,
151 "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n",
152 line, (long)lineNumber);
153 errorCode=U_PARSE_ERROR;
154 return NO_LINE;
155 }
156 if(0==strcmp(line, lineTypeStrings[type])) {
157 break;
158 }
159 }
160 lineType=(LineType)type;
161 if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) {
162 u_versionFromString(ucdVersion, fieldLimit+1);
163 }
164 return lineType;
165 }
166
167 const char *
firstField()168 PreparsedUCD::firstField() {
169 char *field=lines[lineIndex];
170 fieldLimit=strchr(field, 0);
171 return field;
172 }
173
174 const char *
nextField()175 PreparsedUCD::nextField() {
176 if(fieldLimit==lineLimit) { return NULL; }
177 char *field=fieldLimit+1;
178 fieldLimit=strchr(field, 0);
179 return field;
180 }
181
182 const UniProps *
getProps(UnicodeSet & newValues,UErrorCode & errorCode)183 PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) {
184 if(U_FAILURE(errorCode)) { return NULL; }
185 newValues.clear();
186 if(!lineHasPropertyValues()) {
187 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
188 return NULL;
189 }
190 firstField();
191 const char *field=nextField();
192 if(field==NULL) {
193 // No range field after the type.
194 fprintf(stderr,
195 "error in preparsed UCD: missing default/block/cp range field "
196 "(no second field) on line %ld\n",
197 (long)lineNumber);
198 errorCode=U_PARSE_ERROR;
199 return NULL;
200 }
201 UChar32 start, end;
202 if(!parseCodePointRange(field, start, end, errorCode)) { return NULL; }
203 UniProps *props;
204 switch(lineType) {
205 case DEFAULTS_LINE:
206 if(defaultLineIndex>=0) {
207 fprintf(stderr,
208 "error in preparsed UCD: second line with default properties on line %ld\n",
209 (long)lineNumber);
210 errorCode=U_PARSE_ERROR;
211 return NULL;
212 }
213 if(start!=0 || end!=0x10ffff) {
214 fprintf(stderr,
215 "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
216 field, (long)lineNumber);
217 errorCode=U_PARSE_ERROR;
218 return NULL;
219 }
220 props=&defaultProps;
221 defaultLineIndex=lineIndex;
222 break;
223 case BLOCK_LINE:
224 blockProps=defaultProps; // Block inherits default properties.
225 props=&blockProps;
226 blockLineIndex=lineIndex;
227 break;
228 case CP_LINE:
229 if(blockProps.start<=start && end<=blockProps.end) {
230 // Code point range fully inside the last block inherits the block properties.
231 cpProps=blockProps;
232 } else if(start>blockProps.end || end<blockProps.start) {
233 // Code point range fully outside the last block inherits the default properties.
234 cpProps=defaultProps;
235 } else {
236 // Code point range partially overlapping with the last block is illegal.
237 fprintf(stderr,
238 "error in preparsed UCD: cp range %s on line %ld only "
239 "partially overlaps with block range %04lX..%04lX\n",
240 field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end);
241 errorCode=U_PARSE_ERROR;
242 return NULL;
243 }
244 props=&cpProps;
245 break;
246 default:
247 // Will not occur because of the range check above.
248 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
249 return NULL;
250 }
251 props->start=start;
252 props->end=end;
253 while((field=nextField())!=NULL) {
254 if(!parseProperty(*props, field, newValues, errorCode)) { return NULL; }
255 }
256 return props;
257 }
258
259 static const struct {
260 const char *name;
261 int32_t prop;
262 } ppucdProperties[]={
263 { "Name_Alias", PPUCD_NAME_ALIAS },
264 { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS },
265 { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING }
266 };
267
268 // Returns TRUE for "ok to continue parsing fields".
269 UBool
parseProperty(UniProps & props,const char * field,UnicodeSet & newValues,UErrorCode & errorCode)270 PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
271 UErrorCode &errorCode) {
272 CharString pBuffer;
273 const char *p=field;
274 const char *v=strchr(p, '=');
275 int binaryValue;
276 if(*p=='-') {
277 if(v!=NULL) {
278 fprintf(stderr,
279 "error in preparsed UCD: mix of binary-property-no and "
280 "enum-property syntax '%s' on line %ld\n",
281 field, (long)lineNumber);
282 errorCode=U_PARSE_ERROR;
283 return FALSE;
284 }
285 binaryValue=0;
286 ++p;
287 } else if(v==NULL) {
288 binaryValue=1;
289 } else {
290 binaryValue=-1;
291 // Copy out the property name rather than modifying the field (writing a NUL).
292 pBuffer.append(p, (int32_t)(v-p), errorCode);
293 p=pBuffer.data();
294 ++v;
295 }
296 int32_t prop=pnames->getPropertyEnum(p);
297 if(prop<0) {
298 for(int32_t i=0;; ++i) {
299 if(i==UPRV_LENGTHOF(ppucdProperties)) {
300 // Ignore unknown property names.
301 return TRUE;
302 }
303 if(0==uprv_stricmp(p, ppucdProperties[i].name)) {
304 prop=ppucdProperties[i].prop;
305 U_ASSERT(prop>=0);
306 break;
307 }
308 }
309 }
310 if(prop<UCHAR_BINARY_LIMIT) {
311 if(binaryValue>=0) {
312 props.binProps[prop]=(UBool)binaryValue;
313 } else {
314 // No binary value for a binary property.
315 fprintf(stderr,
316 "error in preparsed UCD: enum-property syntax '%s' "
317 "for binary property on line %ld\n",
318 field, (long)lineNumber);
319 errorCode=U_PARSE_ERROR;
320 }
321 } else if(binaryValue>=0) {
322 // Binary value for a non-binary property.
323 fprintf(stderr,
324 "error in preparsed UCD: binary-property syntax '%s' "
325 "for non-binary property on line %ld\n",
326 field, (long)lineNumber);
327 errorCode=U_PARSE_ERROR;
328 } else if (prop < UCHAR_INT_START) {
329 fprintf(stderr,
330 "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n",
331 prop, (long)lineNumber);
332 errorCode=U_PARSE_ERROR;
333 } else if(prop<UCHAR_INT_LIMIT) {
334 int32_t value=pnames->getPropertyValueEnum(prop, v);
335 if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) {
336 // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
337 char *end;
338 unsigned long ccc=uprv_strtoul(v, &end, 10);
339 if(v<end && *end==0 && ccc<=254) {
340 value=(int32_t)ccc;
341 }
342 }
343 if(value==UCHAR_INVALID_CODE) {
344 fprintf(stderr,
345 "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
346 field, (long)lineNumber);
347 errorCode=U_PARSE_ERROR;
348 } else {
349 props.intProps[prop-UCHAR_INT_START]=value;
350 }
351 } else if(*v=='<') {
352 // Do not parse default values like <code point>, just set null values.
353 switch(prop) {
354 case UCHAR_BIDI_MIRRORING_GLYPH:
355 props.bmg=U_SENTINEL;
356 break;
357 case UCHAR_BIDI_PAIRED_BRACKET:
358 props.bpb=U_SENTINEL;
359 break;
360 case UCHAR_SIMPLE_CASE_FOLDING:
361 props.scf=U_SENTINEL;
362 break;
363 case UCHAR_SIMPLE_LOWERCASE_MAPPING:
364 props.slc=U_SENTINEL;
365 break;
366 case UCHAR_SIMPLE_TITLECASE_MAPPING:
367 props.stc=U_SENTINEL;
368 break;
369 case UCHAR_SIMPLE_UPPERCASE_MAPPING:
370 props.suc=U_SENTINEL;
371 break;
372 case UCHAR_CASE_FOLDING:
373 props.cf.remove();
374 break;
375 case UCHAR_LOWERCASE_MAPPING:
376 props.lc.remove();
377 break;
378 case UCHAR_TITLECASE_MAPPING:
379 props.tc.remove();
380 break;
381 case UCHAR_UPPERCASE_MAPPING:
382 props.uc.remove();
383 break;
384 case UCHAR_SCRIPT_EXTENSIONS:
385 props.scx.clear();
386 break;
387 default:
388 fprintf(stderr,
389 "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
390 field, (long)lineNumber);
391 errorCode=U_PARSE_ERROR;
392 }
393 } else {
394 char c;
395 switch(prop) {
396 case UCHAR_NUMERIC_VALUE:
397 props.numericValue=v;
398 c=*v;
399 if('0'<=c && c<='9' && v[1]==0) {
400 props.digitValue=c-'0';
401 } else {
402 props.digitValue=-1;
403 }
404 break;
405 case UCHAR_NAME:
406 props.name=v;
407 break;
408 case UCHAR_AGE:
409 u_versionFromString(props.age, v); // Writes 0.0.0.0 if v is not numeric.
410 break;
411 case UCHAR_BIDI_MIRRORING_GLYPH:
412 props.bmg=parseCodePoint(v, errorCode);
413 break;
414 case UCHAR_BIDI_PAIRED_BRACKET:
415 props.bpb=parseCodePoint(v, errorCode);
416 break;
417 case UCHAR_SIMPLE_CASE_FOLDING:
418 props.scf=parseCodePoint(v, errorCode);
419 break;
420 case UCHAR_SIMPLE_LOWERCASE_MAPPING:
421 props.slc=parseCodePoint(v, errorCode);
422 break;
423 case UCHAR_SIMPLE_TITLECASE_MAPPING:
424 props.stc=parseCodePoint(v, errorCode);
425 break;
426 case UCHAR_SIMPLE_UPPERCASE_MAPPING:
427 props.suc=parseCodePoint(v, errorCode);
428 break;
429 case UCHAR_CASE_FOLDING:
430 parseString(v, props.cf, errorCode);
431 break;
432 case UCHAR_LOWERCASE_MAPPING:
433 parseString(v, props.lc, errorCode);
434 break;
435 case UCHAR_TITLECASE_MAPPING:
436 parseString(v, props.tc, errorCode);
437 break;
438 case UCHAR_UPPERCASE_MAPPING:
439 parseString(v, props.uc, errorCode);
440 break;
441 case PPUCD_NAME_ALIAS:
442 props.nameAlias=v;
443 break;
444 case PPUCD_CONDITIONAL_CASE_MAPPINGS:
445 case PPUCD_TURKIC_CASE_FOLDING:
446 // No need to parse their values: They are hardcoded in the runtime library.
447 break;
448 case UCHAR_SCRIPT_EXTENSIONS:
449 parseScriptExtensions(v, props.scx, errorCode);
450 break;
451 default:
452 // Ignore unhandled properties.
453 return TRUE;
454 }
455 }
456 if(U_SUCCESS(errorCode)) {
457 newValues.add((UChar32)prop);
458 return TRUE;
459 } else {
460 return FALSE;
461 }
462 }
463
464 UBool
getRangeForAlgNames(UChar32 & start,UChar32 & end,UErrorCode & errorCode)465 PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
466 if(U_FAILURE(errorCode)) { return FALSE; }
467 if(lineType!=ALG_NAMES_RANGE_LINE) {
468 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
469 return FALSE;
470 }
471 firstField();
472 const char *field=nextField();
473 if(field==NULL) {
474 // No range field after the type.
475 fprintf(stderr,
476 "error in preparsed UCD: missing algnamesrange range field "
477 "(no second field) on line %ld\n",
478 (long)lineNumber);
479 errorCode=U_PARSE_ERROR;
480 return FALSE;
481 }
482 return parseCodePointRange(field, start, end, errorCode);
483 }
484
485 UChar32
parseCodePoint(const char * s,UErrorCode & errorCode)486 PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) {
487 char *end;
488 uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16);
489 if(end<=s || *end!=0 || value>=0x110000) {
490 fprintf(stderr,
491 "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
492 s, (long)lineNumber);
493 errorCode=U_PARSE_ERROR;
494 return U_SENTINEL;
495 }
496 return (UChar32)value;
497 }
498
499 UBool
parseCodePointRange(const char * s,UChar32 & start,UChar32 & end,UErrorCode & errorCode)500 PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
501 uint32_t st, e;
502 u_parseCodePointRange(s, &st, &e, &errorCode);
503 if(U_FAILURE(errorCode)) {
504 fprintf(stderr,
505 "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n",
506 s, (long)lineNumber);
507 return FALSE;
508 }
509 start=(UChar32)st;
510 end=(UChar32)e;
511 return TRUE;
512 }
513
514 void
parseString(const char * s,UnicodeString & uni,UErrorCode & errorCode)515 PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) {
516 UChar *buffer=uni.getBuffer(-1);
517 int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
518 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
519 errorCode=U_ZERO_ERROR;
520 uni.releaseBuffer(0);
521 buffer=uni.getBuffer(length);
522 length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
523 }
524 uni.releaseBuffer(length);
525 if(U_FAILURE(errorCode)) {
526 fprintf(stderr,
527 "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n",
528 s, (long)lineNumber);
529 }
530 }
531
532 void
parseScriptExtensions(const char * s,UnicodeSet & scx,UErrorCode & errorCode)533 PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
534 if(U_FAILURE(errorCode)) { return; }
535 scx.clear();
536 CharString scString;
537 for(;;) {
538 const char *scs;
539 const char *scLimit=strchr(s, ' ');
540 if(scLimit!=NULL) {
541 scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data();
542 if(U_FAILURE(errorCode)) { return; }
543 } else {
544 scs=s;
545 }
546 int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs);
547 if(script==UCHAR_INVALID_CODE) {
548 fprintf(stderr,
549 "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
550 scs, (long)lineNumber);
551 errorCode=U_PARSE_ERROR;
552 return;
553 } else if(scx.contains(script)) {
554 fprintf(stderr,
555 "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
556 scs, (long)lineNumber);
557 errorCode=U_PARSE_ERROR;
558 return;
559 } else {
560 scx.add(script);
561 }
562 if(scLimit!=NULL) {
563 s=scLimit+1;
564 } else {
565 break;
566 }
567 }
568 if(scx.isEmpty()) {
569 fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
570 errorCode=U_PARSE_ERROR;
571 }
572 }
573
574 U_NAMESPACE_END
575