/*
*******************************************************************************
*   Copyright (C) 2011-2013, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  ppucd.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2011dec11
*   created by: Markus W. Scherer
*/

15 #ifndef __PPUCD_H__
16 #define __PPUCD_H__
17 
18 #include "unicode/utypes.h"
19 #include "unicode/uniset.h"
20 #include "unicode/unistr.h"
21 
22 #include <stdio.h>
23 
24 /** Additions to the uchar.h enum UProperty. */
25 enum {
26     /** Name_Alias */
27     PPUCD_NAME_ALIAS=UCHAR_STRING_LIMIT,
28     PPUCD_CONDITIONAL_CASE_MAPPINGS,
29     PPUCD_TURKIC_CASE_FOLDING
30 };
31 
32 U_NAMESPACE_BEGIN
33 
34 class U_TOOLUTIL_API PropertyNames {
35 public:
36     virtual ~PropertyNames();
37     virtual int32_t getPropertyEnum(const char *name) const;
38     virtual int32_t getPropertyValueEnum(int32_t property, const char *name) const;
39 };
40 
41 struct U_TOOLUTIL_API UniProps {
42     UniProps();
43     ~UniProps();
44 
getIntPropUniProps45     int32_t getIntProp(int32_t prop) const { return intProps[prop-UCHAR_INT_START]; }
46 
47     UChar32 start, end;
48     UBool binProps[UCHAR_BINARY_LIMIT];
49     int32_t intProps[UCHAR_INT_LIMIT-UCHAR_INT_START];
50     UVersionInfo age;
51     UChar32 bmg, bpb;
52     UChar32 scf, slc, stc, suc;
53     int32_t digitValue;
54     const char *numericValue;
55     const char *name;
56     const char *nameAlias;
57     UnicodeString cf, lc, tc, uc;
58     UnicodeSet scx;
59 };
60 
61 class U_TOOLUTIL_API PreparsedUCD {
62 public:
63     enum LineType {
64         /** No line, end of file. */
65         NO_LINE,
66         /** Empty line. (Might contain a comment.) */
67         EMPTY_LINE,
68 
69         /** ucd;6.1.0 */
70         UNICODE_VERSION_LINE,
71 
72         /** property;Binary;Alpha;Alphabetic */
73         PROPERTY_LINE,
74         /** binary;N;No;F;False */
75         BINARY_LINE,
76         /** value;gc;Zs;Space_Separator */
77         VALUE_LINE,
78 
79         /** defaults;0000..10FFFF;age=NA;bc=L;... */
80         DEFAULTS_LINE,
81         /** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */
82         BLOCK_LINE,
83         /** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */
84         CP_LINE,
85 
86         /** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */
87         ALG_NAMES_RANGE_LINE,
88 
89         LINE_TYPE_COUNT
90     };
91 
92     /**
93      * Constructor.
94      * Prepare this object for a new, empty package.
95      */
96     PreparsedUCD(const char *filename, UErrorCode &errorCode);
97 
98     /** Destructor. */
99     ~PreparsedUCD();
100 
101     /** Sets (aliases) a non-standard PropertyNames implementation. Caller retains ownership. */
setPropertyNames(const PropertyNames * pn)102     void setPropertyNames(const PropertyNames *pn) { pnames=pn; }
103 
104     /**
105      * Reads a line from the preparsed UCD file.
106      * Splits the line by replacing each ';' with a NUL.
107      */
108     LineType readLine(UErrorCode &errorCode);
109 
110     /** Returns the number of the line read by readLine(). */
getLineNumber()111     int32_t getLineNumber() const { return lineNumber; }
112 
113     /** Returns the line's next field, or NULL. */
114     const char *nextField();
115 
116     /** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */
getUnicodeVersion()117     const UVersionInfo &getUnicodeVersion() const { return ucdVersion; }
118 
119     /** Returns TRUE if the current line has property values. */
lineHasPropertyValues()120     UBool lineHasPropertyValues() const { return DEFAULTS_LINE<=lineType && lineType<=CP_LINE; }
121 
122     /**
123      * Parses properties from the current line.
124      * Clears newValues and sets UProperty codes for property values mentioned
125      * on the current line (as opposed to being inherited).
126      * Returns a pointer to the filled-in UniProps, or NULL if something went wrong.
127      * The returned UniProps are usable until the next line of the same type is read.
128      */
129     const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode);
130 
131     /**
132      * Returns the code point range for the current algnamesrange line.
133      * Calls & parses nextField().
134      * Further nextField() calls will yield the range's type & prefix string.
135      * Returns U_SUCCESS(errorCode).
136      */
137     UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode);
138 
139 private:
isLineBufferAvailable(int32_t i)140     UBool isLineBufferAvailable(int32_t i) {
141         return defaultLineIndex!=i && blockLineIndex!=i;
142     }
143 
144     /** Resets the field iterator and returns the line's first field (the line type field). */
145     const char *firstField();
146 
147     UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
148                         UErrorCode &errorCode);
149     UChar32 parseCodePoint(const char *s, UErrorCode &errorCode);
150     UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode);
151     void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode);
152     void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode);
153 
154     static const int32_t kNumLineBuffers=3;
155 
156     PropertyNames *icuPnames;  // owned
157     const PropertyNames *pnames;  // aliased
158     FILE *file;
159     int32_t defaultLineIndex, blockLineIndex, lineIndex;
160     int32_t lineNumber;
161     LineType lineType;
162     char *fieldLimit;
163     char *lineLimit;
164 
165     UVersionInfo ucdVersion;
166     UniProps defaultProps, blockProps, cpProps;
167     // Multiple lines so that default and block properties can maintain pointers
168     // into their line buffers.
169     char lines[kNumLineBuffers][4096];
170 };
171 
172 U_NAMESPACE_END
173 
174 #endif  // __PPUCD_H__
175