1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *   Copyright (C) 2011-2013, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 *   file name:  ppucd.h
9 *   encoding:   UTF-8
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2011dec11
14 *   created by: Markus W. Scherer
15 */
16 
17 #ifndef __PPUCD_H__
18 #define __PPUCD_H__
19 
20 #include "unicode/utypes.h"
21 #include "unicode/uniset.h"
22 #include "unicode/unistr.h"
23 
24 #include <stdio.h>
25 
26 /** Additions to the uchar.h enum UProperty. */
27 enum {
28     /** Name_Alias */
29     PPUCD_NAME_ALIAS=UCHAR_STRING_LIMIT,
30     PPUCD_CONDITIONAL_CASE_MAPPINGS,
31     PPUCD_TURKIC_CASE_FOLDING
32 };
33 
34 U_NAMESPACE_BEGIN
35 
36 class U_TOOLUTIL_API PropertyNames {
37 public:
38     virtual ~PropertyNames();
39     virtual int32_t getPropertyEnum(const char *name) const;
40     virtual int32_t getPropertyValueEnum(int32_t property, const char *name) const;
41 };
42 
43 struct U_TOOLUTIL_API UniProps {
44     UniProps();
45     ~UniProps();
46 
getIntPropUniProps47     int32_t getIntProp(int32_t prop) const { return intProps[prop-UCHAR_INT_START]; }
48 
49     UChar32 start, end;
50     UBool binProps[UCHAR_BINARY_LIMIT];
51     int32_t intProps[UCHAR_INT_LIMIT-UCHAR_INT_START];
52     UVersionInfo age;
53     UChar32 bmg, bpb;
54     UChar32 scf, slc, stc, suc;
55     int32_t digitValue;
56     const char *numericValue;
57     const char *name;
58     const char *nameAlias;
59     UnicodeString cf, lc, tc, uc;
60     UnicodeSet scx;
61 };
62 
63 class U_TOOLUTIL_API PreparsedUCD {
64 public:
65     enum LineType {
66         /** No line, end of file. */
67         NO_LINE,
68         /** Empty line. (Might contain a comment.) */
69         EMPTY_LINE,
70 
71         /** ucd;6.1.0 */
72         UNICODE_VERSION_LINE,
73 
74         /** property;Binary;Alpha;Alphabetic */
75         PROPERTY_LINE,
76         /** binary;N;No;F;False */
77         BINARY_LINE,
78         /** value;gc;Zs;Space_Separator */
79         VALUE_LINE,
80 
81         /** defaults;0000..10FFFF;age=NA;bc=L;... */
82         DEFAULTS_LINE,
83         /** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */
84         BLOCK_LINE,
85         /** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */
86         CP_LINE,
87         /** unassigned;E01F0..E0FFF;bc=BN;CWKCF;DI;GCB=CN;NFKC_CF= */
88         UNASSIGNED_LINE,
89 
90         /** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */
91         ALG_NAMES_RANGE_LINE,
92 
93         LINE_TYPE_COUNT
94     };
95 
96     /**
97      * Constructor.
98      * Prepare this object for a new, empty package.
99      */
100     PreparsedUCD(const char *filename, UErrorCode &errorCode);
101 
102     /** Destructor. */
103     ~PreparsedUCD();
104 
105     /** Sets (aliases) a non-standard PropertyNames implementation. Caller retains ownership. */
setPropertyNames(const PropertyNames * pn)106     void setPropertyNames(const PropertyNames *pn) { pnames=pn; }
107 
108     /**
109      * Reads a line from the preparsed UCD file.
110      * Splits the line by replacing each ';' with a NUL.
111      */
112     LineType readLine(UErrorCode &errorCode);
113 
114     /** Returns the number of the line read by readLine(). */
getLineNumber()115     int32_t getLineNumber() const { return lineNumber; }
116 
117     /** Returns the line's next field, or NULL. */
118     const char *nextField();
119 
120     /** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */
getUnicodeVersion()121     const UVersionInfo &getUnicodeVersion() const { return ucdVersion; }
122 
123     /** Returns TRUE if the current line has property values. */
lineHasPropertyValues()124     UBool lineHasPropertyValues() const {
125         return DEFAULTS_LINE<=lineType && lineType<=UNASSIGNED_LINE;
126     }
127 
128     /**
129      * Parses properties from the current line.
130      * Clears newValues and sets UProperty codes for property values mentioned
131      * on the current line (as opposed to being inherited).
132      * Returns a pointer to the filled-in UniProps, or NULL if something went wrong.
133      * The returned UniProps are usable until the next line of the same type is read.
134      */
135     const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode);
136 
137     /**
138      * Returns the code point range for the current algnamesrange line.
139      * Calls & parses nextField().
140      * Further nextField() calls will yield the range's type & prefix string.
141      * Returns U_SUCCESS(errorCode).
142      */
143     UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode);
144 
145 private:
isLineBufferAvailable(int32_t i)146     UBool isLineBufferAvailable(int32_t i) {
147         return defaultLineIndex!=i && blockLineIndex!=i;
148     }
149 
150     /** Resets the field iterator and returns the line's first field (the line type field). */
151     const char *firstField();
152 
153     UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
154                         UErrorCode &errorCode);
155     UChar32 parseCodePoint(const char *s, UErrorCode &errorCode);
156     UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode);
157     void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode);
158     void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode);
159 
160     static const int32_t kNumLineBuffers=3;
161 
162     PropertyNames *icuPnames;  // owned
163     const PropertyNames *pnames;  // aliased
164     FILE *file;
165     int32_t defaultLineIndex, blockLineIndex, lineIndex;
166     int32_t lineNumber;
167     LineType lineType;
168     char *fieldLimit;
169     char *lineLimit;
170 
171     UVersionInfo ucdVersion;
172     UniProps defaultProps, blockProps, cpProps;
173     UnicodeSet blockValues;
174     // Multiple lines so that default and block properties can maintain pointers
175     // into their line buffers.
176     char lines[kNumLineBuffers][4096];
177 };
178 
179 U_NAMESPACE_END
180 
181 #endif  // __PPUCD_H__
182