1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2000-2010, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  uparse.h
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2000apr18
16 *   created by: Markus W. Scherer
17 *
18 *   This file provides a parser for files that are delimited by one single
19 *   character like ';' or TAB. Example: the Unicode Character Properties files
20 *   like UnicodeData.txt are semicolon-delimited.
21 */
22 
23 #ifndef __UPARSE_H__
24 #define __UPARSE_H__
25 
26 #include "unicode/utypes.h"
27 
28 /**
29  * Is c an invariant-character whitespace?
30  * @param c invariant character
31  */
32 #define U_IS_INV_WHITESPACE(c) ((c)==' ' || (c)=='\t' || (c)=='\r' || (c)=='\n')
33 
34 U_CDECL_BEGIN
35 
36 /**
37  * Skip space ' ' and TAB '\t' characters.
38  *
39  * @param s Pointer to characters.
40  * @return Pointer to first character at or after s that is not a space or TAB.
41  */
42 U_CAPI const char * U_EXPORT2
43 u_skipWhitespace(const char *s);
44 
45 /**
46  * Trim whitespace (including line endings) from the end of the string.
47  *
48  * @param s Pointer to the string.
49  * @return Pointer to the new end of the string.
50  */
51 U_CAPI char * U_EXPORT2
52 u_rtrim(char *s);
53 
54 /** Function type for u_parseDelimitedFile(). */
55 typedef void U_CALLCONV
56 UParseLineFn(void *context,
57               char *fields[][2],
58               int32_t fieldCount,
59               UErrorCode *pErrorCode);
60 
61 /**
62  * Parser for files that are similar to UnicodeData.txt:
63  * This function opens the file and reads it line by line. It skips empty lines
64  * and comment lines that start with a '#'.
65  * All other lines are separated into fields with one delimiter character
66  * (semicolon for Unicode Properties files) between two fields. The last field in
67  * a line does not need to be terminated with a delimiter.
68  *
69  * For each line, after segmenting it, a line function is called.
70  * It gets passed the array of field start and limit pointers that is
71  * passed into this parser and filled by it for each line.
72  * For each field i of the line, the start pointer in fields[i][0]
73  * points to the beginning of the field, while the limit pointer in fields[i][1]
74  * points behind the field, i.e., to the delimiter or the line end.
75  *
76  * The context parameter of the line function is
77  * the same as the one for the parse function.
78  *
79  * The line function may modify the contents of the fields including the
80  * limit characters.
81  *
82  * If the file cannot be opened, or there is a parsing error or a field function
83  * sets *pErrorCode, then the parser returns with *pErrorCode set to an error code.
84  */
85 U_CAPI void U_EXPORT2
86 u_parseDelimitedFile(const char *filename, char delimiter,
87                      char *fields[][2], int32_t fieldCount,
88                      UParseLineFn *lineFn, void *context,
89                      UErrorCode *pErrorCode);
90 
91 /**
92  * Parse a string of code points like 0061 0308 0300.
93  * s must end with either ';' or NUL.
94  *
95  * @return Number of code points.
96  */
97 U_CAPI int32_t U_EXPORT2
98 u_parseCodePoints(const char *s,
99                   uint32_t *dest, int32_t destCapacity,
100                   UErrorCode *pErrorCode);
101 
102 /**
103  * Parse a list of code points like 0061 0308 0300
104  * into a UChar * string.
105  * s must end with either ';' or NUL.
106  *
107  * Set the first code point in *pFirst.
108  *
109  * @param s Input char * string.
110  * @param dest Output string buffer.
111  * @param destCapacity Capacity of dest in numbers of UChars.
112  * @param pFirst If pFirst!=NULL the *pFirst will be set to the first
113  *               code point in the string.
114  * @param pErrorCode ICU error code.
115  * @return The length of the string in numbers of UChars.
116  */
117 U_CAPI int32_t U_EXPORT2
118 u_parseString(const char *s,
119               UChar *dest, int32_t destCapacity,
120               uint32_t *pFirst,
121               UErrorCode *pErrorCode);
122 
123 /**
124  * Parse a code point range like
125  * 0085 or
126  * 4E00..9FA5.
127  *
128  * s must contain such a range and end with either ';' or NUL.
129  *
130  * @return Length of code point range, end-start+1
131  */
132 U_CAPI int32_t U_EXPORT2
133 u_parseCodePointRange(const char *s,
134                       uint32_t *pStart, uint32_t *pEnd,
135                       UErrorCode *pErrorCode);
136 
137 /**
138  * Same as u_parseCodePointRange() but the range may be terminated by
139  * any character. The position of the terminating character is returned via
140  * the *terminator output parameter.
141  */
142 U_CAPI int32_t U_EXPORT2
143 u_parseCodePointRangeAnyTerminator(const char *s,
144                                    uint32_t *pStart, uint32_t *pEnd,
145                                    const char **terminator,
146                                    UErrorCode *pErrorCode);
147 
148 U_CAPI int32_t U_EXPORT2
149 u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status);
150 
151 U_CDECL_END
152 
153 #endif
154