1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 *   Copyright (C) 1999-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 ******************************************************************************
10 *   file name:  ubidiimp.h
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 1999aug06
16 *   created by: Markus W. Scherer, updated by Matitiahu Allouche
17 */
18 
19 #ifndef UBIDIIMP_H
20 #define UBIDIIMP_H
21 
22 #include "unicode/utypes.h"
23 #include "unicode/ubidi.h"
24 #include "unicode/uchar.h"
25 #include "ubidi_props.h"
26 
27 /* miscellaneous definitions ---------------------------------------------- */
28 
29 // ICU-20853=ICU-20935 Solaris #defines CS and ES in sys/regset.h
30 #ifdef CS
31 #   undef CS
32 #endif
33 #ifdef ES
34 #   undef ES
35 #endif
36 
37 typedef uint8_t DirProp;
38 typedef uint32_t Flags;
39 
40 /*  Comparing the description of the BiDi algorithm with this implementation
41     is easier with the same names for the BiDi types in the code as there.
42     See UCharDirection in uchar.h .
43 */
44 enum {
45     L=  U_LEFT_TO_RIGHT,                /*  0 */
46     R=  U_RIGHT_TO_LEFT,                /*  1 */
47     EN= U_EUROPEAN_NUMBER,              /*  2 */
48     ES= U_EUROPEAN_NUMBER_SEPARATOR,    /*  3 */
49     ET= U_EUROPEAN_NUMBER_TERMINATOR,   /*  4 */
50     AN= U_ARABIC_NUMBER,                /*  5 */
51     CS= U_COMMON_NUMBER_SEPARATOR,      /*  6 */
52     B=  U_BLOCK_SEPARATOR,              /*  7 */
53     S=  U_SEGMENT_SEPARATOR,            /*  8 */
54     WS= U_WHITE_SPACE_NEUTRAL,          /*  9 */
55     ON= U_OTHER_NEUTRAL,                /* 10 */
56     LRE=U_LEFT_TO_RIGHT_EMBEDDING,      /* 11 */
57     LRO=U_LEFT_TO_RIGHT_OVERRIDE,       /* 12 */
58     AL= U_RIGHT_TO_LEFT_ARABIC,         /* 13 */
59     RLE=U_RIGHT_TO_LEFT_EMBEDDING,      /* 14 */
60     RLO=U_RIGHT_TO_LEFT_OVERRIDE,       /* 15 */
61     PDF=U_POP_DIRECTIONAL_FORMAT,       /* 16 */
62     NSM=U_DIR_NON_SPACING_MARK,         /* 17 */
63     BN= U_BOUNDARY_NEUTRAL,             /* 18 */
64     FSI=U_FIRST_STRONG_ISOLATE,         /* 19 */
65     LRI=U_LEFT_TO_RIGHT_ISOLATE,        /* 20 */
66     RLI=U_RIGHT_TO_LEFT_ISOLATE,        /* 21 */
67     PDI=U_POP_DIRECTIONAL_ISOLATE,      /* 22 */
68     ENL,    /* EN after W7 */           /* 23 */
69     ENR,    /* EN not subject to W7 */  /* 24 */
70     dirPropCount
71 };
72 
73 /*  Sometimes, bit values are more appropriate
74     to deal with directionality properties.
75     Abbreviations in these macro names refer to names
76     used in the BiDi algorithm.
77 */
78 #define DIRPROP_FLAG(dir) (1UL<<(dir))
79 #define PURE_DIRPROP(prop)  ((prop)&~0xE0)    ?????????????????????????
80 
81 /* special flag for multiple runs from explicit embedding codes */
82 #define DIRPROP_FLAG_MULTI_RUNS (1UL<<31)
83 
84 /* are there any characters that are LTR or RTL? */
85 #define MASK_LTR (DIRPROP_FLAG(L)|DIRPROP_FLAG(EN)|DIRPROP_FLAG(ENL)|DIRPROP_FLAG(ENR)|DIRPROP_FLAG(AN)|DIRPROP_FLAG(LRE)|DIRPROP_FLAG(LRO)|DIRPROP_FLAG(LRI))
86 #define MASK_RTL (DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)|DIRPROP_FLAG(RLE)|DIRPROP_FLAG(RLO)|DIRPROP_FLAG(RLI))
87 #define MASK_R_AL (DIRPROP_FLAG(R)|DIRPROP_FLAG(AL))
88 #define MASK_STRONG_EN_AN (DIRPROP_FLAG(L)|DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)|DIRPROP_FLAG(EN)|DIRPROP_FLAG(AN))
89 
90 /* explicit embedding codes */
91 #define MASK_EXPLICIT (DIRPROP_FLAG(LRE)|DIRPROP_FLAG(LRO)|DIRPROP_FLAG(RLE)|DIRPROP_FLAG(RLO)|DIRPROP_FLAG(PDF))
92 
93 /* explicit isolate codes */
94 #define MASK_ISO (DIRPROP_FLAG(LRI)|DIRPROP_FLAG(RLI)|DIRPROP_FLAG(FSI)|DIRPROP_FLAG(PDI))
95 
96 #define MASK_BN_EXPLICIT (DIRPROP_FLAG(BN)|MASK_EXPLICIT)
97 
98 /* paragraph and segment separators */
99 #define MASK_B_S (DIRPROP_FLAG(B)|DIRPROP_FLAG(S))
100 
101 /* all types that are counted as White Space or Neutral in some steps */
102 #define MASK_WS (MASK_B_S|DIRPROP_FLAG(WS)|MASK_BN_EXPLICIT|MASK_ISO)
103 
104 /* types that are neutrals or could becomes neutrals in (Wn) */
105 #define MASK_POSSIBLE_N (DIRPROP_FLAG(ON)|DIRPROP_FLAG(CS)|DIRPROP_FLAG(ES)|DIRPROP_FLAG(ET)|MASK_WS)
106 
107 /*
108  *  These types may be changed to "e",
109  *  the embedding type (L or R) of the run,
110  *  in the BiDi algorithm (N2)
111  */
112 #define MASK_EMBEDDING (DIRPROP_FLAG(NSM)|MASK_POSSIBLE_N)
113 
114 /* the dirProp's L and R are defined to 0 and 1 values in UCharDirection */
115 #define GET_LR_FROM_LEVEL(level) ((DirProp)((level)&1))
116 
117 #define IS_DEFAULT_LEVEL(level) ((level)>=0xfe)
118 
119 /*
120  *  The following bit is used for the directional isolate status.
121  *  Stack entries corresponding to isolate sequences are greater than ISOLATE.
122  */
123 #define ISOLATE  0x0100
124 
125 U_CFUNC UBiDiLevel
126 ubidi_getParaLevelAtIndex(const UBiDi *pBiDi, int32_t index);
127 
128 #define GET_PARALEVEL(ubidi, index) \
129             ((UBiDiLevel)(!(ubidi)->defaultParaLevel || (index)<(ubidi)->paras[0].limit ? \
130                          (ubidi)->paraLevel : ubidi_getParaLevelAtIndex((ubidi), (index))))
131 
132 /* number of paras entries allocated initially without malloc */
133 #define SIMPLE_PARAS_COUNT      10
134 /* number of isolate entries allocated initially without malloc */
135 #define SIMPLE_ISOLATES_COUNT   5
136 /* number of isolate run entries for paired brackets allocated initially without malloc */
137 #define SIMPLE_OPENINGS_COUNT   20
138 
139 #define CR  0x000D
140 #define LF  0x000A
141 
142 /* Run structure for reordering --------------------------------------------- */
143 enum {
144     LRM_BEFORE=1,
145     LRM_AFTER=2,
146     RLM_BEFORE=4,
147     RLM_AFTER=8
148 };
149 
150 typedef struct Para {
151     int32_t limit;
152     int32_t level;
153 } Para;
154 
155 enum {                                  /* flags for Opening.flags */
156     FOUND_L=DIRPROP_FLAG(L),
157     FOUND_R=DIRPROP_FLAG(R)
158 };
159 
160 typedef struct Opening {
161     int32_t position;                   /* position of opening bracket */
162     int32_t match;                      /* matching char or -position of closing bracket */
163     int32_t contextPos;                 /* position of last strong char found before opening */
164     uint16_t flags;                     /* bits for L or R/AL found within the pair */
165     UBiDiDirection contextDir;          /* L or R according to last strong char before opening */
166     uint8_t filler;                     /* to complete a nice multiple of 4 chars */
167 } Opening;
168 
169 typedef struct IsoRun {
170     int32_t  contextPos;                /* position of char determining context */
171     uint16_t start;                     /* index of first opening entry for this run */
172     uint16_t limit;                     /* index after last opening entry for this run */
173     UBiDiLevel level;                   /* level of this run */
174     DirProp lastStrong;                 /* bidi class of last strong char found in this run */
175     DirProp lastBase;                   /* bidi class of last base char found in this run */
176     UBiDiDirection contextDir;          /* L or R to use as context for following openings */
177 } IsoRun;
178 
179 typedef struct BracketData {
180     UBiDi   *pBiDi;
181     /* array of opening entries which should be enough in most cases; no malloc() */
182     Opening simpleOpenings[SIMPLE_OPENINGS_COUNT];
183     Opening *openings;                  /* pointer to current array of entries */
184     int32_t openingsCount;              /* number of allocated entries */
185     int32_t isoRunLast;                 /* index of last used entry */
186     /* array of nested isolated sequence entries; can never excess UBIDI_MAX_EXPLICIT_LEVEL
187        + 1 for index 0, + 1 for before the first isolated sequence */
188     IsoRun  isoRuns[UBIDI_MAX_EXPLICIT_LEVEL+2];
189     UBool isNumbersSpecial;             /* reordering mode for NUMBERS_SPECIAL */
190 } BracketData;
191 
192 typedef struct Isolate {
193     int32_t startON;
194     int32_t start1;
195     int32_t state;
196     int16_t stateImp;
197 } Isolate;
198 
199 typedef struct Run {
200     int32_t logicalStart,   /* first character of the run; b31 indicates even/odd level */
201             visualLimit,    /* last visual position of the run +1 */
202             insertRemove;   /* if >0, flags for inserting LRM/RLM before/after run,
203                                if <0, count of bidi controls within run            */
204 } Run;
205 
206 /* in a Run, logicalStart will get this bit set if the run level is odd */
207 #define INDEX_ODD_BIT (1UL<<31)
208 
209 #define MAKE_INDEX_ODD_PAIR(index, level) ((index)|((int32_t)((level)&1)<<31))
210 #define ADD_ODD_BIT_FROM_LEVEL(x, level)  ((x)|=((int32_t)((level)&1)<<31))
211 #define REMOVE_ODD_BIT(x)                 ((x)&=~INDEX_ODD_BIT)
212 
213 #define GET_INDEX(x)   ((x)&~INDEX_ODD_BIT)
214 #define GET_ODD_BIT(x) ((uint32_t)(x)>>31)
215 #define IS_ODD_RUN(x)  ((UBool)(((x)&INDEX_ODD_BIT)!=0))
216 #define IS_EVEN_RUN(x) ((UBool)(((x)&INDEX_ODD_BIT)==0))
217 
218 U_CFUNC UBool
219 ubidi_getRuns(UBiDi *pBiDi, UErrorCode *pErrorCode);
220 
221 /** BiDi control code points */
222 enum {
223     ZWNJ_CHAR=0x200c,
224     ZWJ_CHAR,
225     LRM_CHAR,
226     RLM_CHAR,
227     LRE_CHAR=0x202a,
228     RLE_CHAR,
229     PDF_CHAR,
230     LRO_CHAR,
231     RLO_CHAR,
232     LRI_CHAR=0x2066,
233     RLI_CHAR,
234     FSI_CHAR,
235     PDI_CHAR
236 };
237 
238 #define IS_BIDI_CONTROL_CHAR(c) (((uint32_t)(c)&0xfffffffc)==ZWNJ_CHAR || (uint32_t)((c)-LRE_CHAR)<5 || (uint32_t)((c)-LRI_CHAR)<4)
239 
240 /* InsertPoints structure for noting where to put BiDi marks ---------------- */
241 
242 typedef struct Point {
243     int32_t pos;            /* position in text */
244     int32_t flag;           /* flag for LRM/RLM, before/after */
245 } Point;
246 
247 typedef struct InsertPoints {
248     int32_t capacity;       /* number of points allocated */
249     int32_t size;           /* number of points used */
250     int32_t confirmed;      /* number of points confirmed */
251     UErrorCode errorCode;   /* for eventual memory shortage */
252     Point *points;          /* pointer to array of points */
253 } InsertPoints;
254 
255 
256 /* UBiDi structure ----------------------------------------------------------- */
257 
258 struct UBiDi {
259     /* pointer to parent paragraph object (pointer to self if this object is
260      * a paragraph object); set to NULL in a newly opened object; set to a
261      * real value after a successful execution of ubidi_setPara or ubidi_setLine
262      */
263     const UBiDi * pParaBiDi;
264 
265     /* alias pointer to the current text */
266     const UChar *text;
267 
268     /* length of the current text */
269     int32_t originalLength;
270 
271     /* if the UBIDI_OPTION_STREAMING option is set, this is the length
272      * of text actually processed by ubidi_setPara, which may be shorter than
273      * the original length.
274      * Otherwise, it is identical to the original length.
275      */
276     int32_t length;
277 
278     /* if the UBIDI_OPTION_REMOVE_CONTROLS option is set, and/or
279      * marks are allowed to be inserted in one of the reordering mode, the
280      * length of the result string may be different from the processed length.
281      */
282     int32_t resultLength;
283 
284     /* memory sizes in bytes */
285     int32_t dirPropsSize, levelsSize, openingsSize, parasSize, runsSize, isolatesSize;
286 
287     /* allocated memory */
288     DirProp *dirPropsMemory;
289     UBiDiLevel *levelsMemory;
290     Opening *openingsMemory;
291     Para *parasMemory;
292     Run *runsMemory;
293     Isolate *isolatesMemory;
294 
295     /* indicators for whether memory may be allocated after ubidi_open() */
296     UBool mayAllocateText, mayAllocateRuns;
297 
298     /* arrays with one value per text-character */
299     DirProp *dirProps;
300     UBiDiLevel *levels;
301 
302     /* are we performing an approximation of the "inverse BiDi" algorithm? */
303     UBool isInverse;
304 
305     /* are we using the basic algorithm or its variation? */
306     UBiDiReorderingMode reorderingMode;
307 
308     /* UBIDI_REORDER_xxx values must be ordered so that all the regular
309      * logical to visual modes come first, and all inverse BiDi modes
310      * come last.
311      */
312     #define UBIDI_REORDER_LAST_LOGICAL_TO_VISUAL    UBIDI_REORDER_NUMBERS_SPECIAL
313 
314     /* bitmask for reordering options */
315     uint32_t reorderingOptions;
316 
317     /* must block separators receive level 0? */
318     UBool orderParagraphsLTR;
319 
320     /* the paragraph level */
321     UBiDiLevel paraLevel;
322     /* original paraLevel when contextual */
323     /* must be one of UBIDI_DEFAULT_xxx or 0 if not contextual */
324     UBiDiLevel defaultParaLevel;
325 
326     /* context data */
327     const UChar *prologue;
328     int32_t proLength;
329     const UChar *epilogue;
330     int32_t epiLength;
331 
332     /* the following is set in ubidi_setPara, used in processPropertySeq */
333     const struct ImpTabPair * pImpTabPair;  /* pointer to levels state table pair */
334 
335     /* the overall paragraph or line directionality - see UBiDiDirection */
336     UBiDiDirection direction;
337 
338     /* flags is a bit set for which directional properties are in the text */
339     Flags flags;
340 
341     /* lastArabicPos is index to the last AL in the text, -1 if none */
342     int32_t lastArabicPos;
343 
344     /* characters after trailingWSStart are WS and are */
345     /* implicitly at the paraLevel (rule (L1)) - levels may not reflect that */
346     int32_t trailingWSStart;
347 
348     /* fields for paragraph handling */
349     int32_t paraCount;                  /* set in getDirProps() */
350     /* filled in getDirProps() */
351     Para *paras;
352 
353     /* for relatively short text, we only need a tiny array of paras (no malloc()) */
354     Para simpleParas[SIMPLE_PARAS_COUNT];
355 
356     /* fields for line reordering */
357     int32_t runCount;     /* ==-1: runs not set up yet */
358     Run *runs;
359 
360     /* for non-mixed text, we only need a tiny array of runs (no malloc()) */
361     Run simpleRuns[1];
362 
363     /* maximum or current nesting depth of isolate sequences */
364     /* Within resolveExplicitLevels() and checkExplicitLevels(), this is the maximal
365        nesting encountered.
366        Within resolveImplicitLevels(), this is the index of the current isolates
367        stack entry. */
368     int32_t isolateCount;
369     Isolate *isolates;
370 
371     /* for simple text, have a small stack (no malloc()) */
372     Isolate simpleIsolates[SIMPLE_ISOLATES_COUNT];
373 
374     /* for inverse Bidi with insertion of directional marks */
375     InsertPoints insertPoints;
376 
377     /* for option UBIDI_OPTION_REMOVE_CONTROLS */
378     int32_t controlCount;
379 
380     /* for Bidi class callback */
381     UBiDiClassCallback *fnClassCallback;    /* action pointer */
382     const void *coClassCallback;            /* context pointer */
383 };
384 
385 #define IS_VALID_PARA(x) ((x) && ((x)->pParaBiDi==(x)))
386 #define IS_VALID_PARA_OR_LINE(x) ((x) && ((x)->pParaBiDi==(x) || (((x)->pParaBiDi) && (x)->pParaBiDi->pParaBiDi==(x)->pParaBiDi)))
387 
388 typedef union {
389     DirProp *dirPropsMemory;
390     UBiDiLevel *levelsMemory;
391     Opening *openingsMemory;
392     Para *parasMemory;
393     Run *runsMemory;
394     Isolate *isolatesMemory;
395 } BidiMemoryForAllocation;
396 
397 /* Macros for initial checks at function entry */
398 #define RETURN_IF_NULL_OR_FAILING_ERRCODE(pErrcode, retvalue) UPRV_BLOCK_MACRO_BEGIN { \
399     if((pErrcode)==NULL || U_FAILURE(*pErrcode)) return retvalue; \
400 } UPRV_BLOCK_MACRO_END
401 #define RETURN_IF_NOT_VALID_PARA(bidi, errcode, retvalue) UPRV_BLOCK_MACRO_BEGIN { \
402     if(!IS_VALID_PARA(bidi)) { \
403         errcode=U_INVALID_STATE_ERROR; \
404         return retvalue; \
405     } \
406 } UPRV_BLOCK_MACRO_END
407 #define RETURN_IF_NOT_VALID_PARA_OR_LINE(bidi, errcode, retvalue) UPRV_BLOCK_MACRO_BEGIN { \
408     if(!IS_VALID_PARA_OR_LINE(bidi)) { \
409         errcode=U_INVALID_STATE_ERROR; \
410         return retvalue; \
411     } \
412 } UPRV_BLOCK_MACRO_END
413 #define RETURN_IF_BAD_RANGE(arg, start, limit, errcode, retvalue) UPRV_BLOCK_MACRO_BEGIN { \
414     if((arg)<(start) || (arg)>=(limit)) { \
415         (errcode)=U_ILLEGAL_ARGUMENT_ERROR; \
416         return retvalue; \
417     } \
418 } UPRV_BLOCK_MACRO_END
419 
420 #define RETURN_VOID_IF_NULL_OR_FAILING_ERRCODE(pErrcode) UPRV_BLOCK_MACRO_BEGIN { \
421     if((pErrcode)==NULL || U_FAILURE(*pErrcode)) return; \
422 } UPRV_BLOCK_MACRO_END
423 #define RETURN_VOID_IF_NOT_VALID_PARA(bidi, errcode) UPRV_BLOCK_MACRO_BEGIN { \
424     if(!IS_VALID_PARA(bidi)) { \
425         errcode=U_INVALID_STATE_ERROR; \
426         return; \
427     } \
428 } UPRV_BLOCK_MACRO_END
429 #define RETURN_VOID_IF_NOT_VALID_PARA_OR_LINE(bidi, errcode) UPRV_BLOCK_MACRO_BEGIN { \
430     if(!IS_VALID_PARA_OR_LINE(bidi)) { \
431         errcode=U_INVALID_STATE_ERROR; \
432         return; \
433     } \
434 } UPRV_BLOCK_MACRO_END
435 #define RETURN_VOID_IF_BAD_RANGE(arg, start, limit, errcode) UPRV_BLOCK_MACRO_BEGIN { \
436     if((arg)<(start) || (arg)>=(limit)) { \
437         (errcode)=U_ILLEGAL_ARGUMENT_ERROR; \
438         return; \
439     } \
440 } UPRV_BLOCK_MACRO_END
441 
442 /* helper function to (re)allocate memory if allowed */
443 U_CFUNC UBool
444 ubidi_getMemory(BidiMemoryForAllocation *pMemory, int32_t *pSize, UBool mayAllocate, int32_t sizeNeeded);
445 
446 /* helper macros for each allocated array in UBiDi */
447 #define getDirPropsMemory(pBiDi, length) \
448         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->dirPropsMemory, &(pBiDi)->dirPropsSize, \
449                         (pBiDi)->mayAllocateText, (length))
450 
451 #define getLevelsMemory(pBiDi, length) \
452         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->levelsMemory, &(pBiDi)->levelsSize, \
453                         (pBiDi)->mayAllocateText, (length))
454 
455 #define getRunsMemory(pBiDi, length) \
456         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->runsMemory, &(pBiDi)->runsSize, \
457                         (pBiDi)->mayAllocateRuns, (length)*sizeof(Run))
458 
459 /* additional macros used by ubidi_open() - always allow allocation */
460 #define getInitialDirPropsMemory(pBiDi, length) \
461         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->dirPropsMemory, &(pBiDi)->dirPropsSize, \
462                         true, (length))
463 
464 #define getInitialLevelsMemory(pBiDi, length) \
465         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->levelsMemory, &(pBiDi)->levelsSize, \
466                         true, (length))
467 
468 #define getInitialOpeningsMemory(pBiDi, length) \
469         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->openingsMemory, &(pBiDi)->openingsSize, \
470                         true, (length)*sizeof(Opening))
471 
472 #define getInitialParasMemory(pBiDi, length) \
473         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->parasMemory, &(pBiDi)->parasSize, \
474                         true, (length)*sizeof(Para))
475 
476 #define getInitialRunsMemory(pBiDi, length) \
477         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->runsMemory, &(pBiDi)->runsSize, \
478                         true, (length)*sizeof(Run))
479 
480 #define getInitialIsolatesMemory(pBiDi, length) \
481         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->isolatesMemory, &(pBiDi)->isolatesSize, \
482                         true, (length)*sizeof(Isolate))
483 
484 #endif
485