1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 1999-2015, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  ubidiimp.h
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 1999aug06
14 *   created by: Markus W. Scherer, updated by Matitiahu Allouche
15 */
16 
17 #ifndef UBIDIIMP_H
18 #define UBIDIIMP_H
19 
20 #include "unicode/utypes.h"
21 #include "unicode/uchar.h"
22 #include "ubidi_props.h"
23 
24 /* miscellaneous definitions ---------------------------------------------- */
25 
26 typedef uint8_t DirProp;
27 typedef uint32_t Flags;
28 
29 /*  Comparing the description of the BiDi algorithm with this implementation
30     is easier with the same names for the BiDi types in the code as there.
31     See UCharDirection in uchar.h .
32 */
33 enum {
34     L=  U_LEFT_TO_RIGHT,                /*  0 */
35     R=  U_RIGHT_TO_LEFT,                /*  1 */
36     EN= U_EUROPEAN_NUMBER,              /*  2 */
37     ES= U_EUROPEAN_NUMBER_SEPARATOR,    /*  3 */
38     ET= U_EUROPEAN_NUMBER_TERMINATOR,   /*  4 */
39     AN= U_ARABIC_NUMBER,                /*  5 */
40     CS= U_COMMON_NUMBER_SEPARATOR,      /*  6 */
41     B=  U_BLOCK_SEPARATOR,              /*  7 */
42     S=  U_SEGMENT_SEPARATOR,            /*  8 */
43     WS= U_WHITE_SPACE_NEUTRAL,          /*  9 */
44     ON= U_OTHER_NEUTRAL,                /* 10 */
45     LRE=U_LEFT_TO_RIGHT_EMBEDDING,      /* 11 */
46     LRO=U_LEFT_TO_RIGHT_OVERRIDE,       /* 12 */
47     AL= U_RIGHT_TO_LEFT_ARABIC,         /* 13 */
48     RLE=U_RIGHT_TO_LEFT_EMBEDDING,      /* 14 */
49     RLO=U_RIGHT_TO_LEFT_OVERRIDE,       /* 15 */
50     PDF=U_POP_DIRECTIONAL_FORMAT,       /* 16 */
51     NSM=U_DIR_NON_SPACING_MARK,         /* 17 */
52     BN= U_BOUNDARY_NEUTRAL,             /* 18 */
53     FSI=U_FIRST_STRONG_ISOLATE,         /* 19 */
54     LRI=U_LEFT_TO_RIGHT_ISOLATE,        /* 20 */
55     RLI=U_RIGHT_TO_LEFT_ISOLATE,        /* 21 */
56     PDI=U_POP_DIRECTIONAL_ISOLATE,      /* 22 */
57     ENL,    /* EN after W7 */           /* 23 */
58     ENR,    /* EN not subject to W7 */  /* 24 */
59     dirPropCount
60 };
61 
62 /*  Sometimes, bit values are more appropriate
63     to deal with directionality properties.
64     Abbreviations in these macro names refer to names
65     used in the BiDi algorithm.
66 */
67 #define DIRPROP_FLAG(dir) (1UL<<(dir))
68 #define PURE_DIRPROP(prop)  ((prop)&~0xE0)    ?????????????????????????
69 
70 /* special flag for multiple runs from explicit embedding codes */
71 #define DIRPROP_FLAG_MULTI_RUNS (1UL<<31)
72 
73 /* are there any characters that are LTR or RTL? */
74 #define MASK_LTR (DIRPROP_FLAG(L)|DIRPROP_FLAG(EN)|DIRPROP_FLAG(ENL)|DIRPROP_FLAG(ENR)|DIRPROP_FLAG(AN)|DIRPROP_FLAG(LRE)|DIRPROP_FLAG(LRO)|DIRPROP_FLAG(LRI))
75 #define MASK_RTL (DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)|DIRPROP_FLAG(RLE)|DIRPROP_FLAG(RLO)|DIRPROP_FLAG(RLI))
76 #define MASK_R_AL (DIRPROP_FLAG(R)|DIRPROP_FLAG(AL))
77 #define MASK_STRONG_EN_AN (DIRPROP_FLAG(L)|DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)|DIRPROP_FLAG(EN)|DIRPROP_FLAG(AN))
78 
79 /* explicit embedding codes */
80 #define MASK_EXPLICIT (DIRPROP_FLAG(LRE)|DIRPROP_FLAG(LRO)|DIRPROP_FLAG(RLE)|DIRPROP_FLAG(RLO)|DIRPROP_FLAG(PDF))
81 
82 /* explicit isolate codes */
83 #define MASK_ISO (DIRPROP_FLAG(LRI)|DIRPROP_FLAG(RLI)|DIRPROP_FLAG(FSI)|DIRPROP_FLAG(PDI))
84 
85 #define MASK_BN_EXPLICIT (DIRPROP_FLAG(BN)|MASK_EXPLICIT)
86 
87 /* paragraph and segment separators */
88 #define MASK_B_S (DIRPROP_FLAG(B)|DIRPROP_FLAG(S))
89 
90 /* all types that are counted as White Space or Neutral in some steps */
91 #define MASK_WS (MASK_B_S|DIRPROP_FLAG(WS)|MASK_BN_EXPLICIT|MASK_ISO)
92 
93 /* types that are neutrals or could becomes neutrals in (Wn) */
94 #define MASK_POSSIBLE_N (DIRPROP_FLAG(ON)|DIRPROP_FLAG(CS)|DIRPROP_FLAG(ES)|DIRPROP_FLAG(ET)|MASK_WS)
95 
96 /*
97  *  These types may be changed to "e",
98  *  the embedding type (L or R) of the run,
99  *  in the BiDi algorithm (N2)
100  */
101 #define MASK_EMBEDDING (DIRPROP_FLAG(NSM)|MASK_POSSIBLE_N)
102 
103 /* the dirProp's L and R are defined to 0 and 1 values in UCharDirection */
104 #define GET_LR_FROM_LEVEL(level) ((DirProp)((level)&1))
105 
106 #define IS_DEFAULT_LEVEL(level) ((level)>=0xfe)
107 
108 /*
109  *  The following bit is used for the directional isolate status.
110  *  Stack entries corresponding to isolate sequences are greater than ISOLATE.
111  */
112 #define ISOLATE  0x0100
113 
114 U_CFUNC UBiDiLevel
115 ubidi_getParaLevelAtIndex(const UBiDi *pBiDi, int32_t index);
116 
117 #define GET_PARALEVEL(ubidi, index) \
118             ((UBiDiLevel)(!(ubidi)->defaultParaLevel || (index)<(ubidi)->paras[0].limit ? \
119                          (ubidi)->paraLevel : ubidi_getParaLevelAtIndex((ubidi), (index))))
120 
121 /* number of paras entries allocated initially without malloc */
122 #define SIMPLE_PARAS_COUNT      10
123 /* number of isolate entries allocated initially without malloc */
124 #define SIMPLE_ISOLATES_COUNT   5
125 /* number of isolate run entries for paired brackets allocated initially without malloc */
126 #define SIMPLE_OPENINGS_COUNT   20
127 
128 #define CR  0x000D
129 #define LF  0x000A
130 
131 /* Run structure for reordering --------------------------------------------- */
132 enum {
133     LRM_BEFORE=1,
134     LRM_AFTER=2,
135     RLM_BEFORE=4,
136     RLM_AFTER=8
137 };
138 
139 typedef struct Para {
140     int32_t limit;
141     int32_t level;
142 } Para;
143 
144 enum {                                  /* flags for Opening.flags */
145     FOUND_L=DIRPROP_FLAG(L),
146     FOUND_R=DIRPROP_FLAG(R)
147 };
148 
149 typedef struct Opening {
150     int32_t position;                   /* position of opening bracket */
151     int32_t match;                      /* matching char or -position of closing bracket */
152     int32_t contextPos;                 /* position of last strong char found before opening */
153     uint16_t flags;                     /* bits for L or R/AL found within the pair */
154     UBiDiDirection contextDir;          /* L or R according to last strong char before opening */
155     uint8_t filler;                     /* to complete a nice multiple of 4 chars */
156 } Opening;
157 
158 typedef struct IsoRun {
159     int32_t  contextPos;                /* position of char determining context */
160     uint16_t start;                     /* index of first opening entry for this run */
161     uint16_t limit;                     /* index after last opening entry for this run */
162     UBiDiLevel level;                   /* level of this run */
163     DirProp lastStrong;                 /* bidi class of last strong char found in this run */
164     DirProp lastBase;                   /* bidi class of last base char found in this run */
165     UBiDiDirection contextDir;          /* L or R to use as context for following openings */
166 } IsoRun;
167 
168 typedef struct BracketData {
169     UBiDi   *pBiDi;
170     /* array of opening entries which should be enough in most cases; no malloc() */
171     Opening simpleOpenings[SIMPLE_OPENINGS_COUNT];
172     Opening *openings;                  /* pointer to current array of entries */
173     int32_t openingsCount;              /* number of allocated entries */
174     int32_t isoRunLast;                 /* index of last used entry */
175     /* array of nested isolated sequence entries; can never excess UBIDI_MAX_EXPLICIT_LEVEL
176        + 1 for index 0, + 1 for before the first isolated sequence */
177     IsoRun  isoRuns[UBIDI_MAX_EXPLICIT_LEVEL+2];
178     UBool isNumbersSpecial;             /* reordering mode for NUMBERS_SPECIAL */
179 } BracketData;
180 
181 typedef struct Isolate {
182     int32_t startON;
183     int32_t start1;
184     int32_t state;
185     int16_t stateImp;
186 } Isolate;
187 
188 typedef struct Run {
189     int32_t logicalStart,   /* first character of the run; b31 indicates even/odd level */
190             visualLimit,    /* last visual position of the run +1 */
191             insertRemove;   /* if >0, flags for inserting LRM/RLM before/after run,
192                                if <0, count of bidi controls within run            */
193 } Run;
194 
195 /* in a Run, logicalStart will get this bit set if the run level is odd */
196 #define INDEX_ODD_BIT (1UL<<31)
197 
198 #define MAKE_INDEX_ODD_PAIR(index, level) ((index)|((int32_t)(level)<<31))
199 #define ADD_ODD_BIT_FROM_LEVEL(x, level)  ((x)|=((int32_t)(level)<<31))
200 #define REMOVE_ODD_BIT(x)                 ((x)&=~INDEX_ODD_BIT)
201 
202 #define GET_INDEX(x)   ((x)&~INDEX_ODD_BIT)
203 #define GET_ODD_BIT(x) ((uint32_t)(x)>>31)
204 #define IS_ODD_RUN(x)  ((UBool)(((x)&INDEX_ODD_BIT)!=0))
205 #define IS_EVEN_RUN(x) ((UBool)(((x)&INDEX_ODD_BIT)==0))
206 
207 U_CFUNC UBool
208 ubidi_getRuns(UBiDi *pBiDi, UErrorCode *pErrorCode);
209 
210 /** BiDi control code points */
211 enum {
212     ZWNJ_CHAR=0x200c,
213     ZWJ_CHAR,
214     LRM_CHAR,
215     RLM_CHAR,
216     LRE_CHAR=0x202a,
217     RLE_CHAR,
218     PDF_CHAR,
219     LRO_CHAR,
220     RLO_CHAR,
221     LRI_CHAR=0x2066,
222     RLI_CHAR,
223     FSI_CHAR,
224     PDI_CHAR
225 };
226 
227 #define IS_BIDI_CONTROL_CHAR(c) (((uint32_t)(c)&0xfffffffc)==ZWNJ_CHAR || (uint32_t)((c)-LRE_CHAR)<5 || (uint32_t)((c)-LRI_CHAR)<4)
228 
229 /* InsertPoints structure for noting where to put BiDi marks ---------------- */
230 
231 typedef struct Point {
232     int32_t pos;            /* position in text */
233     int32_t flag;           /* flag for LRM/RLM, before/after */
234 } Point;
235 
236 typedef struct InsertPoints {
237     int32_t capacity;       /* number of points allocated */
238     int32_t size;           /* number of points used */
239     int32_t confirmed;      /* number of points confirmed */
240     UErrorCode errorCode;   /* for eventual memory shortage */
241     Point *points;          /* pointer to array of points */
242 } InsertPoints;
243 
244 
245 /* UBiDi structure ----------------------------------------------------------- */
246 
247 struct UBiDi {
248     /* pointer to parent paragraph object (pointer to self if this object is
249      * a paragraph object); set to NULL in a newly opened object; set to a
250      * real value after a successful execution of ubidi_setPara or ubidi_setLine
251      */
252     const UBiDi * pParaBiDi;
253 
254     const UBiDiProps *bdp;
255 
256     /* alias pointer to the current text */
257     const UChar *text;
258 
259     /* length of the current text */
260     int32_t originalLength;
261 
262     /* if the UBIDI_OPTION_STREAMING option is set, this is the length
263      * of text actually processed by ubidi_setPara, which may be shorter than
264      * the original length.
265      * Otherwise, it is identical to the original length.
266      */
267     int32_t length;
268 
269     /* if the UBIDI_OPTION_REMOVE_CONTROLS option is set, and/or
270      * marks are allowed to be inserted in one of the reordering mode, the
271      * length of the result string may be different from the processed length.
272      */
273     int32_t resultLength;
274 
275     /* memory sizes in bytes */
276     int32_t dirPropsSize, levelsSize, openingsSize, parasSize, runsSize, isolatesSize;
277 
278     /* allocated memory */
279     DirProp *dirPropsMemory;
280     UBiDiLevel *levelsMemory;
281     Opening *openingsMemory;
282     Para *parasMemory;
283     Run *runsMemory;
284     Isolate *isolatesMemory;
285 
286     /* indicators for whether memory may be allocated after ubidi_open() */
287     UBool mayAllocateText, mayAllocateRuns;
288 
289     /* arrays with one value per text-character */
290     DirProp *dirProps;
291     UBiDiLevel *levels;
292 
293     /* are we performing an approximation of the "inverse BiDi" algorithm? */
294     UBool isInverse;
295 
296     /* are we using the basic algorithm or its variation? */
297     UBiDiReorderingMode reorderingMode;
298 
299     /* UBIDI_REORDER_xxx values must be ordered so that all the regular
300      * logical to visual modes come first, and all inverse BiDi modes
301      * come last.
302      */
303     #define UBIDI_REORDER_LAST_LOGICAL_TO_VISUAL    UBIDI_REORDER_NUMBERS_SPECIAL
304 
305     /* bitmask for reordering options */
306     uint32_t reorderingOptions;
307 
308     /* must block separators receive level 0? */
309     UBool orderParagraphsLTR;
310 
311     /* the paragraph level */
312     UBiDiLevel paraLevel;
313     /* original paraLevel when contextual */
314     /* must be one of UBIDI_DEFAULT_xxx or 0 if not contextual */
315     UBiDiLevel defaultParaLevel;
316 
317     /* context data */
318     const UChar *prologue;
319     int32_t proLength;
320     const UChar *epilogue;
321     int32_t epiLength;
322 
323     /* the following is set in ubidi_setPara, used in processPropertySeq */
324     const struct ImpTabPair * pImpTabPair;  /* pointer to levels state table pair */
325 
326     /* the overall paragraph or line directionality - see UBiDiDirection */
327     UBiDiDirection direction;
328 
329     /* flags is a bit set for which directional properties are in the text */
330     Flags flags;
331 
332     /* lastArabicPos is index to the last AL in the text, -1 if none */
333     int32_t lastArabicPos;
334 
335     /* characters after trailingWSStart are WS and are */
336     /* implicitly at the paraLevel (rule (L1)) - levels may not reflect that */
337     int32_t trailingWSStart;
338 
339     /* fields for paragraph handling */
340     int32_t paraCount;                  /* set in getDirProps() */
341     /* filled in getDirProps() */
342     Para *paras;
343 
344     /* for relatively short text, we only need a tiny array of paras (no malloc()) */
345     Para simpleParas[SIMPLE_PARAS_COUNT];
346 
347     /* fields for line reordering */
348     int32_t runCount;     /* ==-1: runs not set up yet */
349     Run *runs;
350 
351     /* for non-mixed text, we only need a tiny array of runs (no malloc()) */
352     Run simpleRuns[1];
353 
354     /* maximum or current nesting depth of isolate sequences */
355     /* Within resolveExplicitLevels() and checkExplicitLevels(), this is the maximal
356        nesting encountered.
357        Within resolveImplicitLevels(), this is the index of the current isolates
358        stack entry. */
359     int32_t isolateCount;
360     Isolate *isolates;
361 
362     /* for simple text, have a small stack (no malloc()) */
363     Isolate simpleIsolates[SIMPLE_ISOLATES_COUNT];
364 
365     /* for inverse Bidi with insertion of directional marks */
366     InsertPoints insertPoints;
367 
368     /* for option UBIDI_OPTION_REMOVE_CONTROLS */
369     int32_t controlCount;
370 
371     /* for Bidi class callback */
372     UBiDiClassCallback *fnClassCallback;    /* action pointer */
373     const void *coClassCallback;            /* context pointer */
374 };
375 
376 #define IS_VALID_PARA(x) ((x) && ((x)->pParaBiDi==(x)))
377 #define IS_VALID_PARA_OR_LINE(x) ((x) && ((x)->pParaBiDi==(x) || (((x)->pParaBiDi) && (x)->pParaBiDi->pParaBiDi==(x)->pParaBiDi)))
378 
379 typedef union {
380     DirProp *dirPropsMemory;
381     UBiDiLevel *levelsMemory;
382     Opening *openingsMemory;
383     Para *parasMemory;
384     Run *runsMemory;
385     Isolate *isolatesMemory;
386 } BidiMemoryForAllocation;
387 
388 /* Macros for initial checks at function entry */
389 #define RETURN_IF_NULL_OR_FAILING_ERRCODE(pErrcode, retvalue)   \
390         if((pErrcode)==NULL || U_FAILURE(*pErrcode)) return retvalue
391 #define RETURN_IF_NOT_VALID_PARA(bidi, errcode, retvalue)   \
392         if(!IS_VALID_PARA(bidi)) {  \
393             errcode=U_INVALID_STATE_ERROR;  \
394             return retvalue;                \
395         }
396 #define RETURN_IF_NOT_VALID_PARA_OR_LINE(bidi, errcode, retvalue)   \
397         if(!IS_VALID_PARA_OR_LINE(bidi)) {  \
398             errcode=U_INVALID_STATE_ERROR;  \
399             return retvalue;                \
400         }
401 #define RETURN_IF_BAD_RANGE(arg, start, limit, errcode, retvalue)   \
402         if((arg)<(start) || (arg)>=(limit)) {       \
403             (errcode)=U_ILLEGAL_ARGUMENT_ERROR;     \
404             return retvalue;                        \
405         }
406 
407 #define RETURN_VOID_IF_NULL_OR_FAILING_ERRCODE(pErrcode)   \
408         if((pErrcode)==NULL || U_FAILURE(*pErrcode)) return
409 #define RETURN_VOID_IF_NOT_VALID_PARA(bidi, errcode)   \
410         if(!IS_VALID_PARA(bidi)) {  \
411             errcode=U_INVALID_STATE_ERROR;  \
412             return;                \
413         }
414 #define RETURN_VOID_IF_NOT_VALID_PARA_OR_LINE(bidi, errcode)   \
415         if(!IS_VALID_PARA_OR_LINE(bidi)) {  \
416             errcode=U_INVALID_STATE_ERROR;  \
417             return;                \
418         }
419 #define RETURN_VOID_IF_BAD_RANGE(arg, start, limit, errcode)   \
420         if((arg)<(start) || (arg)>=(limit)) {       \
421             (errcode)=U_ILLEGAL_ARGUMENT_ERROR;     \
422             return;                        \
423         }
424 
425 /* helper function to (re)allocate memory if allowed */
426 U_CFUNC UBool
427 ubidi_getMemory(BidiMemoryForAllocation *pMemory, int32_t *pSize, UBool mayAllocate, int32_t sizeNeeded);
428 
429 /* helper macros for each allocated array in UBiDi */
430 #define getDirPropsMemory(pBiDi, length) \
431         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->dirPropsMemory, &(pBiDi)->dirPropsSize, \
432                         (pBiDi)->mayAllocateText, (length))
433 
434 #define getLevelsMemory(pBiDi, length) \
435         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->levelsMemory, &(pBiDi)->levelsSize, \
436                         (pBiDi)->mayAllocateText, (length))
437 
438 #define getRunsMemory(pBiDi, length) \
439         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->runsMemory, &(pBiDi)->runsSize, \
440                         (pBiDi)->mayAllocateRuns, (length)*sizeof(Run))
441 
442 /* additional macros used by ubidi_open() - always allow allocation */
443 #define getInitialDirPropsMemory(pBiDi, length) \
444         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->dirPropsMemory, &(pBiDi)->dirPropsSize, \
445                         TRUE, (length))
446 
447 #define getInitialLevelsMemory(pBiDi, length) \
448         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->levelsMemory, &(pBiDi)->levelsSize, \
449                         TRUE, (length))
450 
451 #define getInitialOpeningsMemory(pBiDi, length) \
452         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->openingsMemory, &(pBiDi)->openingsSize, \
453                         TRUE, (length)*sizeof(Opening))
454 
455 #define getInitialParasMemory(pBiDi, length) \
456         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->parasMemory, &(pBiDi)->parasSize, \
457                         TRUE, (length)*sizeof(Para))
458 
459 #define getInitialRunsMemory(pBiDi, length) \
460         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->runsMemory, &(pBiDi)->runsSize, \
461                         TRUE, (length)*sizeof(Run))
462 
463 #define getInitialIsolatesMemory(pBiDi, length) \
464         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->isolatesMemory, &(pBiDi)->isolatesSize, \
465                         TRUE, (length)*sizeof(Isolate))
466 
467 #endif
468