1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 2000-2015, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  ubidiwrt.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 1999aug06
14 *   created by: Markus W. Scherer, updated by Matitiahu Allouche
15 *
16 * This file contains implementations for BiDi functions that use
17 * the core algorithm and core API to write reordered text.
18 */
19 
20 #include "unicode/utypes.h"
21 #include "unicode/ustring.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ubidi.h"
24 #include "unicode/utf16.h"
25 #include "cmemory.h"
26 #include "ustr_imp.h"
27 #include "ubidiimp.h"
28 
29 /*
30  * The function implementations in this file are designed
31  * for UTF-16 and UTF-32, not for UTF-8.
32  *
33  * Assumptions that are not true for UTF-8:
34  * - Any code point always needs the same number of code units
35  *   ("minimum-length-problem" of UTF-8)
36  * - The BiDi control characters need only one code unit each
37  *
38  * Further assumptions for all UTFs:
39  * - u_charMirror(c) needs the same number of code units as c
40  */
41 #if UTF_SIZE==8
42 # error reimplement ubidi_writeReordered() for UTF-8, see comment above
43 #endif
44 
/*
 * Nonzero if the general-category value `type` (as passed from u_charType())
 * is U_NON_SPACING_MARK, U_COMBINING_SPACING_MARK or U_ENCLOSING_MARK.
 * The test is a single bit-mask lookup: bit `type` against the set of the
 * three mark categories.
 */
#define IS_COMBINING(type) \
    ((1UL<<(type)) & \
     ((1UL<<U_NON_SPACING_MARK) | (1UL<<U_COMBINING_SPACING_MARK) | (1UL<<U_ENCLOSING_MARK)))
46 
47 /*
48  * When we have UBIDI_OUTPUT_REVERSE set on ubidi_writeReordered(), then we
49  * semantically write RTL runs in reverse and later reverse them again.
50  * Instead, we actually write them in forward order to begin with.
51  * However, if the RTL run was to be mirrored, we need to mirror here now
52  * since the implicit second reversal must not do it.
53  * It looks strange to do mirroring in LTR output, but it is only because
54  * we are writing RTL output in reverse.
55  */
56 static int32_t
doWriteForward(const UChar * src,int32_t srcLength,UChar * dest,int32_t destSize,uint16_t options,UErrorCode * pErrorCode)57 doWriteForward(const UChar *src, int32_t srcLength,
58                UChar *dest, int32_t destSize,
59                uint16_t options,
60                UErrorCode *pErrorCode) {
61     /* optimize for several combinations of options */
62     switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING)) {
63     case 0: {
64         /* simply copy the LTR run to the destination */
65         int32_t length=srcLength;
66         if(destSize<length) {
67             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
68             return srcLength;
69         }
70         do {
71             *dest++=*src++;
72         } while(--length>0);
73         return srcLength;
74     }
75     case UBIDI_DO_MIRRORING: {
76         /* do mirroring */
77         int32_t i=0, j=0;
78         UChar32 c;
79 
80         if(destSize<srcLength) {
81             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
82             return srcLength;
83         }
84         do {
85             U16_NEXT(src, i, srcLength, c);
86             c=u_charMirror(c);
87             U16_APPEND_UNSAFE(dest, j, c);
88         } while(i<srcLength);
89         return srcLength;
90     }
91     case UBIDI_REMOVE_BIDI_CONTROLS: {
92         /* copy the LTR run and remove any BiDi control characters */
93         int32_t remaining=destSize;
94         UChar c;
95         do {
96             c=*src++;
97             if(!IS_BIDI_CONTROL_CHAR(c)) {
98                 if(--remaining<0) {
99                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
100 
101                     /* preflight the length */
102                     while(--srcLength>0) {
103                         c=*src++;
104                         if(!IS_BIDI_CONTROL_CHAR(c)) {
105                             --remaining;
106                         }
107                     }
108                     return destSize-remaining;
109                 }
110                 *dest++=c;
111             }
112         } while(--srcLength>0);
113         return destSize-remaining;
114     }
115     default: {
116         /* remove BiDi control characters and do mirroring */
117         int32_t remaining=destSize;
118         int32_t i, j=0;
119         UChar32 c;
120         do {
121             i=0;
122             U16_NEXT(src, i, srcLength, c);
123             src+=i;
124             srcLength-=i;
125             if(!IS_BIDI_CONTROL_CHAR(c)) {
126                 remaining-=i;
127                 if(remaining<0) {
128                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
129 
130                     /* preflight the length */
131                     while(srcLength>0) {
132                         c=*src++;
133                         if(!IS_BIDI_CONTROL_CHAR(c)) {
134                             --remaining;
135                         }
136                         --srcLength;
137                     }
138                     return destSize-remaining;
139                 }
140                 c=u_charMirror(c);
141                 U16_APPEND_UNSAFE(dest, j, c);
142             }
143         } while(srcLength>0);
144         return j;
145     }
146     } /* end of switch */
147 }
148 
149 static int32_t
doWriteReverse(const UChar * src,int32_t srcLength,UChar * dest,int32_t destSize,uint16_t options,UErrorCode * pErrorCode)150 doWriteReverse(const UChar *src, int32_t srcLength,
151                UChar *dest, int32_t destSize,
152                uint16_t options,
153                UErrorCode *pErrorCode) {
154     /*
155      * RTL run -
156      *
157      * RTL runs need to be copied to the destination in reverse order
158      * of code points, not code units, to keep Unicode characters intact.
159      *
160      * The general strategy for this is to read the source text
161      * in backward order, collect all code units for a code point
162      * (and optionally following combining characters, see below),
163      * and copy all these code units in ascending order
164      * to the destination for this run.
165      *
166      * Several options request whether combining characters
167      * should be kept after their base characters,
168      * whether BiDi control characters should be removed, and
169      * whether characters should be replaced by their mirror-image
170      * equivalent Unicode characters.
171      */
172     int32_t i, j;
173     UChar32 c;
174 
175     /* optimize for several combinations of options */
176     switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING|UBIDI_KEEP_BASE_COMBINING)) {
177     case 0:
178         /*
179          * With none of the "complicated" options set, the destination
180          * run will have the same length as the source run,
181          * and there is no mirroring and no keeping combining characters
182          * with their base characters.
183          */
184         if(destSize<srcLength) {
185             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
186             return srcLength;
187         }
188         destSize=srcLength;
189 
190         /* preserve character integrity */
191         do {
192             /* i is always after the last code unit known to need to be kept in this segment */
193             i=srcLength;
194 
195             /* collect code units for one base character */
196             U16_BACK_1(src, 0, srcLength);
197 
198             /* copy this base character */
199             j=srcLength;
200             do {
201                 *dest++=src[j++];
202             } while(j<i);
203         } while(srcLength>0);
204         break;
205     case UBIDI_KEEP_BASE_COMBINING:
206         /*
207          * Here, too, the destination
208          * run will have the same length as the source run,
209          * and there is no mirroring.
210          * We do need to keep combining characters with their base characters.
211          */
212         if(destSize<srcLength) {
213             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
214             return srcLength;
215         }
216         destSize=srcLength;
217 
218         /* preserve character integrity */
219         do {
220             /* i is always after the last code unit known to need to be kept in this segment */
221             i=srcLength;
222 
223             /* collect code units and modifier letters for one base character */
224             do {
225                 U16_PREV(src, 0, srcLength, c);
226             } while(srcLength>0 && IS_COMBINING(u_charType(c)));
227 
228             /* copy this "user character" */
229             j=srcLength;
230             do {
231                 *dest++=src[j++];
232             } while(j<i);
233         } while(srcLength>0);
234         break;
235     default:
236         /*
237          * With several "complicated" options set, this is the most
238          * general and the slowest copying of an RTL run.
239          * We will do mirroring, remove BiDi controls, and
240          * keep combining characters with their base characters
241          * as requested.
242          */
243         if(!(options&UBIDI_REMOVE_BIDI_CONTROLS)) {
244             i=srcLength;
245         } else {
246             /* we need to find out the destination length of the run,
247                which will not include the BiDi control characters */
248             int32_t length=srcLength;
249             UChar ch;
250 
251             i=0;
252             do {
253                 ch=*src++;
254                 if(!IS_BIDI_CONTROL_CHAR(ch)) {
255                     ++i;
256                 }
257             } while(--length>0);
258             src-=srcLength;
259         }
260 
261         if(destSize<i) {
262             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
263             return i;
264         }
265         destSize=i;
266 
267         /* preserve character integrity */
268         do {
269             /* i is always after the last code unit known to need to be kept in this segment */
270             i=srcLength;
271 
272             /* collect code units for one base character */
273             U16_PREV(src, 0, srcLength, c);
274             if(options&UBIDI_KEEP_BASE_COMBINING) {
275                 /* collect modifier letters for this base character */
276                 while(srcLength>0 && IS_COMBINING(u_charType(c))) {
277                     U16_PREV(src, 0, srcLength, c);
278                 }
279             }
280 
281             if(options&UBIDI_REMOVE_BIDI_CONTROLS && IS_BIDI_CONTROL_CHAR(c)) {
282                 /* do not copy this BiDi control character */
283                 continue;
284             }
285 
286             /* copy this "user character" */
287             j=srcLength;
288             if(options&UBIDI_DO_MIRRORING) {
289                 /* mirror only the base character */
290                 int32_t k=0;
291                 c=u_charMirror(c);
292                 U16_APPEND_UNSAFE(dest, k, c);
293                 dest+=k;
294                 j+=k;
295             }
296             while(j<i) {
297                 *dest++=src[j++];
298             }
299         } while(srcLength>0);
300         break;
301     } /* end of switch */
302 
303     return destSize;
304 }
305 
306 U_CAPI int32_t U_EXPORT2
ubidi_writeReverse(const UChar * src,int32_t srcLength,UChar * dest,int32_t destSize,uint16_t options,UErrorCode * pErrorCode)307 ubidi_writeReverse(const UChar *src, int32_t srcLength,
308                    UChar *dest, int32_t destSize,
309                    uint16_t options,
310                    UErrorCode *pErrorCode) {
311     int32_t destLength;
312 
313     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
314         return 0;
315     }
316 
317     /* more error checking */
318     if( src==NULL || srcLength<-1 ||
319         destSize<0 || (destSize>0 && dest==NULL))
320     {
321         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
322         return 0;
323     }
324 
325     /* do input and output overlap? */
326     if( dest!=NULL &&
327         ((src>=dest && src<dest+destSize) ||
328          (dest>=src && dest<src+srcLength)))
329     {
330         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
331         return 0;
332     }
333 
334     if(srcLength==-1) {
335         srcLength=u_strlen(src);
336     }
337     if(srcLength>0) {
338         destLength=doWriteReverse(src, srcLength, dest, destSize, options, pErrorCode);
339     } else {
340         /* nothing to do */
341         destLength=0;
342     }
343 
344     return u_terminateUChars(dest, destSize, destLength, pErrorCode);
345 }
346 
347 U_CAPI int32_t U_EXPORT2
ubidi_writeReordered(UBiDi * pBiDi,UChar * dest,int32_t destSize,uint16_t options,UErrorCode * pErrorCode)348 ubidi_writeReordered(UBiDi *pBiDi,
349                      UChar *dest, int32_t destSize,
350                      uint16_t options,
351                      UErrorCode *pErrorCode) {
352     const UChar *text;
353     UChar *saveDest;
354     int32_t length, destCapacity;
355     int32_t run, runCount, logicalStart, runLength;
356 
357     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
358         return 0;
359     }
360 
361     /* more error checking */
362     if( pBiDi==NULL ||
363         (text=pBiDi->text)==NULL || (length=pBiDi->length)<0 ||
364         destSize<0 || (destSize>0 && dest==NULL))
365     {
366         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
367         return 0;
368     }
369 
370     /* do input and output overlap? */
371     if( dest!=NULL &&
372         ((text>=dest && text<dest+destSize) ||
373          (dest>=text && dest<text+pBiDi->originalLength)))
374     {
375         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
376         return 0;
377     }
378 
379     if(length==0) {
380         /* nothing to do */
381         return u_terminateUChars(dest, destSize, 0, pErrorCode);
382     }
383 
384     runCount=ubidi_countRuns(pBiDi, pErrorCode);
385     if(U_FAILURE(*pErrorCode)) {
386         return 0;
387     }
388 
389     /* destSize shrinks, later destination length=destCapacity-destSize */
390     saveDest=dest;
391     destCapacity=destSize;
392 
393     /*
394      * Option "insert marks" implies UBIDI_INSERT_LRM_FOR_NUMERIC if the
395      * reordering mode (checked below) is appropriate.
396      */
397     if(pBiDi->reorderingOptions & UBIDI_OPTION_INSERT_MARKS) {
398         options|=UBIDI_INSERT_LRM_FOR_NUMERIC;
399         options&=~UBIDI_REMOVE_BIDI_CONTROLS;
400     }
401     /*
402      * Option "remove controls" implies UBIDI_REMOVE_BIDI_CONTROLS
403      * and cancels UBIDI_INSERT_LRM_FOR_NUMERIC.
404      */
405     if(pBiDi->reorderingOptions & UBIDI_OPTION_REMOVE_CONTROLS) {
406         options|=UBIDI_REMOVE_BIDI_CONTROLS;
407         options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
408     }
409     /*
410      * If we do not perform the "inverse BiDi" algorithm, then we
411      * don't need to insert any LRMs, and don't need to test for it.
412      */
413     if((pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_NUMBERS_AS_L) &&
414        (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_LIKE_DIRECT)  &&
415        (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL) &&
416        (pBiDi->reorderingMode != UBIDI_REORDER_RUNS_ONLY)) {
417         options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
418     }
419     /*
420      * Iterate through all visual runs and copy the run text segments to
421      * the destination, according to the options.
422      *
423      * The tests for where to insert LRMs ignore the fact that there may be
424      * BN codes or non-BMP code points at the beginning and end of a run;
425      * they may insert LRMs unnecessarily but the tests are faster this way
426      * (this would have to be improved for UTF-8).
427      *
428      * Note that the only errors that are set by doWriteXY() are buffer overflow
429      * errors. Ignore them until the end, and continue for preflighting.
430      */
431     if(!(options&UBIDI_OUTPUT_REVERSE)) {
432         /* forward output */
433         if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
434             /* do not insert BiDi controls */
435             for(run=0; run<runCount; ++run) {
436                 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
437                     runLength=doWriteForward(text+logicalStart, runLength,
438                                              dest, destSize,
439                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
440                 } else {
441                     runLength=doWriteReverse(text+logicalStart, runLength,
442                                              dest, destSize,
443                                              options, pErrorCode);
444                 }
445                 if(dest!=NULL) {
446                   dest+=runLength;
447                 }
448                 destSize-=runLength;
449             }
450         } else {
451             /* insert BiDi controls for "inverse BiDi" */
452             const DirProp *dirProps=pBiDi->dirProps;
453             const UChar *src;
454             UChar uc;
455             UBiDiDirection dir;
456             int32_t markFlag;
457 
458             for(run=0; run<runCount; ++run) {
459                 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
460                 src=text+logicalStart;
461                 /* check if something relevant in insertPoints */
462                 markFlag=pBiDi->runs[run].insertRemove;
463                 if(markFlag<0) {        /* BiDi controls count */
464                     markFlag=0;
465                 }
466 
467                 if(UBIDI_LTR==dir) {
468                     if((pBiDi->isInverse) &&
469                        (/*run>0 &&*/ dirProps[logicalStart]!=L)) {
470                         markFlag |= LRM_BEFORE;
471                     }
472                     if (markFlag & LRM_BEFORE) {
473                         uc=LRM_CHAR;
474                     }
475                     else if (markFlag & RLM_BEFORE) {
476                         uc=RLM_CHAR;
477                     }
478                     else  uc=0;
479                     if(uc) {
480                         if(destSize>0) {
481                             *dest++=uc;
482                         }
483                         --destSize;
484                     }
485 
486                     runLength=doWriteForward(src, runLength,
487                                              dest, destSize,
488                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
489                     if(dest!=NULL) {
490                       dest+=runLength;
491                     }
492                     destSize-=runLength;
493 
494                     if((pBiDi->isInverse) &&
495                        (/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L)) {
496                         markFlag |= LRM_AFTER;
497                     }
498                     if (markFlag & LRM_AFTER) {
499                         uc=LRM_CHAR;
500                     }
501                     else if (markFlag & RLM_AFTER) {
502                         uc=RLM_CHAR;
503                     }
504                     else  uc=0;
505                     if(uc) {
506                         if(destSize>0) {
507                             *dest++=uc;
508                         }
509                         --destSize;
510                     }
511                 } else {                /* RTL run */
512                     if((pBiDi->isInverse) &&
513                        (/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1])))) {
514                         markFlag |= RLM_BEFORE;
515                     }
516                     if (markFlag & LRM_BEFORE) {
517                         uc=LRM_CHAR;
518                     }
519                     else if (markFlag & RLM_BEFORE) {
520                         uc=RLM_CHAR;
521                     }
522                     else  uc=0;
523                     if(uc) {
524                         if(destSize>0) {
525                             *dest++=uc;
526                         }
527                         --destSize;
528                     }
529 
530                     runLength=doWriteReverse(src, runLength,
531                                              dest, destSize,
532                                              options, pErrorCode);
533                     if(dest!=NULL) {
534                       dest+=runLength;
535                     }
536                     destSize-=runLength;
537 
538                     if((pBiDi->isInverse) &&
539                        (/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart])))) {
540                         markFlag |= RLM_AFTER;
541                     }
542                     if (markFlag & LRM_AFTER) {
543                         uc=LRM_CHAR;
544                     }
545                     else if (markFlag & RLM_AFTER) {
546                         uc=RLM_CHAR;
547                     }
548                     else  uc=0;
549                     if(uc) {
550                         if(destSize>0) {
551                             *dest++=uc;
552                         }
553                         --destSize;
554                     }
555                 }
556             }
557         }
558     } else {
559         /* reverse output */
560         if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
561             /* do not insert BiDi controls */
562             for(run=runCount; --run>=0;) {
563                 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
564                     runLength=doWriteReverse(text+logicalStart, runLength,
565                                              dest, destSize,
566                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
567                 } else {
568                     runLength=doWriteForward(text+logicalStart, runLength,
569                                              dest, destSize,
570                                              options, pErrorCode);
571                 }
572                 if(dest!=NULL) {
573                   dest+=runLength;
574                 }
575                 destSize-=runLength;
576             }
577         } else {
578             /* insert BiDi controls for "inverse BiDi" */
579             const DirProp *dirProps=pBiDi->dirProps;
580             const UChar *src;
581             UBiDiDirection dir;
582 
583             for(run=runCount; --run>=0;) {
584                 /* reverse output */
585                 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
586                 src=text+logicalStart;
587 
588                 if(UBIDI_LTR==dir) {
589                     if(/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L) {
590                         if(destSize>0) {
591                             *dest++=LRM_CHAR;
592                         }
593                         --destSize;
594                     }
595 
596                     runLength=doWriteReverse(src, runLength,
597                                              dest, destSize,
598                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
599                     if(dest!=NULL) {
600                       dest+=runLength;
601                     }
602                     destSize-=runLength;
603 
604                     if(/*run>0 &&*/ dirProps[logicalStart]!=L) {
605                         if(destSize>0) {
606                             *dest++=LRM_CHAR;
607                         }
608                         --destSize;
609                     }
610                 } else {
611                     if(/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart]))) {
612                         if(destSize>0) {
613                             *dest++=RLM_CHAR;
614                         }
615                         --destSize;
616                     }
617 
618                     runLength=doWriteForward(src, runLength,
619                                              dest, destSize,
620                                              options, pErrorCode);
621                     if(dest!=NULL) {
622                       dest+=runLength;
623                     }
624                     destSize-=runLength;
625 
626                     if(/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1]))) {
627                         if(destSize>0) {
628                             *dest++=RLM_CHAR;
629                         }
630                         --destSize;
631                     }
632                 }
633             }
634         }
635     }
636 
637     return u_terminateUChars(saveDest, destCapacity, destCapacity-destSize, pErrorCode);
638 }
639