1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 *   Copyright (C) 1998-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 ******************************************************************************
10 *
11 * File ustring.cpp
12 *
13 * Modification History:
14 *
15 *   Date        Name        Description
16 *   12/07/98    bertrand    Creation.
17 ******************************************************************************
18 */
19 
20 #include "unicode/utypes.h"
21 #include "unicode/putil.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ustring.h"
24 #include "unicode/utf16.h"
25 #include "cstring.h"
26 #include "cwchar.h"
27 #include "cmemory.h"
28 #include "ustr_imp.h"
29 
30 /* ANSI string.h - style functions ------------------------------------------ */
31 
32 /* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */
33 #define U_BMP_MAX 0xffff
34 
35 /* Forward binary string search functions ----------------------------------- */
36 
37 /*
38  * Test if a substring match inside a string is at code point boundaries.
39  * All pointers refer to the same buffer.
40  * The limit pointer may be NULL, all others must be real pointers.
41  */
42 static inline UBool
isMatchAtCPBoundary(const UChar * start,const UChar * match,const UChar * matchLimit,const UChar * limit)43 isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) {
44     if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) {
45         /* the leading edge of the match is in the middle of a surrogate pair */
46         return FALSE;
47     }
48     if(U16_IS_LEAD(*(matchLimit-1)) && match!=limit && U16_IS_TRAIL(*matchLimit)) {
49         /* the trailing edge of the match is in the middle of a surrogate pair */
50         return FALSE;
51     }
52     return TRUE;
53 }
54 
55 U_CAPI UChar * U_EXPORT2
u_strFindFirst(const UChar * s,int32_t length,const UChar * sub,int32_t subLength)56 u_strFindFirst(const UChar *s, int32_t length,
57                const UChar *sub, int32_t subLength) {
58     const UChar *start, *p, *q, *subLimit;
59     UChar c, cs, cq;
60 
61     if(sub==NULL || subLength<-1) {
62         return (UChar *)s;
63     }
64     if(s==NULL || length<-1) {
65         return NULL;
66     }
67 
68     start=s;
69 
70     if(length<0 && subLength<0) {
71         /* both strings are NUL-terminated */
72         if((cs=*sub++)==0) {
73             return (UChar *)s;
74         }
75         if(*sub==0 && !U16_IS_SURROGATE(cs)) {
76             /* the substring consists of a single, non-surrogate BMP code point */
77             return u_strchr(s, cs);
78         }
79 
80         while((c=*s++)!=0) {
81             if(c==cs) {
82                 /* found first substring UChar, compare rest */
83                 p=s;
84                 q=sub;
85                 for(;;) {
86                     if((cq=*q)==0) {
87                         if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
88                             return (UChar *)(s-1); /* well-formed match */
89                         } else {
90                             break; /* no match because surrogate pair is split */
91                         }
92                     }
93                     if((c=*p)==0) {
94                         return NULL; /* no match, and none possible after s */
95                     }
96                     if(c!=cq) {
97                         break; /* no match */
98                     }
99                     ++p;
100                     ++q;
101                 }
102             }
103         }
104 
105         /* not found */
106         return NULL;
107     }
108 
109     if(subLength<0) {
110         subLength=u_strlen(sub);
111     }
112     if(subLength==0) {
113         return (UChar *)s;
114     }
115 
116     /* get sub[0] to search for it fast */
117     cs=*sub++;
118     --subLength;
119     subLimit=sub+subLength;
120 
121     if(subLength==0 && !U16_IS_SURROGATE(cs)) {
122         /* the substring consists of a single, non-surrogate BMP code point */
123         return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length);
124     }
125 
126     if(length<0) {
127         /* s is NUL-terminated */
128         while((c=*s++)!=0) {
129             if(c==cs) {
130                 /* found first substring UChar, compare rest */
131                 p=s;
132                 q=sub;
133                 for(;;) {
134                     if(q==subLimit) {
135                         if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
136                             return (UChar *)(s-1); /* well-formed match */
137                         } else {
138                             break; /* no match because surrogate pair is split */
139                         }
140                     }
141                     if((c=*p)==0) {
142                         return NULL; /* no match, and none possible after s */
143                     }
144                     if(c!=*q) {
145                         break; /* no match */
146                     }
147                     ++p;
148                     ++q;
149                 }
150             }
151         }
152     } else {
153         const UChar *limit, *preLimit;
154 
155         /* subLength was decremented above */
156         if(length<=subLength) {
157             return NULL; /* s is shorter than sub */
158         }
159 
160         limit=s+length;
161 
162         /* the substring must start before preLimit */
163         preLimit=limit-subLength;
164 
165         while(s!=preLimit) {
166             c=*s++;
167             if(c==cs) {
168                 /* found first substring UChar, compare rest */
169                 p=s;
170                 q=sub;
171                 for(;;) {
172                     if(q==subLimit) {
173                         if(isMatchAtCPBoundary(start, s-1, p, limit)) {
174                             return (UChar *)(s-1); /* well-formed match */
175                         } else {
176                             break; /* no match because surrogate pair is split */
177                         }
178                     }
179                     if(*p!=*q) {
180                         break; /* no match */
181                     }
182                     ++p;
183                     ++q;
184                 }
185             }
186         }
187     }
188 
189     /* not found */
190     return NULL;
191 }
192 
193 U_CAPI UChar * U_EXPORT2
u_strstr(const UChar * s,const UChar * substring)194 u_strstr(const UChar *s, const UChar *substring) {
195     return u_strFindFirst(s, -1, substring, -1);
196 }
197 
198 U_CAPI UChar * U_EXPORT2
u_strchr(const UChar * s,UChar c)199 u_strchr(const UChar *s, UChar c) {
200     if(U16_IS_SURROGATE(c)) {
201         /* make sure to not find half of a surrogate pair */
202         return u_strFindFirst(s, -1, &c, 1);
203     } else {
204         UChar cs;
205 
206         /* trivial search for a BMP code point */
207         for(;;) {
208             if((cs=*s)==c) {
209                 return (UChar *)s;
210             }
211             if(cs==0) {
212                 return NULL;
213             }
214             ++s;
215         }
216     }
217 }
218 
219 U_CAPI UChar * U_EXPORT2
u_strchr32(const UChar * s,UChar32 c)220 u_strchr32(const UChar *s, UChar32 c) {
221     if((uint32_t)c<=U_BMP_MAX) {
222         /* find BMP code point */
223         return u_strchr(s, (UChar)c);
224     } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
225         /* find supplementary code point as surrogate pair */
226         UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
227 
228         while((cs=*s++)!=0) {
229             if(cs==lead && *s==trail) {
230                 return (UChar *)(s-1);
231             }
232         }
233         return NULL;
234     } else {
235         /* not a Unicode code point, not findable */
236         return NULL;
237     }
238 }
239 
240 U_CAPI UChar * U_EXPORT2
u_memchr(const UChar * s,UChar c,int32_t count)241 u_memchr(const UChar *s, UChar c, int32_t count) {
242     if(count<=0) {
243         return NULL; /* no string */
244     } else if(U16_IS_SURROGATE(c)) {
245         /* make sure to not find half of a surrogate pair */
246         return u_strFindFirst(s, count, &c, 1);
247     } else {
248         /* trivial search for a BMP code point */
249         const UChar *limit=s+count;
250         do {
251             if(*s==c) {
252                 return (UChar *)s;
253             }
254         } while(++s!=limit);
255         return NULL;
256     }
257 }
258 
259 U_CAPI UChar * U_EXPORT2
u_memchr32(const UChar * s,UChar32 c,int32_t count)260 u_memchr32(const UChar *s, UChar32 c, int32_t count) {
261     if((uint32_t)c<=U_BMP_MAX) {
262         /* find BMP code point */
263         return u_memchr(s, (UChar)c, count);
264     } else if(count<2) {
265         /* too short for a surrogate pair */
266         return NULL;
267     } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
268         /* find supplementary code point as surrogate pair */
269         const UChar *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */
270         UChar lead=U16_LEAD(c), trail=U16_TRAIL(c);
271 
272         do {
273             if(*s==lead && *(s+1)==trail) {
274                 return (UChar *)s;
275             }
276         } while(++s!=limit);
277         return NULL;
278     } else {
279         /* not a Unicode code point, not findable */
280         return NULL;
281     }
282 }
283 
284 /* Backward binary string search functions ---------------------------------- */
285 
286 U_CAPI UChar * U_EXPORT2
u_strFindLast(const UChar * s,int32_t length,const UChar * sub,int32_t subLength)287 u_strFindLast(const UChar *s, int32_t length,
288               const UChar *sub, int32_t subLength) {
289     const UChar *start, *limit, *p, *q, *subLimit;
290     UChar c, cs;
291 
292     if(sub==NULL || subLength<-1) {
293         return (UChar *)s;
294     }
295     if(s==NULL || length<-1) {
296         return NULL;
297     }
298 
299     /*
300      * This implementation is more lazy than the one for u_strFindFirst():
301      * There is no special search code for NUL-terminated strings.
302      * It does not seem to be worth it for searching substrings to
303      * search forward and find all matches like in u_strrchr() and similar.
304      * Therefore, we simply get both string lengths and search backward.
305      *
306      * markus 2002oct23
307      */
308 
309     if(subLength<0) {
310         subLength=u_strlen(sub);
311     }
312     if(subLength==0) {
313         return (UChar *)s;
314     }
315 
316     /* get sub[subLength-1] to search for it fast */
317     subLimit=sub+subLength;
318     cs=*(--subLimit);
319     --subLength;
320 
321     if(subLength==0 && !U16_IS_SURROGATE(cs)) {
322         /* the substring consists of a single, non-surrogate BMP code point */
323         return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length);
324     }
325 
326     if(length<0) {
327         length=u_strlen(s);
328     }
329 
330     /* subLength was decremented above */
331     if(length<=subLength) {
332         return NULL; /* s is shorter than sub */
333     }
334 
335     start=s;
336     limit=s+length;
337 
338     /* the substring must start no later than s+subLength */
339     s+=subLength;
340 
341     while(s!=limit) {
342         c=*(--limit);
343         if(c==cs) {
344             /* found last substring UChar, compare rest */
345             p=limit;
346             q=subLimit;
347             for(;;) {
348                 if(q==sub) {
349                     if(isMatchAtCPBoundary(start, p, limit+1, start+length)) {
350                         return (UChar *)p; /* well-formed match */
351                     } else {
352                         break; /* no match because surrogate pair is split */
353                     }
354                 }
355                 if(*(--p)!=*(--q)) {
356                     break; /* no match */
357                 }
358             }
359         }
360     }
361 
362     /* not found */
363     return NULL;
364 }
365 
366 U_CAPI UChar * U_EXPORT2
u_strrstr(const UChar * s,const UChar * substring)367 u_strrstr(const UChar *s, const UChar *substring) {
368     return u_strFindLast(s, -1, substring, -1);
369 }
370 
371 U_CAPI UChar * U_EXPORT2
u_strrchr(const UChar * s,UChar c)372 u_strrchr(const UChar *s, UChar c) {
373     if(U16_IS_SURROGATE(c)) {
374         /* make sure to not find half of a surrogate pair */
375         return u_strFindLast(s, -1, &c, 1);
376     } else {
377         const UChar *result=NULL;
378         UChar cs;
379 
380         /* trivial search for a BMP code point */
381         for(;;) {
382             if((cs=*s)==c) {
383                 result=s;
384             }
385             if(cs==0) {
386                 return (UChar *)result;
387             }
388             ++s;
389         }
390     }
391 }
392 
393 U_CAPI UChar * U_EXPORT2
u_strrchr32(const UChar * s,UChar32 c)394 u_strrchr32(const UChar *s, UChar32 c) {
395     if((uint32_t)c<=U_BMP_MAX) {
396         /* find BMP code point */
397         return u_strrchr(s, (UChar)c);
398     } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
399         /* find supplementary code point as surrogate pair */
400         const UChar *result=NULL;
401         UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
402 
403         while((cs=*s++)!=0) {
404             if(cs==lead && *s==trail) {
405                 result=s-1;
406             }
407         }
408         return (UChar *)result;
409     } else {
410         /* not a Unicode code point, not findable */
411         return NULL;
412     }
413 }
414 
415 U_CAPI UChar * U_EXPORT2
u_memrchr(const UChar * s,UChar c,int32_t count)416 u_memrchr(const UChar *s, UChar c, int32_t count) {
417     if(count<=0) {
418         return NULL; /* no string */
419     } else if(U16_IS_SURROGATE(c)) {
420         /* make sure to not find half of a surrogate pair */
421         return u_strFindLast(s, count, &c, 1);
422     } else {
423         /* trivial search for a BMP code point */
424         const UChar *limit=s+count;
425         do {
426             if(*(--limit)==c) {
427                 return (UChar *)limit;
428             }
429         } while(s!=limit);
430         return NULL;
431     }
432 }
433 
434 U_CAPI UChar * U_EXPORT2
u_memrchr32(const UChar * s,UChar32 c,int32_t count)435 u_memrchr32(const UChar *s, UChar32 c, int32_t count) {
436     if((uint32_t)c<=U_BMP_MAX) {
437         /* find BMP code point */
438         return u_memrchr(s, (UChar)c, count);
439     } else if(count<2) {
440         /* too short for a surrogate pair */
441         return NULL;
442     } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
443         /* find supplementary code point as surrogate pair */
444         const UChar *limit=s+count-1;
445         UChar lead=U16_LEAD(c), trail=U16_TRAIL(c);
446 
447         do {
448             if(*limit==trail && *(limit-1)==lead) {
449                 return (UChar *)(limit-1);
450             }
451         } while(s!=--limit);
452         return NULL;
453     } else {
454         /* not a Unicode code point, not findable */
455         return NULL;
456     }
457 }
458 
459 /* Tokenization functions --------------------------------------------------- */
460 
461 /*
462  * Match each code point in a string against each code point in the matchSet.
463  * Return the index of the first string code point that
464  * is (polarity==TRUE) or is not (FALSE) contained in the matchSet.
465  * Return -(string length)-1 if there is no such code point.
466  */
467 static int32_t
_matchFromSet(const UChar * string,const UChar * matchSet,UBool polarity)468 _matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) {
469     int32_t matchLen, matchBMPLen, strItr, matchItr;
470     UChar32 stringCh, matchCh;
471     UChar c, c2;
472 
473     /* first part of matchSet contains only BMP code points */
474     matchBMPLen = 0;
475     while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) {
476         ++matchBMPLen;
477     }
478 
479     /* second part of matchSet contains BMP and supplementary code points */
480     matchLen = matchBMPLen;
481     while(matchSet[matchLen] != 0) {
482         ++matchLen;
483     }
484 
485     for(strItr = 0; (c = string[strItr]) != 0;) {
486         ++strItr;
487         if(U16_IS_SINGLE(c)) {
488             if(polarity) {
489                 for(matchItr = 0; matchItr < matchLen; ++matchItr) {
490                     if(c == matchSet[matchItr]) {
491                         return strItr - 1; /* one matches */
492                     }
493                 }
494             } else {
495                 for(matchItr = 0; matchItr < matchLen; ++matchItr) {
496                     if(c == matchSet[matchItr]) {
497                         goto endloop;
498                     }
499                 }
500                 return strItr - 1; /* none matches */
501             }
502         } else {
503             /*
504              * No need to check for string length before U16_IS_TRAIL
505              * because c2 could at worst be the terminating NUL.
506              */
507             if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) {
508                 ++strItr;
509                 stringCh = U16_GET_SUPPLEMENTARY(c, c2);
510             } else {
511                 stringCh = c; /* unpaired trail surrogate */
512             }
513 
514             if(polarity) {
515                 for(matchItr = matchBMPLen; matchItr < matchLen;) {
516                     U16_NEXT(matchSet, matchItr, matchLen, matchCh);
517                     if(stringCh == matchCh) {
518                         return strItr - U16_LENGTH(stringCh); /* one matches */
519                     }
520                 }
521             } else {
522                 for(matchItr = matchBMPLen; matchItr < matchLen;) {
523                     U16_NEXT(matchSet, matchItr, matchLen, matchCh);
524                     if(stringCh == matchCh) {
525                         goto endloop;
526                     }
527                 }
528                 return strItr - U16_LENGTH(stringCh); /* none matches */
529             }
530         }
531 endloop:
532         /* wish C had continue with labels like Java... */;
533     }
534 
535     /* Didn't find it. */
536     return -strItr-1;
537 }
538 
539 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
540 U_CAPI UChar * U_EXPORT2
u_strpbrk(const UChar * string,const UChar * matchSet)541 u_strpbrk(const UChar *string, const UChar *matchSet)
542 {
543     int32_t idx = _matchFromSet(string, matchSet, TRUE);
544     if(idx >= 0) {
545         return (UChar *)string + idx;
546     } else {
547         return NULL;
548     }
549 }
550 
551 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
552 U_CAPI int32_t U_EXPORT2
u_strcspn(const UChar * string,const UChar * matchSet)553 u_strcspn(const UChar *string, const UChar *matchSet)
554 {
555     int32_t idx = _matchFromSet(string, matchSet, TRUE);
556     if(idx >= 0) {
557         return idx;
558     } else {
559         return -idx - 1; /* == u_strlen(string) */
560     }
561 }
562 
563 /* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
564 U_CAPI int32_t U_EXPORT2
u_strspn(const UChar * string,const UChar * matchSet)565 u_strspn(const UChar *string, const UChar *matchSet)
566 {
567     int32_t idx = _matchFromSet(string, matchSet, FALSE);
568     if(idx >= 0) {
569         return idx;
570     } else {
571         return -idx - 1; /* == u_strlen(string) */
572     }
573 }
574 
575 /* ----- Text manipulation functions --- */
576 
577 U_CAPI UChar* U_EXPORT2
u_strtok_r(UChar * src,const UChar * delim,UChar ** saveState)578 u_strtok_r(UChar    *src,
579      const UChar    *delim,
580            UChar   **saveState)
581 {
582     UChar *tokSource;
583     UChar *nextToken;
584     uint32_t nonDelimIdx;
585 
586     /* If saveState is NULL, the user messed up. */
587     if (src != NULL) {
588         tokSource = src;
589         *saveState = src; /* Set to "src" in case there are no delimiters */
590     }
591     else if (*saveState) {
592         tokSource = *saveState;
593     }
594     else {
595         /* src == NULL && *saveState == NULL */
596         /* This shouldn't happen. We already finished tokenizing. */
597         return NULL;
598     }
599 
600     /* Skip initial delimiters */
601     nonDelimIdx = u_strspn(tokSource, delim);
602     tokSource = &tokSource[nonDelimIdx];
603 
604     if (*tokSource) {
605         nextToken = u_strpbrk(tokSource, delim);
606         if (nextToken != NULL) {
607             /* Create a token */
608             *(nextToken++) = 0;
609             *saveState = nextToken;
610             return tokSource;
611         }
612         else if (*saveState) {
613             /* Return the last token */
614             *saveState = NULL;
615             return tokSource;
616         }
617     }
618     else {
619         /* No tokens were found. Only delimiters were left. */
620         *saveState = NULL;
621     }
622     return NULL;
623 }
624 
625 /* Miscellaneous functions -------------------------------------------------- */
626 
627 U_CAPI UChar* U_EXPORT2
u_strcat(UChar * dst,const UChar * src)628 u_strcat(UChar     *dst,
629     const UChar     *src)
630 {
631     UChar *anchor = dst;            /* save a pointer to start of dst */
632 
633     while(*dst != 0) {              /* To end of first string          */
634         ++dst;
635     }
636     while((*(dst++) = *(src++)) != 0) {     /* copy string 2 over              */
637     }
638 
639     return anchor;
640 }
641 
642 U_CAPI UChar*  U_EXPORT2
u_strncat(UChar * dst,const UChar * src,int32_t n)643 u_strncat(UChar     *dst,
644      const UChar     *src,
645      int32_t     n )
646 {
647     if(n > 0) {
648         UChar *anchor = dst;            /* save a pointer to start of dst */
649 
650         while(*dst != 0) {              /* To end of first string          */
651             ++dst;
652         }
653         while((*dst = *src) != 0) {     /* copy string 2 over              */
654             ++dst;
655             if(--n == 0) {
656                 *dst = 0;
657                 break;
658             }
659             ++src;
660         }
661 
662         return anchor;
663     } else {
664         return dst;
665     }
666 }
667 
668 /* ----- Text property functions --- */
669 
670 U_CAPI int32_t   U_EXPORT2
u_strcmp(const UChar * s1,const UChar * s2)671 u_strcmp(const UChar *s1,
672     const UChar *s2)
673 {
674     UChar  c1, c2;
675 
676     for(;;) {
677         c1=*s1++;
678         c2=*s2++;
679         if (c1 != c2 || c1 == 0) {
680             break;
681         }
682     }
683     return (int32_t)c1 - (int32_t)c2;
684 }
685 
686 U_CFUNC int32_t U_EXPORT2
uprv_strCompare(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,UBool strncmpStyle,UBool codePointOrder)687 uprv_strCompare(const UChar *s1, int32_t length1,
688                 const UChar *s2, int32_t length2,
689                 UBool strncmpStyle, UBool codePointOrder) {
690     const UChar *start1, *start2, *limit1, *limit2;
691     UChar c1, c2;
692 
693     /* setup for fix-up */
694     start1=s1;
695     start2=s2;
696 
697     /* compare identical prefixes - they do not need to be fixed up */
698     if(length1<0 && length2<0) {
699         /* strcmp style, both NUL-terminated */
700         if(s1==s2) {
701             return 0;
702         }
703 
704         for(;;) {
705             c1=*s1;
706             c2=*s2;
707             if(c1!=c2) {
708                 break;
709             }
710             if(c1==0) {
711                 return 0;
712             }
713             ++s1;
714             ++s2;
715         }
716 
717         /* setup for fix-up */
718         limit1=limit2=NULL;
719     } else if(strncmpStyle) {
720         /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */
721         if(s1==s2) {
722             return 0;
723         }
724 
725         limit1=start1+length1;
726 
727         for(;;) {
728             /* both lengths are same, check only one limit */
729             if(s1==limit1) {
730                 return 0;
731             }
732 
733             c1=*s1;
734             c2=*s2;
735             if(c1!=c2) {
736                 break;
737             }
738             if(c1==0) {
739                 return 0;
740             }
741             ++s1;
742             ++s2;
743         }
744 
745         /* setup for fix-up */
746         limit2=start2+length1; /* use length1 here, too, to enforce assumption */
747     } else {
748         /* memcmp/UnicodeString style, both length-specified */
749         int32_t lengthResult;
750 
751         if(length1<0) {
752             length1=u_strlen(s1);
753         }
754         if(length2<0) {
755             length2=u_strlen(s2);
756         }
757 
758         /* limit1=start1+min(lenght1, length2) */
759         if(length1<length2) {
760             lengthResult=-1;
761             limit1=start1+length1;
762         } else if(length1==length2) {
763             lengthResult=0;
764             limit1=start1+length1;
765         } else /* length1>length2 */ {
766             lengthResult=1;
767             limit1=start1+length2;
768         }
769 
770         if(s1==s2) {
771             return lengthResult;
772         }
773 
774         for(;;) {
775             /* check pseudo-limit */
776             if(s1==limit1) {
777                 return lengthResult;
778             }
779 
780             c1=*s1;
781             c2=*s2;
782             if(c1!=c2) {
783                 break;
784             }
785             ++s1;
786             ++s2;
787         }
788 
789         /* setup for fix-up */
790         limit1=start1+length1;
791         limit2=start2+length2;
792     }
793 
794     /* if both values are in or above the surrogate range, fix them up */
795     if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
796         /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
797         if(
798             (c1<=0xdbff && (s1+1)!=limit1 && U16_IS_TRAIL(*(s1+1))) ||
799             (U16_IS_TRAIL(c1) && start1!=s1 && U16_IS_LEAD(*(s1-1)))
800         ) {
801             /* part of a surrogate pair, leave >=d800 */
802         } else {
803             /* BMP code point - may be surrogate code point - make <d800 */
804             c1-=0x2800;
805         }
806 
807         if(
808             (c2<=0xdbff && (s2+1)!=limit2 && U16_IS_TRAIL(*(s2+1))) ||
809             (U16_IS_TRAIL(c2) && start2!=s2 && U16_IS_LEAD(*(s2-1)))
810         ) {
811             /* part of a surrogate pair, leave >=d800 */
812         } else {
813             /* BMP code point - may be surrogate code point - make <d800 */
814             c2-=0x2800;
815         }
816     }
817 
818     /* now c1 and c2 are in the requested (code unit or code point) order */
819     return (int32_t)c1-(int32_t)c2;
820 }
821 
822 /*
823  * Compare two strings as presented by UCharIterators.
824  * Use code unit or code point order.
825  * When the function returns, it is undefined where the iterators
826  * have stopped.
827  */
828 U_CAPI int32_t U_EXPORT2
u_strCompareIter(UCharIterator * iter1,UCharIterator * iter2,UBool codePointOrder)829 u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) {
830     UChar32 c1, c2;
831 
832     /* argument checking */
833     if(iter1==NULL || iter2==NULL) {
834         return 0; /* bad arguments */
835     }
836     if(iter1==iter2) {
837         return 0; /* identical iterators */
838     }
839 
840     /* reset iterators to start? */
841     iter1->move(iter1, 0, UITER_START);
842     iter2->move(iter2, 0, UITER_START);
843 
844     /* compare identical prefixes - they do not need to be fixed up */
845     for(;;) {
846         c1=iter1->next(iter1);
847         c2=iter2->next(iter2);
848         if(c1!=c2) {
849             break;
850         }
851         if(c1==-1) {
852             return 0;
853         }
854     }
855 
856     /* if both values are in or above the surrogate range, fix them up */
857     if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
858         /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
859         if(
860             (c1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) ||
861             (U16_IS_TRAIL(c1) && (iter1->previous(iter1), U16_IS_LEAD(iter1->previous(iter1))))
862         ) {
863             /* part of a surrogate pair, leave >=d800 */
864         } else {
865             /* BMP code point - may be surrogate code point - make <d800 */
866             c1-=0x2800;
867         }
868 
869         if(
870             (c2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2))) ||
871             (U16_IS_TRAIL(c2) && (iter2->previous(iter2), U16_IS_LEAD(iter2->previous(iter2))))
872         ) {
873             /* part of a surrogate pair, leave >=d800 */
874         } else {
875             /* BMP code point - may be surrogate code point - make <d800 */
876             c2-=0x2800;
877         }
878     }
879 
880     /* now c1 and c2 are in the requested (code unit or code point) order */
881     return (int32_t)c1-(int32_t)c2;
882 }
883 
884 #if 0
885 /*
886  * u_strCompareIter() does not leave the iterators _on_ the different units.
887  * This is possible but would cost a few extra indirect function calls to back
888  * up if the last unit (c1 or c2 respectively) was >=0.
889  *
890  * Consistently leaving them _behind_ the different units is not an option
891  * because the current "unit" is the end of the string if that is reached,
892  * and in such a case the iterator does not move.
893  * For example, when comparing "ab" with "abc", both iterators rest _on_ the end
894  * of their strings. Calling previous() on each does not move them to where
895  * the comparison fails.
896  *
897  * So the simplest semantics is to not define where the iterators end up.
898  *
899  * The following fragment is part of what would need to be done for backing up.
900  */
901 void fragment {
902         /* iff a surrogate is part of a surrogate pair, leave >=d800 */
903         if(c1<=0xdbff) {
904             if(!U16_IS_TRAIL(iter1->current(iter1))) {
905                 /* lead surrogate code point - make <d800 */
906                 c1-=0x2800;
907             }
908         } else if(c1<=0xdfff) {
909             int32_t idx=iter1->getIndex(iter1, UITER_CURRENT);
910             iter1->previous(iter1); /* ==c1 */
911             if(!U16_IS_LEAD(iter1->previous(iter1))) {
912                 /* trail surrogate code point - make <d800 */
913                 c1-=0x2800;
914             }
915             /* go back to behind where the difference is */
916             iter1->move(iter1, idx, UITER_ZERO);
917         } else /* 0xe000<=c1<=0xffff */ {
918             /* BMP code point - make <d800 */
919             c1-=0x2800;
920         }
921 }
922 #endif
923 
924 U_CAPI int32_t U_EXPORT2
u_strCompare(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,UBool codePointOrder)925 u_strCompare(const UChar *s1, int32_t length1,
926              const UChar *s2, int32_t length2,
927              UBool codePointOrder) {
928     /* argument checking */
929     if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
930         return 0;
931     }
932     return uprv_strCompare(s1, length1, s2, length2, FALSE, codePointOrder);
933 }
934 
935 /* String compare in code point order - u_strcmp() compares in code unit order. */
936 U_CAPI int32_t U_EXPORT2
u_strcmpCodePointOrder(const UChar * s1,const UChar * s2)937 u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) {
938     return uprv_strCompare(s1, -1, s2, -1, FALSE, TRUE);
939 }
940 
941 U_CAPI int32_t   U_EXPORT2
u_strncmp(const UChar * s1,const UChar * s2,int32_t n)942 u_strncmp(const UChar     *s1,
943      const UChar     *s2,
944      int32_t     n)
945 {
946     if(n > 0) {
947         int32_t rc;
948         for(;;) {
949             rc = (int32_t)*s1 - (int32_t)*s2;
950             if(rc != 0 || *s1 == 0 || --n == 0) {
951                 return rc;
952             }
953             ++s1;
954             ++s2;
955         }
956     } else {
957         return 0;
958     }
959 }
960 
961 U_CAPI int32_t U_EXPORT2
u_strncmpCodePointOrder(const UChar * s1,const UChar * s2,int32_t n)962 u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) {
963     return uprv_strCompare(s1, n, s2, n, TRUE, TRUE);
964 }
965 
966 U_CAPI UChar* U_EXPORT2
u_strcpy(UChar * dst,const UChar * src)967 u_strcpy(UChar     *dst,
968     const UChar     *src)
969 {
970     UChar *anchor = dst;            /* save a pointer to start of dst */
971 
972     while((*(dst++) = *(src++)) != 0) {     /* copy string 2 over              */
973     }
974 
975     return anchor;
976 }
977 
978 U_CAPI UChar*  U_EXPORT2
u_strncpy(UChar * dst,const UChar * src,int32_t n)979 u_strncpy(UChar     *dst,
980      const UChar     *src,
981      int32_t     n)
982 {
983     UChar *anchor = dst;            /* save a pointer to start of dst */
984 
985     /* copy string 2 over */
986     while(n > 0 && (*(dst++) = *(src++)) != 0) {
987         --n;
988     }
989 
990     return anchor;
991 }
992 
993 U_CAPI int32_t   U_EXPORT2
u_strlen(const UChar * s)994 u_strlen(const UChar *s)
995 {
996 #if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
997     return (int32_t)uprv_wcslen((const wchar_t *)s);
998 #else
999     const UChar *t = s;
1000     while(*t != 0) {
1001       ++t;
1002     }
1003     return t - s;
1004 #endif
1005 }
1006 
1007 U_CAPI int32_t U_EXPORT2
u_countChar32(const UChar * s,int32_t length)1008 u_countChar32(const UChar *s, int32_t length) {
1009     int32_t count;
1010 
1011     if(s==NULL || length<-1) {
1012         return 0;
1013     }
1014 
1015     count=0;
1016     if(length>=0) {
1017         while(length>0) {
1018             ++count;
1019             if(U16_IS_LEAD(*s) && length>=2 && U16_IS_TRAIL(*(s+1))) {
1020                 s+=2;
1021                 length-=2;
1022             } else {
1023                 ++s;
1024                 --length;
1025             }
1026         }
1027     } else /* length==-1 */ {
1028         UChar c;
1029 
1030         for(;;) {
1031             if((c=*s++)==0) {
1032                 break;
1033             }
1034             ++count;
1035 
1036             /*
1037              * sufficient to look ahead one because of UTF-16;
1038              * safe to look ahead one because at worst that would be the terminating NUL
1039              */
1040             if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
1041                 ++s;
1042             }
1043         }
1044     }
1045     return count;
1046 }
1047 
1048 U_CAPI UBool U_EXPORT2
u_strHasMoreChar32Than(const UChar * s,int32_t length,int32_t number)1049 u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) {
1050 
1051     if(number<0) {
1052         return TRUE;
1053     }
1054     if(s==NULL || length<-1) {
1055         return FALSE;
1056     }
1057 
1058     if(length==-1) {
1059         /* s is NUL-terminated */
1060         UChar c;
1061 
1062         /* count code points until they exceed */
1063         for(;;) {
1064             if((c=*s++)==0) {
1065                 return FALSE;
1066             }
1067             if(number==0) {
1068                 return TRUE;
1069             }
1070             if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
1071                 ++s;
1072             }
1073             --number;
1074         }
1075     } else {
1076         /* length>=0 known */
1077         const UChar *limit;
1078         int32_t maxSupplementary;
1079 
1080         /* s contains at least (length+1)/2 code points: <=2 UChars per cp */
1081         if(((length+1)/2)>number) {
1082             return TRUE;
1083         }
1084 
1085         /* check if s does not even contain enough UChars */
1086         maxSupplementary=length-number;
1087         if(maxSupplementary<=0) {
1088             return FALSE;
1089         }
1090         /* there are maxSupplementary=length-number more UChars than asked-for code points */
1091 
1092         /*
1093          * count code points until they exceed and also check that there are
1094          * no more than maxSupplementary supplementary code points (UChar pairs)
1095          */
1096         limit=s+length;
1097         for(;;) {
1098             if(s==limit) {
1099                 return FALSE;
1100             }
1101             if(number==0) {
1102                 return TRUE;
1103             }
1104             if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) {
1105                 ++s;
1106                 if(--maxSupplementary<=0) {
1107                     /* too many pairs - too few code points */
1108                     return FALSE;
1109                 }
1110             }
1111             --number;
1112         }
1113     }
1114 }
1115 
1116 U_CAPI UChar * U_EXPORT2
u_memcpy(UChar * dest,const UChar * src,int32_t count)1117 u_memcpy(UChar *dest, const UChar *src, int32_t count) {
1118     if(count > 0) {
1119         uprv_memcpy(dest, src, (size_t)count*U_SIZEOF_UCHAR);
1120     }
1121     return dest;
1122 }
1123 
1124 U_CAPI UChar * U_EXPORT2
u_memmove(UChar * dest,const UChar * src,int32_t count)1125 u_memmove(UChar *dest, const UChar *src, int32_t count) {
1126     if(count > 0) {
1127         uprv_memmove(dest, src, (size_t)count*U_SIZEOF_UCHAR);
1128     }
1129     return dest;
1130 }
1131 
1132 U_CAPI UChar * U_EXPORT2
u_memset(UChar * dest,UChar c,int32_t count)1133 u_memset(UChar *dest, UChar c, int32_t count) {
1134     if(count > 0) {
1135         UChar *ptr = dest;
1136         UChar *limit = dest + count;
1137 
1138         while (ptr < limit) {
1139             *(ptr++) = c;
1140         }
1141     }
1142     return dest;
1143 }
1144 
1145 U_CAPI int32_t U_EXPORT2
u_memcmp(const UChar * buf1,const UChar * buf2,int32_t count)1146 u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) {
1147     if(count > 0) {
1148         const UChar *limit = buf1 + count;
1149         int32_t result;
1150 
1151         while (buf1 < limit) {
1152             result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2;
1153             if (result != 0) {
1154                 return result;
1155             }
1156             buf1++;
1157             buf2++;
1158         }
1159     }
1160     return 0;
1161 }
1162 
1163 U_CAPI int32_t U_EXPORT2
u_memcmpCodePointOrder(const UChar * s1,const UChar * s2,int32_t count)1164 u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) {
1165     return uprv_strCompare(s1, count, s2, count, FALSE, TRUE);
1166 }
1167 
1168 /* u_unescape & support fns ------------------------------------------------- */
1169 
1170 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
1171 static const UChar UNESCAPE_MAP[] = {
1172     /*"   0x22, 0x22 */
1173     /*'   0x27, 0x27 */
1174     /*?   0x3F, 0x3F */
1175     /*\   0x5C, 0x5C */
1176     /*a*/ 0x61, 0x07,
1177     /*b*/ 0x62, 0x08,
1178     /*e*/ 0x65, 0x1b,
1179     /*f*/ 0x66, 0x0c,
1180     /*n*/ 0x6E, 0x0a,
1181     /*r*/ 0x72, 0x0d,
1182     /*t*/ 0x74, 0x09,
1183     /*v*/ 0x76, 0x0b
1184 };
1185 enum { UNESCAPE_MAP_LENGTH = UPRV_LENGTHOF(UNESCAPE_MAP) };
1186 
1187 /* Convert one octal digit to a numeric value 0..7, or -1 on failure */
_digit8(UChar c)1188 static int8_t _digit8(UChar c) {
1189     if (c >= 0x0030 && c <= 0x0037) {
1190         return (int8_t)(c - 0x0030);
1191     }
1192     return -1;
1193 }
1194 
1195 /* Convert one hex digit to a numeric value 0..F, or -1 on failure */
_digit16(UChar c)1196 static int8_t _digit16(UChar c) {
1197     if (c >= 0x0030 && c <= 0x0039) {
1198         return (int8_t)(c - 0x0030);
1199     }
1200     if (c >= 0x0041 && c <= 0x0046) {
1201         return (int8_t)(c - (0x0041 - 10));
1202     }
1203     if (c >= 0x0061 && c <= 0x0066) {
1204         return (int8_t)(c - (0x0061 - 10));
1205     }
1206     return -1;
1207 }
1208 
1209 /* Parse a single escape sequence.  Although this method deals in
1210  * UChars, it does not use C++ or UnicodeString.  This allows it to
1211  * be used from C contexts. */
1212 U_CAPI UChar32 U_EXPORT2
u_unescapeAt(UNESCAPE_CHAR_AT charAt,int32_t * offset,int32_t length,void * context)1213 u_unescapeAt(UNESCAPE_CHAR_AT charAt,
1214              int32_t *offset,
1215              int32_t length,
1216              void *context) {
1217 
1218     int32_t start = *offset;
1219     UChar c;
1220     UChar32 result = 0;
1221     int8_t n = 0;
1222     int8_t minDig = 0;
1223     int8_t maxDig = 0;
1224     int8_t bitsPerDigit = 4;
1225     int8_t dig;
1226     int32_t i;
1227     UBool braces = FALSE;
1228 
1229     /* Check that offset is in range */
1230     if (*offset < 0 || *offset >= length) {
1231         goto err;
1232     }
1233 
1234     /* Fetch first UChar after '\\' */
1235     c = charAt((*offset)++, context);
1236 
1237     /* Convert hexadecimal and octal escapes */
1238     switch (c) {
1239     case 0x0075 /*'u'*/:
1240         minDig = maxDig = 4;
1241         break;
1242     case 0x0055 /*'U'*/:
1243         minDig = maxDig = 8;
1244         break;
1245     case 0x0078 /*'x'*/:
1246         minDig = 1;
1247         if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) {
1248             ++(*offset);
1249             braces = TRUE;
1250             maxDig = 8;
1251         } else {
1252             maxDig = 2;
1253         }
1254         break;
1255     default:
1256         dig = _digit8(c);
1257         if (dig >= 0) {
1258             minDig = 1;
1259             maxDig = 3;
1260             n = 1; /* Already have first octal digit */
1261             bitsPerDigit = 3;
1262             result = dig;
1263         }
1264         break;
1265     }
1266     if (minDig != 0) {
1267         while (*offset < length && n < maxDig) {
1268             c = charAt(*offset, context);
1269             dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c));
1270             if (dig < 0) {
1271                 break;
1272             }
1273             result = (result << bitsPerDigit) | dig;
1274             ++(*offset);
1275             ++n;
1276         }
1277         if (n < minDig) {
1278             goto err;
1279         }
1280         if (braces) {
1281             if (c != 0x7D /*}*/) {
1282                 goto err;
1283             }
1284             ++(*offset);
1285         }
1286         if (result < 0 || result >= 0x110000) {
1287             goto err;
1288         }
1289         /* If an escape sequence specifies a lead surrogate, see if
1290          * there is a trail surrogate after it, either as an escape or
1291          * as a literal.  If so, join them up into a supplementary.
1292          */
1293         if (*offset < length && U16_IS_LEAD(result)) {
1294             int32_t ahead = *offset + 1;
1295             c = charAt(*offset, context);
1296             if (c == 0x5C /*'\\'*/ && ahead < length) {
1297                 c = (UChar) u_unescapeAt(charAt, &ahead, length, context);
1298             }
1299             if (U16_IS_TRAIL(c)) {
1300                 *offset = ahead;
1301                 result = U16_GET_SUPPLEMENTARY(result, c);
1302             }
1303         }
1304         return result;
1305     }
1306 
1307     /* Convert C-style escapes in table */
1308     for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
1309         if (c == UNESCAPE_MAP[i]) {
1310             return UNESCAPE_MAP[i+1];
1311         } else if (c < UNESCAPE_MAP[i]) {
1312             break;
1313         }
1314     }
1315 
1316     /* Map \cX to control-X: X & 0x1F */
1317     if (c == 0x0063 /*'c'*/ && *offset < length) {
1318         c = charAt((*offset)++, context);
1319         if (U16_IS_LEAD(c) && *offset < length) {
1320             UChar c2 = charAt(*offset, context);
1321             if (U16_IS_TRAIL(c2)) {
1322                 ++(*offset);
1323                 c = (UChar) U16_GET_SUPPLEMENTARY(c, c2); /* [sic] */
1324             }
1325         }
1326         return 0x1F & c;
1327     }
1328 
1329     /* If no special forms are recognized, then consider
1330      * the backslash to generically escape the next character.
1331      * Deal with surrogate pairs. */
1332     if (U16_IS_LEAD(c) && *offset < length) {
1333         UChar c2 = charAt(*offset, context);
1334         if (U16_IS_TRAIL(c2)) {
1335             ++(*offset);
1336             return U16_GET_SUPPLEMENTARY(c, c2);
1337         }
1338     }
1339     return c;
1340 
1341  err:
1342     /* Invalid escape sequence */
1343     *offset = start; /* Reset to initial value */
1344     return (UChar32)0xFFFFFFFF;
1345 }
1346 
1347 /* u_unescapeAt() callback to return a UChar from a char* */
1348 static UChar U_CALLCONV
_charPtr_charAt(int32_t offset,void * context)1349 _charPtr_charAt(int32_t offset, void *context) {
1350     UChar c16;
1351     /* It would be more efficient to access the invariant tables
1352      * directly but there is no API for that. */
1353     u_charsToUChars(((char*) context) + offset, &c16, 1);
1354     return c16;
1355 }
1356 
1357 /* Append an escape-free segment of the text; used by u_unescape() */
_appendUChars(UChar * dest,int32_t destCapacity,const char * src,int32_t srcLen)1358 static void _appendUChars(UChar *dest, int32_t destCapacity,
1359                           const char *src, int32_t srcLen) {
1360     if (destCapacity < 0) {
1361         destCapacity = 0;
1362     }
1363     if (srcLen > destCapacity) {
1364         srcLen = destCapacity;
1365     }
1366     u_charsToUChars(src, dest, srcLen);
1367 }
1368 
1369 /* Do an invariant conversion of char* -> UChar*, with escape parsing */
1370 U_CAPI int32_t U_EXPORT2
u_unescape(const char * src,UChar * dest,int32_t destCapacity)1371 u_unescape(const char *src, UChar *dest, int32_t destCapacity) {
1372     const char *segment = src;
1373     int32_t i = 0;
1374     char c;
1375 
1376     while ((c=*src) != 0) {
1377         /* '\\' intentionally written as compiler-specific
1378          * character constant to correspond to compiler-specific
1379          * char* constants. */
1380         if (c == '\\') {
1381             int32_t lenParsed = 0;
1382             UChar32 c32;
1383             if (src != segment) {
1384                 if (dest != NULL) {
1385                     _appendUChars(dest + i, destCapacity - i,
1386                                   segment, (int32_t)(src - segment));
1387                 }
1388                 i += (int32_t)(src - segment);
1389             }
1390             ++src; /* advance past '\\' */
1391             c32 = (UChar32)u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), (void*)src);
1392             if (lenParsed == 0) {
1393                 goto err;
1394             }
1395             src += lenParsed; /* advance past escape seq. */
1396             if (dest != NULL && U16_LENGTH(c32) <= (destCapacity - i)) {
1397                 U16_APPEND_UNSAFE(dest, i, c32);
1398             } else {
1399                 i += U16_LENGTH(c32);
1400             }
1401             segment = src;
1402         } else {
1403             ++src;
1404         }
1405     }
1406     if (src != segment) {
1407         if (dest != NULL) {
1408             _appendUChars(dest + i, destCapacity - i,
1409                           segment, (int32_t)(src - segment));
1410         }
1411         i += (int32_t)(src - segment);
1412     }
1413     if (dest != NULL && i < destCapacity) {
1414         dest[i] = 0;
1415     }
1416     return i;
1417 
1418  err:
1419     if (dest != NULL && destCapacity > 0) {
1420         *dest = 0;
1421     }
1422     return 0;
1423 }
1424 
1425 /* NUL-termination of strings ----------------------------------------------- */
1426 
1427 /**
1428  * NUL-terminate a string no matter what its type.
1429  * Set warning and error codes accordingly.
1430  */
1431 #define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode)      \
1432     if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) {                    \
1433         /* not a public function, so no complete argument checking */   \
1434                                                                         \
1435         if(length<0) {                                                  \
1436             /* assume that the caller handles this */                   \
1437         } else if(length<destCapacity) {                                \
1438             /* NUL-terminate the string, the NUL fits */                \
1439             dest[length]=0;                                             \
1440             /* unset the not-terminated warning but leave all others */ \
1441             if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) {          \
1442                 *pErrorCode=U_ZERO_ERROR;                               \
1443             }                                                           \
1444         } else if(length==destCapacity) {                               \
1445             /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
1446             *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;                \
1447         } else /* length>destCapacity */ {                              \
1448             /* even the string itself did not fit - set an error code */ \
1449             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;                        \
1450         }                                                               \
1451     }
1452 
1453 U_CAPI int32_t U_EXPORT2
u_terminateUChars(UChar * dest,int32_t destCapacity,int32_t length,UErrorCode * pErrorCode)1454 u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1455     __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1456     return length;
1457 }
1458 
1459 U_CAPI int32_t U_EXPORT2
u_terminateChars(char * dest,int32_t destCapacity,int32_t length,UErrorCode * pErrorCode)1460 u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1461     __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1462     return length;
1463 }
1464 
1465 U_CAPI int32_t U_EXPORT2
u_terminateUChar32s(UChar32 * dest,int32_t destCapacity,int32_t length,UErrorCode * pErrorCode)1466 u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1467     __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1468     return length;
1469 }
1470 
1471 U_CAPI int32_t U_EXPORT2
u_terminateWChars(wchar_t * dest,int32_t destCapacity,int32_t length,UErrorCode * pErrorCode)1472 u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1473     __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1474     return length;
1475 }
1476 
1477 // Compute the hash code for a string -------------------------------------- ***
1478 
1479 // Moved here from uhash.c so that UnicodeString::hashCode() does not depend
1480 // on UHashtable code.
1481 
1482 /*
1483   Compute the hash by iterating sparsely over about 32 (up to 63)
1484   characters spaced evenly through the string.  For each character,
1485   multiply the previous hash value by a prime number and add the new
1486   character in, like a linear congruential random number generator,
1487   producing a pseudorandom deterministic value well distributed over
1488   the output range. [LIU]
1489 */
1490 
1491 #define STRING_HASH(TYPE, STR, STRLEN, DEREF) \
1492     uint32_t hash = 0;                        \
1493     const TYPE *p = (const TYPE*) STR;        \
1494     if (p != NULL) {                          \
1495         int32_t len = (int32_t)(STRLEN);      \
1496         int32_t inc = ((len - 32) / 32) + 1;  \
1497         const TYPE *limit = p + len;          \
1498         while (p<limit) {                     \
1499             hash = (hash * 37) + DEREF;       \
1500             p += inc;                         \
1501         }                                     \
1502     }                                         \
1503     return static_cast<int32_t>(hash)
1504 
1505 /* Used by UnicodeString to compute its hashcode - Not public API. */
1506 U_CAPI int32_t U_EXPORT2
ustr_hashUCharsN(const UChar * str,int32_t length)1507 ustr_hashUCharsN(const UChar *str, int32_t length) {
1508     STRING_HASH(UChar, str, length, *p);
1509 }
1510 
1511 U_CAPI int32_t U_EXPORT2
ustr_hashCharsN(const char * str,int32_t length)1512 ustr_hashCharsN(const char *str, int32_t length) {
1513     STRING_HASH(uint8_t, str, length, *p);
1514 }
1515 
1516 U_CAPI int32_t U_EXPORT2
ustr_hashICharsN(const char * str,int32_t length)1517 ustr_hashICharsN(const char *str, int32_t length) {
1518     STRING_HASH(char, str, length, (uint8_t)uprv_tolower(*p));
1519 }
1520