1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 * Copyright (C) 1998-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 ******************************************************************************
10 *
11 * File ustring.cpp
12 *
13 * Modification History:
14 *
15 * Date Name Description
16 * 12/07/98 bertrand Creation.
17 ******************************************************************************
18 */
19
20 #include "unicode/utypes.h"
21 #include "unicode/putil.h"
22 #include "unicode/ustring.h"
23 #include "unicode/utf16.h"
24 #include "cstring.h"
25 #include "cwchar.h"
26 #include "cmemory.h"
27 #include "ustr_imp.h"
28
29 /* ANSI string.h - style functions ------------------------------------------ */
30
31 /* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */
32 #define U_BMP_MAX 0xffff
33
34 /* Forward binary string search functions ----------------------------------- */
35
36 /*
37 * Test if a substring match inside a string is at code point boundaries.
38 * All pointers refer to the same buffer.
39 * The limit pointer may be NULL, all others must be real pointers.
40 */
41 static inline UBool
isMatchAtCPBoundary(const UChar * start,const UChar * match,const UChar * matchLimit,const UChar * limit)42 isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) {
43 if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) {
44 /* the leading edge of the match is in the middle of a surrogate pair */
45 return FALSE;
46 }
47 if(U16_IS_LEAD(*(matchLimit-1)) && match!=limit && U16_IS_TRAIL(*matchLimit)) {
48 /* the trailing edge of the match is in the middle of a surrogate pair */
49 return FALSE;
50 }
51 return TRUE;
52 }
53
54 U_CAPI UChar * U_EXPORT2
u_strFindFirst(const UChar * s,int32_t length,const UChar * sub,int32_t subLength)55 u_strFindFirst(const UChar *s, int32_t length,
56 const UChar *sub, int32_t subLength) {
57 const UChar *start, *p, *q, *subLimit;
58 UChar c, cs, cq;
59
60 if(sub==NULL || subLength<-1) {
61 return (UChar *)s;
62 }
63 if(s==NULL || length<-1) {
64 return NULL;
65 }
66
67 start=s;
68
69 if(length<0 && subLength<0) {
70 /* both strings are NUL-terminated */
71 if((cs=*sub++)==0) {
72 return (UChar *)s;
73 }
74 if(*sub==0 && !U16_IS_SURROGATE(cs)) {
75 /* the substring consists of a single, non-surrogate BMP code point */
76 return u_strchr(s, cs);
77 }
78
79 while((c=*s++)!=0) {
80 if(c==cs) {
81 /* found first substring UChar, compare rest */
82 p=s;
83 q=sub;
84 for(;;) {
85 if((cq=*q)==0) {
86 if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
87 return (UChar *)(s-1); /* well-formed match */
88 } else {
89 break; /* no match because surrogate pair is split */
90 }
91 }
92 if((c=*p)==0) {
93 return NULL; /* no match, and none possible after s */
94 }
95 if(c!=cq) {
96 break; /* no match */
97 }
98 ++p;
99 ++q;
100 }
101 }
102 }
103
104 /* not found */
105 return NULL;
106 }
107
108 if(subLength<0) {
109 subLength=u_strlen(sub);
110 }
111 if(subLength==0) {
112 return (UChar *)s;
113 }
114
115 /* get sub[0] to search for it fast */
116 cs=*sub++;
117 --subLength;
118 subLimit=sub+subLength;
119
120 if(subLength==0 && !U16_IS_SURROGATE(cs)) {
121 /* the substring consists of a single, non-surrogate BMP code point */
122 return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length);
123 }
124
125 if(length<0) {
126 /* s is NUL-terminated */
127 while((c=*s++)!=0) {
128 if(c==cs) {
129 /* found first substring UChar, compare rest */
130 p=s;
131 q=sub;
132 for(;;) {
133 if(q==subLimit) {
134 if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
135 return (UChar *)(s-1); /* well-formed match */
136 } else {
137 break; /* no match because surrogate pair is split */
138 }
139 }
140 if((c=*p)==0) {
141 return NULL; /* no match, and none possible after s */
142 }
143 if(c!=*q) {
144 break; /* no match */
145 }
146 ++p;
147 ++q;
148 }
149 }
150 }
151 } else {
152 const UChar *limit, *preLimit;
153
154 /* subLength was decremented above */
155 if(length<=subLength) {
156 return NULL; /* s is shorter than sub */
157 }
158
159 limit=s+length;
160
161 /* the substring must start before preLimit */
162 preLimit=limit-subLength;
163
164 while(s!=preLimit) {
165 c=*s++;
166 if(c==cs) {
167 /* found first substring UChar, compare rest */
168 p=s;
169 q=sub;
170 for(;;) {
171 if(q==subLimit) {
172 if(isMatchAtCPBoundary(start, s-1, p, limit)) {
173 return (UChar *)(s-1); /* well-formed match */
174 } else {
175 break; /* no match because surrogate pair is split */
176 }
177 }
178 if(*p!=*q) {
179 break; /* no match */
180 }
181 ++p;
182 ++q;
183 }
184 }
185 }
186 }
187
188 /* not found */
189 return NULL;
190 }
191
192 U_CAPI UChar * U_EXPORT2
u_strstr(const UChar * s,const UChar * substring)193 u_strstr(const UChar *s, const UChar *substring) {
194 return u_strFindFirst(s, -1, substring, -1);
195 }
196
197 U_CAPI UChar * U_EXPORT2
u_strchr(const UChar * s,UChar c)198 u_strchr(const UChar *s, UChar c) {
199 if(U16_IS_SURROGATE(c)) {
200 /* make sure to not find half of a surrogate pair */
201 return u_strFindFirst(s, -1, &c, 1);
202 } else {
203 UChar cs;
204
205 /* trivial search for a BMP code point */
206 for(;;) {
207 if((cs=*s)==c) {
208 return (UChar *)s;
209 }
210 if(cs==0) {
211 return NULL;
212 }
213 ++s;
214 }
215 }
216 }
217
218 U_CAPI UChar * U_EXPORT2
u_strchr32(const UChar * s,UChar32 c)219 u_strchr32(const UChar *s, UChar32 c) {
220 if((uint32_t)c<=U_BMP_MAX) {
221 /* find BMP code point */
222 return u_strchr(s, (UChar)c);
223 } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
224 /* find supplementary code point as surrogate pair */
225 UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
226
227 while((cs=*s++)!=0) {
228 if(cs==lead && *s==trail) {
229 return (UChar *)(s-1);
230 }
231 }
232 return NULL;
233 } else {
234 /* not a Unicode code point, not findable */
235 return NULL;
236 }
237 }
238
239 U_CAPI UChar * U_EXPORT2
u_memchr(const UChar * s,UChar c,int32_t count)240 u_memchr(const UChar *s, UChar c, int32_t count) {
241 if(count<=0) {
242 return NULL; /* no string */
243 } else if(U16_IS_SURROGATE(c)) {
244 /* make sure to not find half of a surrogate pair */
245 return u_strFindFirst(s, count, &c, 1);
246 } else {
247 /* trivial search for a BMP code point */
248 const UChar *limit=s+count;
249 do {
250 if(*s==c) {
251 return (UChar *)s;
252 }
253 } while(++s!=limit);
254 return NULL;
255 }
256 }
257
258 U_CAPI UChar * U_EXPORT2
u_memchr32(const UChar * s,UChar32 c,int32_t count)259 u_memchr32(const UChar *s, UChar32 c, int32_t count) {
260 if((uint32_t)c<=U_BMP_MAX) {
261 /* find BMP code point */
262 return u_memchr(s, (UChar)c, count);
263 } else if(count<2) {
264 /* too short for a surrogate pair */
265 return NULL;
266 } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
267 /* find supplementary code point as surrogate pair */
268 const UChar *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */
269 UChar lead=U16_LEAD(c), trail=U16_TRAIL(c);
270
271 do {
272 if(*s==lead && *(s+1)==trail) {
273 return (UChar *)s;
274 }
275 } while(++s!=limit);
276 return NULL;
277 } else {
278 /* not a Unicode code point, not findable */
279 return NULL;
280 }
281 }
282
283 /* Backward binary string search functions ---------------------------------- */
284
285 U_CAPI UChar * U_EXPORT2
u_strFindLast(const UChar * s,int32_t length,const UChar * sub,int32_t subLength)286 u_strFindLast(const UChar *s, int32_t length,
287 const UChar *sub, int32_t subLength) {
288 const UChar *start, *limit, *p, *q, *subLimit;
289 UChar c, cs;
290
291 if(sub==NULL || subLength<-1) {
292 return (UChar *)s;
293 }
294 if(s==NULL || length<-1) {
295 return NULL;
296 }
297
298 /*
299 * This implementation is more lazy than the one for u_strFindFirst():
300 * There is no special search code for NUL-terminated strings.
301 * It does not seem to be worth it for searching substrings to
302 * search forward and find all matches like in u_strrchr() and similar.
303 * Therefore, we simply get both string lengths and search backward.
304 *
305 * markus 2002oct23
306 */
307
308 if(subLength<0) {
309 subLength=u_strlen(sub);
310 }
311 if(subLength==0) {
312 return (UChar *)s;
313 }
314
315 /* get sub[subLength-1] to search for it fast */
316 subLimit=sub+subLength;
317 cs=*(--subLimit);
318 --subLength;
319
320 if(subLength==0 && !U16_IS_SURROGATE(cs)) {
321 /* the substring consists of a single, non-surrogate BMP code point */
322 return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length);
323 }
324
325 if(length<0) {
326 length=u_strlen(s);
327 }
328
329 /* subLength was decremented above */
330 if(length<=subLength) {
331 return NULL; /* s is shorter than sub */
332 }
333
334 start=s;
335 limit=s+length;
336
337 /* the substring must start no later than s+subLength */
338 s+=subLength;
339
340 while(s!=limit) {
341 c=*(--limit);
342 if(c==cs) {
343 /* found last substring UChar, compare rest */
344 p=limit;
345 q=subLimit;
346 for(;;) {
347 if(q==sub) {
348 if(isMatchAtCPBoundary(start, p, limit+1, start+length)) {
349 return (UChar *)p; /* well-formed match */
350 } else {
351 break; /* no match because surrogate pair is split */
352 }
353 }
354 if(*(--p)!=*(--q)) {
355 break; /* no match */
356 }
357 }
358 }
359 }
360
361 /* not found */
362 return NULL;
363 }
364
365 U_CAPI UChar * U_EXPORT2
u_strrstr(const UChar * s,const UChar * substring)366 u_strrstr(const UChar *s, const UChar *substring) {
367 return u_strFindLast(s, -1, substring, -1);
368 }
369
370 U_CAPI UChar * U_EXPORT2
u_strrchr(const UChar * s,UChar c)371 u_strrchr(const UChar *s, UChar c) {
372 if(U16_IS_SURROGATE(c)) {
373 /* make sure to not find half of a surrogate pair */
374 return u_strFindLast(s, -1, &c, 1);
375 } else {
376 const UChar *result=NULL;
377 UChar cs;
378
379 /* trivial search for a BMP code point */
380 for(;;) {
381 if((cs=*s)==c) {
382 result=s;
383 }
384 if(cs==0) {
385 return (UChar *)result;
386 }
387 ++s;
388 }
389 }
390 }
391
392 U_CAPI UChar * U_EXPORT2
u_strrchr32(const UChar * s,UChar32 c)393 u_strrchr32(const UChar *s, UChar32 c) {
394 if((uint32_t)c<=U_BMP_MAX) {
395 /* find BMP code point */
396 return u_strrchr(s, (UChar)c);
397 } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
398 /* find supplementary code point as surrogate pair */
399 const UChar *result=NULL;
400 UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
401
402 while((cs=*s++)!=0) {
403 if(cs==lead && *s==trail) {
404 result=s-1;
405 }
406 }
407 return (UChar *)result;
408 } else {
409 /* not a Unicode code point, not findable */
410 return NULL;
411 }
412 }
413
414 U_CAPI UChar * U_EXPORT2
u_memrchr(const UChar * s,UChar c,int32_t count)415 u_memrchr(const UChar *s, UChar c, int32_t count) {
416 if(count<=0) {
417 return NULL; /* no string */
418 } else if(U16_IS_SURROGATE(c)) {
419 /* make sure to not find half of a surrogate pair */
420 return u_strFindLast(s, count, &c, 1);
421 } else {
422 /* trivial search for a BMP code point */
423 const UChar *limit=s+count;
424 do {
425 if(*(--limit)==c) {
426 return (UChar *)limit;
427 }
428 } while(s!=limit);
429 return NULL;
430 }
431 }
432
433 U_CAPI UChar * U_EXPORT2
u_memrchr32(const UChar * s,UChar32 c,int32_t count)434 u_memrchr32(const UChar *s, UChar32 c, int32_t count) {
435 if((uint32_t)c<=U_BMP_MAX) {
436 /* find BMP code point */
437 return u_memrchr(s, (UChar)c, count);
438 } else if(count<2) {
439 /* too short for a surrogate pair */
440 return NULL;
441 } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
442 /* find supplementary code point as surrogate pair */
443 const UChar *limit=s+count-1;
444 UChar lead=U16_LEAD(c), trail=U16_TRAIL(c);
445
446 do {
447 if(*limit==trail && *(limit-1)==lead) {
448 return (UChar *)(limit-1);
449 }
450 } while(s!=--limit);
451 return NULL;
452 } else {
453 /* not a Unicode code point, not findable */
454 return NULL;
455 }
456 }
457
458 /* Tokenization functions --------------------------------------------------- */
459
460 /*
461 * Match each code point in a string against each code point in the matchSet.
462 * Return the index of the first string code point that
463 * is (polarity==TRUE) or is not (FALSE) contained in the matchSet.
464 * Return -(string length)-1 if there is no such code point.
465 */
466 static int32_t
_matchFromSet(const UChar * string,const UChar * matchSet,UBool polarity)467 _matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) {
468 int32_t matchLen, matchBMPLen, strItr, matchItr;
469 UChar32 stringCh, matchCh;
470 UChar c, c2;
471
472 /* first part of matchSet contains only BMP code points */
473 matchBMPLen = 0;
474 while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) {
475 ++matchBMPLen;
476 }
477
478 /* second part of matchSet contains BMP and supplementary code points */
479 matchLen = matchBMPLen;
480 while(matchSet[matchLen] != 0) {
481 ++matchLen;
482 }
483
484 for(strItr = 0; (c = string[strItr]) != 0;) {
485 ++strItr;
486 if(U16_IS_SINGLE(c)) {
487 if(polarity) {
488 for(matchItr = 0; matchItr < matchLen; ++matchItr) {
489 if(c == matchSet[matchItr]) {
490 return strItr - 1; /* one matches */
491 }
492 }
493 } else {
494 for(matchItr = 0; matchItr < matchLen; ++matchItr) {
495 if(c == matchSet[matchItr]) {
496 goto endloop;
497 }
498 }
499 return strItr - 1; /* none matches */
500 }
501 } else {
502 /*
503 * No need to check for string length before U16_IS_TRAIL
504 * because c2 could at worst be the terminating NUL.
505 */
506 if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) {
507 ++strItr;
508 stringCh = U16_GET_SUPPLEMENTARY(c, c2);
509 } else {
510 stringCh = c; /* unpaired trail surrogate */
511 }
512
513 if(polarity) {
514 for(matchItr = matchBMPLen; matchItr < matchLen;) {
515 U16_NEXT(matchSet, matchItr, matchLen, matchCh);
516 if(stringCh == matchCh) {
517 return strItr - U16_LENGTH(stringCh); /* one matches */
518 }
519 }
520 } else {
521 for(matchItr = matchBMPLen; matchItr < matchLen;) {
522 U16_NEXT(matchSet, matchItr, matchLen, matchCh);
523 if(stringCh == matchCh) {
524 goto endloop;
525 }
526 }
527 return strItr - U16_LENGTH(stringCh); /* none matches */
528 }
529 }
530 endloop:
531 /* wish C had continue with labels like Java... */;
532 }
533
534 /* Didn't find it. */
535 return -strItr-1;
536 }
537
538 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
539 U_CAPI UChar * U_EXPORT2
u_strpbrk(const UChar * string,const UChar * matchSet)540 u_strpbrk(const UChar *string, const UChar *matchSet)
541 {
542 int32_t idx = _matchFromSet(string, matchSet, TRUE);
543 if(idx >= 0) {
544 return (UChar *)string + idx;
545 } else {
546 return NULL;
547 }
548 }
549
550 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
551 U_CAPI int32_t U_EXPORT2
u_strcspn(const UChar * string,const UChar * matchSet)552 u_strcspn(const UChar *string, const UChar *matchSet)
553 {
554 int32_t idx = _matchFromSet(string, matchSet, TRUE);
555 if(idx >= 0) {
556 return idx;
557 } else {
558 return -idx - 1; /* == u_strlen(string) */
559 }
560 }
561
562 /* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
563 U_CAPI int32_t U_EXPORT2
u_strspn(const UChar * string,const UChar * matchSet)564 u_strspn(const UChar *string, const UChar *matchSet)
565 {
566 int32_t idx = _matchFromSet(string, matchSet, FALSE);
567 if(idx >= 0) {
568 return idx;
569 } else {
570 return -idx - 1; /* == u_strlen(string) */
571 }
572 }
573
574 /* ----- Text manipulation functions --- */
575
576 U_CAPI UChar* U_EXPORT2
u_strtok_r(UChar * src,const UChar * delim,UChar ** saveState)577 u_strtok_r(UChar *src,
578 const UChar *delim,
579 UChar **saveState)
580 {
581 UChar *tokSource;
582 UChar *nextToken;
583 uint32_t nonDelimIdx;
584
585 /* If saveState is NULL, the user messed up. */
586 if (src != NULL) {
587 tokSource = src;
588 *saveState = src; /* Set to "src" in case there are no delimiters */
589 }
590 else if (*saveState) {
591 tokSource = *saveState;
592 }
593 else {
594 /* src == NULL && *saveState == NULL */
595 /* This shouldn't happen. We already finished tokenizing. */
596 return NULL;
597 }
598
599 /* Skip initial delimiters */
600 nonDelimIdx = u_strspn(tokSource, delim);
601 tokSource = &tokSource[nonDelimIdx];
602
603 if (*tokSource) {
604 nextToken = u_strpbrk(tokSource, delim);
605 if (nextToken != NULL) {
606 /* Create a token */
607 *(nextToken++) = 0;
608 *saveState = nextToken;
609 return tokSource;
610 }
611 else if (*saveState) {
612 /* Return the last token */
613 *saveState = NULL;
614 return tokSource;
615 }
616 }
617 else {
618 /* No tokens were found. Only delimiters were left. */
619 *saveState = NULL;
620 }
621 return NULL;
622 }
623
624 /* Miscellaneous functions -------------------------------------------------- */
625
626 U_CAPI UChar* U_EXPORT2
u_strcat(UChar * dst,const UChar * src)627 u_strcat(UChar *dst,
628 const UChar *src)
629 {
630 UChar *anchor = dst; /* save a pointer to start of dst */
631
632 while(*dst != 0) { /* To end of first string */
633 ++dst;
634 }
635 while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */
636 }
637
638 return anchor;
639 }
640
641 U_CAPI UChar* U_EXPORT2
u_strncat(UChar * dst,const UChar * src,int32_t n)642 u_strncat(UChar *dst,
643 const UChar *src,
644 int32_t n )
645 {
646 if(n > 0) {
647 UChar *anchor = dst; /* save a pointer to start of dst */
648
649 while(*dst != 0) { /* To end of first string */
650 ++dst;
651 }
652 while((*dst = *src) != 0) { /* copy string 2 over */
653 ++dst;
654 if(--n == 0) {
655 *dst = 0;
656 break;
657 }
658 ++src;
659 }
660
661 return anchor;
662 } else {
663 return dst;
664 }
665 }
666
667 /* ----- Text property functions --- */
668
669 U_CAPI int32_t U_EXPORT2
u_strcmp(const UChar * s1,const UChar * s2)670 u_strcmp(const UChar *s1,
671 const UChar *s2)
672 {
673 UChar c1, c2;
674
675 for(;;) {
676 c1=*s1++;
677 c2=*s2++;
678 if (c1 != c2 || c1 == 0) {
679 break;
680 }
681 }
682 return (int32_t)c1 - (int32_t)c2;
683 }
684
685 U_CFUNC int32_t U_EXPORT2
uprv_strCompare(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,UBool strncmpStyle,UBool codePointOrder)686 uprv_strCompare(const UChar *s1, int32_t length1,
687 const UChar *s2, int32_t length2,
688 UBool strncmpStyle, UBool codePointOrder) {
689 const UChar *start1, *start2, *limit1, *limit2;
690 UChar c1, c2;
691
692 /* setup for fix-up */
693 start1=s1;
694 start2=s2;
695
696 /* compare identical prefixes - they do not need to be fixed up */
697 if(length1<0 && length2<0) {
698 /* strcmp style, both NUL-terminated */
699 if(s1==s2) {
700 return 0;
701 }
702
703 for(;;) {
704 c1=*s1;
705 c2=*s2;
706 if(c1!=c2) {
707 break;
708 }
709 if(c1==0) {
710 return 0;
711 }
712 ++s1;
713 ++s2;
714 }
715
716 /* setup for fix-up */
717 limit1=limit2=NULL;
718 } else if(strncmpStyle) {
719 /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */
720 if(s1==s2) {
721 return 0;
722 }
723
724 limit1=start1+length1;
725
726 for(;;) {
727 /* both lengths are same, check only one limit */
728 if(s1==limit1) {
729 return 0;
730 }
731
732 c1=*s1;
733 c2=*s2;
734 if(c1!=c2) {
735 break;
736 }
737 if(c1==0) {
738 return 0;
739 }
740 ++s1;
741 ++s2;
742 }
743
744 /* setup for fix-up */
745 limit2=start2+length1; /* use length1 here, too, to enforce assumption */
746 } else {
747 /* memcmp/UnicodeString style, both length-specified */
748 int32_t lengthResult;
749
750 if(length1<0) {
751 length1=u_strlen(s1);
752 }
753 if(length2<0) {
754 length2=u_strlen(s2);
755 }
756
757 /* limit1=start1+min(lenght1, length2) */
758 if(length1<length2) {
759 lengthResult=-1;
760 limit1=start1+length1;
761 } else if(length1==length2) {
762 lengthResult=0;
763 limit1=start1+length1;
764 } else /* length1>length2 */ {
765 lengthResult=1;
766 limit1=start1+length2;
767 }
768
769 if(s1==s2) {
770 return lengthResult;
771 }
772
773 for(;;) {
774 /* check pseudo-limit */
775 if(s1==limit1) {
776 return lengthResult;
777 }
778
779 c1=*s1;
780 c2=*s2;
781 if(c1!=c2) {
782 break;
783 }
784 ++s1;
785 ++s2;
786 }
787
788 /* setup for fix-up */
789 limit1=start1+length1;
790 limit2=start2+length2;
791 }
792
793 /* if both values are in or above the surrogate range, fix them up */
794 if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
795 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
796 if(
797 (c1<=0xdbff && (s1+1)!=limit1 && U16_IS_TRAIL(*(s1+1))) ||
798 (U16_IS_TRAIL(c1) && start1!=s1 && U16_IS_LEAD(*(s1-1)))
799 ) {
800 /* part of a surrogate pair, leave >=d800 */
801 } else {
802 /* BMP code point - may be surrogate code point - make <d800 */
803 c1-=0x2800;
804 }
805
806 if(
807 (c2<=0xdbff && (s2+1)!=limit2 && U16_IS_TRAIL(*(s2+1))) ||
808 (U16_IS_TRAIL(c2) && start2!=s2 && U16_IS_LEAD(*(s2-1)))
809 ) {
810 /* part of a surrogate pair, leave >=d800 */
811 } else {
812 /* BMP code point - may be surrogate code point - make <d800 */
813 c2-=0x2800;
814 }
815 }
816
817 /* now c1 and c2 are in the requested (code unit or code point) order */
818 return (int32_t)c1-(int32_t)c2;
819 }
820
821 /*
822 * Compare two strings as presented by UCharIterators.
823 * Use code unit or code point order.
824 * When the function returns, it is undefined where the iterators
825 * have stopped.
826 */
827 U_CAPI int32_t U_EXPORT2
u_strCompareIter(UCharIterator * iter1,UCharIterator * iter2,UBool codePointOrder)828 u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) {
829 UChar32 c1, c2;
830
831 /* argument checking */
832 if(iter1==NULL || iter2==NULL) {
833 return 0; /* bad arguments */
834 }
835 if(iter1==iter2) {
836 return 0; /* identical iterators */
837 }
838
839 /* reset iterators to start? */
840 iter1->move(iter1, 0, UITER_START);
841 iter2->move(iter2, 0, UITER_START);
842
843 /* compare identical prefixes - they do not need to be fixed up */
844 for(;;) {
845 c1=iter1->next(iter1);
846 c2=iter2->next(iter2);
847 if(c1!=c2) {
848 break;
849 }
850 if(c1==-1) {
851 return 0;
852 }
853 }
854
855 /* if both values are in or above the surrogate range, fix them up */
856 if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
857 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
858 if(
859 (c1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) ||
860 (U16_IS_TRAIL(c1) && (iter1->previous(iter1), U16_IS_LEAD(iter1->previous(iter1))))
861 ) {
862 /* part of a surrogate pair, leave >=d800 */
863 } else {
864 /* BMP code point - may be surrogate code point - make <d800 */
865 c1-=0x2800;
866 }
867
868 if(
869 (c2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2))) ||
870 (U16_IS_TRAIL(c2) && (iter2->previous(iter2), U16_IS_LEAD(iter2->previous(iter2))))
871 ) {
872 /* part of a surrogate pair, leave >=d800 */
873 } else {
874 /* BMP code point - may be surrogate code point - make <d800 */
875 c2-=0x2800;
876 }
877 }
878
879 /* now c1 and c2 are in the requested (code unit or code point) order */
880 return (int32_t)c1-(int32_t)c2;
881 }
882
883 #if 0
884 /*
885 * u_strCompareIter() does not leave the iterators _on_ the different units.
886 * This is possible but would cost a few extra indirect function calls to back
887 * up if the last unit (c1 or c2 respectively) was >=0.
888 *
889 * Consistently leaving them _behind_ the different units is not an option
890 * because the current "unit" is the end of the string if that is reached,
891 * and in such a case the iterator does not move.
892 * For example, when comparing "ab" with "abc", both iterators rest _on_ the end
893 * of their strings. Calling previous() on each does not move them to where
894 * the comparison fails.
895 *
896 * So the simplest semantics is to not define where the iterators end up.
897 *
898 * The following fragment is part of what would need to be done for backing up.
899 */
900 void fragment {
901 /* iff a surrogate is part of a surrogate pair, leave >=d800 */
902 if(c1<=0xdbff) {
903 if(!U16_IS_TRAIL(iter1->current(iter1))) {
904 /* lead surrogate code point - make <d800 */
905 c1-=0x2800;
906 }
907 } else if(c1<=0xdfff) {
908 int32_t idx=iter1->getIndex(iter1, UITER_CURRENT);
909 iter1->previous(iter1); /* ==c1 */
910 if(!U16_IS_LEAD(iter1->previous(iter1))) {
911 /* trail surrogate code point - make <d800 */
912 c1-=0x2800;
913 }
914 /* go back to behind where the difference is */
915 iter1->move(iter1, idx, UITER_ZERO);
916 } else /* 0xe000<=c1<=0xffff */ {
917 /* BMP code point - make <d800 */
918 c1-=0x2800;
919 }
920 }
921 #endif
922
923 U_CAPI int32_t U_EXPORT2
u_strCompare(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,UBool codePointOrder)924 u_strCompare(const UChar *s1, int32_t length1,
925 const UChar *s2, int32_t length2,
926 UBool codePointOrder) {
927 /* argument checking */
928 if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
929 return 0;
930 }
931 return uprv_strCompare(s1, length1, s2, length2, FALSE, codePointOrder);
932 }
933
934 /* String compare in code point order - u_strcmp() compares in code unit order. */
935 U_CAPI int32_t U_EXPORT2
u_strcmpCodePointOrder(const UChar * s1,const UChar * s2)936 u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) {
937 return uprv_strCompare(s1, -1, s2, -1, FALSE, TRUE);
938 }
939
940 U_CAPI int32_t U_EXPORT2
u_strncmp(const UChar * s1,const UChar * s2,int32_t n)941 u_strncmp(const UChar *s1,
942 const UChar *s2,
943 int32_t n)
944 {
945 if(n > 0) {
946 int32_t rc;
947 for(;;) {
948 rc = (int32_t)*s1 - (int32_t)*s2;
949 if(rc != 0 || *s1 == 0 || --n == 0) {
950 return rc;
951 }
952 ++s1;
953 ++s2;
954 }
955 } else {
956 return 0;
957 }
958 }
959
960 U_CAPI int32_t U_EXPORT2
u_strncmpCodePointOrder(const UChar * s1,const UChar * s2,int32_t n)961 u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) {
962 return uprv_strCompare(s1, n, s2, n, TRUE, TRUE);
963 }
964
965 U_CAPI UChar* U_EXPORT2
u_strcpy(UChar * dst,const UChar * src)966 u_strcpy(UChar *dst,
967 const UChar *src)
968 {
969 UChar *anchor = dst; /* save a pointer to start of dst */
970
971 while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */
972 }
973
974 return anchor;
975 }
976
977 U_CAPI UChar* U_EXPORT2
u_strncpy(UChar * dst,const UChar * src,int32_t n)978 u_strncpy(UChar *dst,
979 const UChar *src,
980 int32_t n)
981 {
982 UChar *anchor = dst; /* save a pointer to start of dst */
983
984 /* copy string 2 over */
985 while(n > 0 && (*(dst++) = *(src++)) != 0) {
986 --n;
987 }
988
989 return anchor;
990 }
991
992 U_CAPI int32_t U_EXPORT2
u_strlen(const UChar * s)993 u_strlen(const UChar *s)
994 {
995 #if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
996 return (int32_t)uprv_wcslen((const wchar_t *)s);
997 #else
998 const UChar *t = s;
999 while(*t != 0) {
1000 ++t;
1001 }
1002 return t - s;
1003 #endif
1004 }
1005
1006 U_CAPI int32_t U_EXPORT2
u_countChar32(const UChar * s,int32_t length)1007 u_countChar32(const UChar *s, int32_t length) {
1008 int32_t count;
1009
1010 if(s==NULL || length<-1) {
1011 return 0;
1012 }
1013
1014 count=0;
1015 if(length>=0) {
1016 while(length>0) {
1017 ++count;
1018 if(U16_IS_LEAD(*s) && length>=2 && U16_IS_TRAIL(*(s+1))) {
1019 s+=2;
1020 length-=2;
1021 } else {
1022 ++s;
1023 --length;
1024 }
1025 }
1026 } else /* length==-1 */ {
1027 UChar c;
1028
1029 for(;;) {
1030 if((c=*s++)==0) {
1031 break;
1032 }
1033 ++count;
1034
1035 /*
1036 * sufficient to look ahead one because of UTF-16;
1037 * safe to look ahead one because at worst that would be the terminating NUL
1038 */
1039 if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
1040 ++s;
1041 }
1042 }
1043 }
1044 return count;
1045 }
1046
1047 U_CAPI UBool U_EXPORT2
u_strHasMoreChar32Than(const UChar * s,int32_t length,int32_t number)1048 u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) {
1049
1050 if(number<0) {
1051 return TRUE;
1052 }
1053 if(s==NULL || length<-1) {
1054 return FALSE;
1055 }
1056
1057 if(length==-1) {
1058 /* s is NUL-terminated */
1059 UChar c;
1060
1061 /* count code points until they exceed */
1062 for(;;) {
1063 if((c=*s++)==0) {
1064 return FALSE;
1065 }
1066 if(number==0) {
1067 return TRUE;
1068 }
1069 if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
1070 ++s;
1071 }
1072 --number;
1073 }
1074 } else {
1075 /* length>=0 known */
1076 const UChar *limit;
1077 int32_t maxSupplementary;
1078
1079 /* s contains at least (length+1)/2 code points: <=2 UChars per cp */
1080 if(((length+1)/2)>number) {
1081 return TRUE;
1082 }
1083
1084 /* check if s does not even contain enough UChars */
1085 maxSupplementary=length-number;
1086 if(maxSupplementary<=0) {
1087 return FALSE;
1088 }
1089 /* there are maxSupplementary=length-number more UChars than asked-for code points */
1090
1091 /*
1092 * count code points until they exceed and also check that there are
1093 * no more than maxSupplementary supplementary code points (UChar pairs)
1094 */
1095 limit=s+length;
1096 for(;;) {
1097 if(s==limit) {
1098 return FALSE;
1099 }
1100 if(number==0) {
1101 return TRUE;
1102 }
1103 if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) {
1104 ++s;
1105 if(--maxSupplementary<=0) {
1106 /* too many pairs - too few code points */
1107 return FALSE;
1108 }
1109 }
1110 --number;
1111 }
1112 }
1113 }
1114
1115 U_CAPI UChar * U_EXPORT2
u_memcpy(UChar * dest,const UChar * src,int32_t count)1116 u_memcpy(UChar *dest, const UChar *src, int32_t count) {
1117 if(count > 0) {
1118 uprv_memcpy(dest, src, (size_t)count*U_SIZEOF_UCHAR);
1119 }
1120 return dest;
1121 }
1122
1123 U_CAPI UChar * U_EXPORT2
u_memmove(UChar * dest,const UChar * src,int32_t count)1124 u_memmove(UChar *dest, const UChar *src, int32_t count) {
1125 if(count > 0) {
1126 uprv_memmove(dest, src, (size_t)count*U_SIZEOF_UCHAR);
1127 }
1128 return dest;
1129 }
1130
1131 U_CAPI UChar * U_EXPORT2
u_memset(UChar * dest,UChar c,int32_t count)1132 u_memset(UChar *dest, UChar c, int32_t count) {
1133 if(count > 0) {
1134 UChar *ptr = dest;
1135 UChar *limit = dest + count;
1136
1137 while (ptr < limit) {
1138 *(ptr++) = c;
1139 }
1140 }
1141 return dest;
1142 }
1143
1144 U_CAPI int32_t U_EXPORT2
u_memcmp(const UChar * buf1,const UChar * buf2,int32_t count)1145 u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) {
1146 if(count > 0) {
1147 const UChar *limit = buf1 + count;
1148 int32_t result;
1149
1150 while (buf1 < limit) {
1151 result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2;
1152 if (result != 0) {
1153 return result;
1154 }
1155 buf1++;
1156 buf2++;
1157 }
1158 }
1159 return 0;
1160 }
1161
1162 U_CAPI int32_t U_EXPORT2
u_memcmpCodePointOrder(const UChar * s1,const UChar * s2,int32_t count)1163 u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) {
1164 return uprv_strCompare(s1, count, s2, count, FALSE, TRUE);
1165 }
1166
1167 /* u_unescape & support fns ------------------------------------------------- */
1168
1169 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
1170 static const UChar UNESCAPE_MAP[] = {
1171 /*" 0x22, 0x22 */
1172 /*' 0x27, 0x27 */
1173 /*? 0x3F, 0x3F */
1174 /*\ 0x5C, 0x5C */
1175 /*a*/ 0x61, 0x07,
1176 /*b*/ 0x62, 0x08,
1177 /*e*/ 0x65, 0x1b,
1178 /*f*/ 0x66, 0x0c,
1179 /*n*/ 0x6E, 0x0a,
1180 /*r*/ 0x72, 0x0d,
1181 /*t*/ 0x74, 0x09,
1182 /*v*/ 0x76, 0x0b
1183 };
1184 enum { UNESCAPE_MAP_LENGTH = UPRV_LENGTHOF(UNESCAPE_MAP) };
1185
1186 /* Convert one octal digit to a numeric value 0..7, or -1 on failure */
_digit8(UChar c)1187 static int8_t _digit8(UChar c) {
1188 if (c >= 0x0030 && c <= 0x0037) {
1189 return (int8_t)(c - 0x0030);
1190 }
1191 return -1;
1192 }
1193
1194 /* Convert one hex digit to a numeric value 0..F, or -1 on failure */
_digit16(UChar c)1195 static int8_t _digit16(UChar c) {
1196 if (c >= 0x0030 && c <= 0x0039) {
1197 return (int8_t)(c - 0x0030);
1198 }
1199 if (c >= 0x0041 && c <= 0x0046) {
1200 return (int8_t)(c - (0x0041 - 10));
1201 }
1202 if (c >= 0x0061 && c <= 0x0066) {
1203 return (int8_t)(c - (0x0061 - 10));
1204 }
1205 return -1;
1206 }
1207
1208 /* Parse a single escape sequence. Although this method deals in
1209 * UChars, it does not use C++ or UnicodeString. This allows it to
1210 * be used from C contexts. */
1211 U_CAPI UChar32 U_EXPORT2
u_unescapeAt(UNESCAPE_CHAR_AT charAt,int32_t * offset,int32_t length,void * context)1212 u_unescapeAt(UNESCAPE_CHAR_AT charAt,
1213 int32_t *offset,
1214 int32_t length,
1215 void *context) {
1216
1217 int32_t start = *offset;
1218 UChar c;
1219 UChar32 result = 0;
1220 int8_t n = 0;
1221 int8_t minDig = 0;
1222 int8_t maxDig = 0;
1223 int8_t bitsPerDigit = 4;
1224 int8_t dig;
1225 int32_t i;
1226 UBool braces = FALSE;
1227
1228 /* Check that offset is in range */
1229 if (*offset < 0 || *offset >= length) {
1230 goto err;
1231 }
1232
1233 /* Fetch first UChar after '\\' */
1234 c = charAt((*offset)++, context);
1235
1236 /* Convert hexadecimal and octal escapes */
1237 switch (c) {
1238 case 0x0075 /*'u'*/:
1239 minDig = maxDig = 4;
1240 break;
1241 case 0x0055 /*'U'*/:
1242 minDig = maxDig = 8;
1243 break;
1244 case 0x0078 /*'x'*/:
1245 minDig = 1;
1246 if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) {
1247 ++(*offset);
1248 braces = TRUE;
1249 maxDig = 8;
1250 } else {
1251 maxDig = 2;
1252 }
1253 break;
1254 default:
1255 dig = _digit8(c);
1256 if (dig >= 0) {
1257 minDig = 1;
1258 maxDig = 3;
1259 n = 1; /* Already have first octal digit */
1260 bitsPerDigit = 3;
1261 result = dig;
1262 }
1263 break;
1264 }
1265 if (minDig != 0) {
1266 while (*offset < length && n < maxDig) {
1267 c = charAt(*offset, context);
1268 dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c));
1269 if (dig < 0) {
1270 break;
1271 }
1272 result = (result << bitsPerDigit) | dig;
1273 ++(*offset);
1274 ++n;
1275 }
1276 if (n < minDig) {
1277 goto err;
1278 }
1279 if (braces) {
1280 if (c != 0x7D /*}*/) {
1281 goto err;
1282 }
1283 ++(*offset);
1284 }
1285 if (result < 0 || result >= 0x110000) {
1286 goto err;
1287 }
1288 /* If an escape sequence specifies a lead surrogate, see if
1289 * there is a trail surrogate after it, either as an escape or
1290 * as a literal. If so, join them up into a supplementary.
1291 */
1292 if (*offset < length && U16_IS_LEAD(result)) {
1293 int32_t ahead = *offset + 1;
1294 c = charAt(*offset, context);
1295 if (c == 0x5C /*'\\'*/ && ahead < length) {
1296 c = (UChar) u_unescapeAt(charAt, &ahead, length, context);
1297 }
1298 if (U16_IS_TRAIL(c)) {
1299 *offset = ahead;
1300 result = U16_GET_SUPPLEMENTARY(result, c);
1301 }
1302 }
1303 return result;
1304 }
1305
1306 /* Convert C-style escapes in table */
1307 for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
1308 if (c == UNESCAPE_MAP[i]) {
1309 return UNESCAPE_MAP[i+1];
1310 } else if (c < UNESCAPE_MAP[i]) {
1311 break;
1312 }
1313 }
1314
1315 /* Map \cX to control-X: X & 0x1F */
1316 if (c == 0x0063 /*'c'*/ && *offset < length) {
1317 c = charAt((*offset)++, context);
1318 if (U16_IS_LEAD(c) && *offset < length) {
1319 UChar c2 = charAt(*offset, context);
1320 if (U16_IS_TRAIL(c2)) {
1321 ++(*offset);
1322 c = (UChar) U16_GET_SUPPLEMENTARY(c, c2); /* [sic] */
1323 }
1324 }
1325 return 0x1F & c;
1326 }
1327
1328 /* If no special forms are recognized, then consider
1329 * the backslash to generically escape the next character.
1330 * Deal with surrogate pairs. */
1331 if (U16_IS_LEAD(c) && *offset < length) {
1332 UChar c2 = charAt(*offset, context);
1333 if (U16_IS_TRAIL(c2)) {
1334 ++(*offset);
1335 return U16_GET_SUPPLEMENTARY(c, c2);
1336 }
1337 }
1338 return c;
1339
1340 err:
1341 /* Invalid escape sequence */
1342 *offset = start; /* Reset to initial value */
1343 return (UChar32)0xFFFFFFFF;
1344 }
1345
1346 /* u_unescapeAt() callback to return a UChar from a char* */
1347 static UChar U_CALLCONV
_charPtr_charAt(int32_t offset,void * context)1348 _charPtr_charAt(int32_t offset, void *context) {
1349 UChar c16;
1350 /* It would be more efficient to access the invariant tables
1351 * directly but there is no API for that. */
1352 u_charsToUChars(((char*) context) + offset, &c16, 1);
1353 return c16;
1354 }
1355
1356 /* Append an escape-free segment of the text; used by u_unescape() */
_appendUChars(UChar * dest,int32_t destCapacity,const char * src,int32_t srcLen)1357 static void _appendUChars(UChar *dest, int32_t destCapacity,
1358 const char *src, int32_t srcLen) {
1359 if (destCapacity < 0) {
1360 destCapacity = 0;
1361 }
1362 if (srcLen > destCapacity) {
1363 srcLen = destCapacity;
1364 }
1365 u_charsToUChars(src, dest, srcLen);
1366 }
1367
1368 /* Do an invariant conversion of char* -> UChar*, with escape parsing */
1369 U_CAPI int32_t U_EXPORT2
u_unescape(const char * src,UChar * dest,int32_t destCapacity)1370 u_unescape(const char *src, UChar *dest, int32_t destCapacity) {
1371 const char *segment = src;
1372 int32_t i = 0;
1373 char c;
1374
1375 while ((c=*src) != 0) {
1376 /* '\\' intentionally written as compiler-specific
1377 * character constant to correspond to compiler-specific
1378 * char* constants. */
1379 if (c == '\\') {
1380 int32_t lenParsed = 0;
1381 UChar32 c32;
1382 if (src != segment) {
1383 if (dest != NULL) {
1384 _appendUChars(dest + i, destCapacity - i,
1385 segment, (int32_t)(src - segment));
1386 }
1387 i += (int32_t)(src - segment);
1388 }
1389 ++src; /* advance past '\\' */
1390 c32 = (UChar32)u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), (void*)src);
1391 if (lenParsed == 0) {
1392 goto err;
1393 }
1394 src += lenParsed; /* advance past escape seq. */
1395 if (dest != NULL && U16_LENGTH(c32) <= (destCapacity - i)) {
1396 U16_APPEND_UNSAFE(dest, i, c32);
1397 } else {
1398 i += U16_LENGTH(c32);
1399 }
1400 segment = src;
1401 } else {
1402 ++src;
1403 }
1404 }
1405 if (src != segment) {
1406 if (dest != NULL) {
1407 _appendUChars(dest + i, destCapacity - i,
1408 segment, (int32_t)(src - segment));
1409 }
1410 i += (int32_t)(src - segment);
1411 }
1412 if (dest != NULL && i < destCapacity) {
1413 dest[i] = 0;
1414 }
1415 return i;
1416
1417 err:
1418 if (dest != NULL && destCapacity > 0) {
1419 *dest = 0;
1420 }
1421 return 0;
1422 }
1423
1424 /* NUL-termination of strings ----------------------------------------------- */
1425
/**
 * NUL-terminate a string no matter what its type.
 * Set warning and error codes accordingly.
 *
 * Does nothing if pErrorCode is NULL or already holds a failure code,
 * or if length is negative. Otherwise:
 *   length <  destCapacity: writes dest[length]=0 and clears a pending
 *                           U_STRING_NOT_TERMINATED_WARNING;
 *   length == destCapacity: sets U_STRING_NOT_TERMINATED_WARNING;
 *   length >  destCapacity: sets U_BUFFER_OVERFLOW_ERROR.
 *
 * The expansion is a single do { } while(0) statement so that the
 * macro can be used, followed by a semicolon, anywhere a statement is
 * allowed (including as an unbraced if/else branch).
 * Arguments are evaluated more than once; pass side-effect-free
 * expressions only.
 */
#define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode)      \
    do {                                                                \
        if((pErrorCode)!=NULL && U_SUCCESS(*(pErrorCode))) {            \
            /* not a public function, so no complete argument checking */ \
                                                                        \
            if((length)<0) {                                            \
                /* assume that the caller handles this */               \
            } else if((length)<(destCapacity)) {                        \
                /* NUL-terminate the string, the NUL fits */            \
                (dest)[(length)]=0;                                     \
                /* unset the not-terminated warning but leave all others */ \
                if(*(pErrorCode)==U_STRING_NOT_TERMINATED_WARNING) {    \
                    *(pErrorCode)=U_ZERO_ERROR;                         \
                }                                                       \
            } else if((length)==(destCapacity)) {                       \
                /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
                *(pErrorCode)=U_STRING_NOT_TERMINATED_WARNING;          \
            } else /* length>destCapacity */ {                          \
                /* even the string itself did not fit - set an error code */ \
                *(pErrorCode)=U_BUFFER_OVERFLOW_ERROR;                  \
            }                                                           \
        }                                                               \
    } while(0)
1451
1452 U_CAPI int32_t U_EXPORT2
u_terminateUChars(UChar * dest,int32_t destCapacity,int32_t length,UErrorCode * pErrorCode)1453 u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1454 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1455 return length;
1456 }
1457
1458 U_CAPI int32_t U_EXPORT2
u_terminateChars(char * dest,int32_t destCapacity,int32_t length,UErrorCode * pErrorCode)1459 u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1460 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1461 return length;
1462 }
1463
1464 U_CAPI int32_t U_EXPORT2
u_terminateUChar32s(UChar32 * dest,int32_t destCapacity,int32_t length,UErrorCode * pErrorCode)1465 u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1466 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1467 return length;
1468 }
1469
1470 U_CAPI int32_t U_EXPORT2
u_terminateWChars(wchar_t * dest,int32_t destCapacity,int32_t length,UErrorCode * pErrorCode)1471 u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1472 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1473 return length;
1474 }
1475
1476 // Compute the hash code for a string -------------------------------------- ***
1477
1478 // Moved here from uhash.c so that UnicodeString::hashCode() does not depend
1479 // on UHashtable code.
1480
/*
  Compute the hash by iterating sparsely over about 32 (up to 63)
  characters spaced evenly through the string.  For each character,
  multiply the previous hash value by a prime number and add the new
  character in, like a linear congruential random number generator,
  producing a pseudorandom deterministic value well distributed over
  the output range. [LIU]

  STRING_HASH expands to the complete body of a function returning
  int32_t:
    TYPE    element type used to read the string
    STR     the string pointer; a NULL pointer hashes to 0
    STRLEN  number of elements; zero or negative hashes to 0
    DEREF   expression yielding the value of the current element; it
            may refer to the cursor variable p

  The sampled positions are 0, inc, 2*inc, ... below STRLEN.  The loop
  counts the remaining elements rather than comparing against an end
  pointer, so p is never advanced beyond the last sampled element and
  no end pointer is formed for a negative STRLEN.
*/

#define STRING_HASH(TYPE, STR, STRLEN, DEREF)               \
    uint32_t hash = 0;                                      \
    const TYPE *p = (const TYPE*) (STR);                    \
    if (p != NULL) {                                        \
        int32_t remaining = (int32_t)(STRLEN);              \
        int32_t inc = ((remaining - 32) / 32) + 1;          \
        while (remaining > 0) {                             \
            hash = (hash * 37) + DEREF;                     \
            if (remaining <= inc) {                         \
                /* the next sample would be past the end */ \
                break;                                      \
            }                                               \
            p += inc;                                       \
            remaining -= inc;                               \
        }                                                   \
    }                                                       \
    return static_cast<int32_t>(hash)
1503
1504 /* Used by UnicodeString to compute its hashcode - Not public API. */
1505 U_CAPI int32_t U_EXPORT2
ustr_hashUCharsN(const UChar * str,int32_t length)1506 ustr_hashUCharsN(const UChar *str, int32_t length) {
1507 STRING_HASH(UChar, str, length, *p);
1508 }
1509
1510 U_CAPI int32_t U_EXPORT2
ustr_hashCharsN(const char * str,int32_t length)1511 ustr_hashCharsN(const char *str, int32_t length) {
1512 STRING_HASH(uint8_t, str, length, *p);
1513 }
1514
1515 U_CAPI int32_t U_EXPORT2
ustr_hashICharsN(const char * str,int32_t length)1516 ustr_hashICharsN(const char *str, int32_t length) {
1517 STRING_HASH(char, str, length, (uint8_t)uprv_tolower(*p));
1518 }
1519