1 /*
2 ******************************************************************************
3 * Copyright (C) 1999-2014, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ******************************************************************************
6 *
7 * File unistr.cpp
8 *
9 * Modification History:
10 *
11 * Date Name Description
12 * 09/25/98 stephen Creation.
13 * 04/20/99 stephen Overhauled per 4/16 code review.
14 * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX
15 * 11/18/99 aliu Added handleReplaceBetween() to make inherit from
16 * Replaceable.
17 * 06/25/01 grhoten Removed the dependency on iostream
18 ******************************************************************************
19 */
20
21 #include "unicode/utypes.h"
22 #include "unicode/appendable.h"
23 #include "unicode/putil.h"
24 #include "cstring.h"
25 #include "cmemory.h"
26 #include "unicode/ustring.h"
27 #include "unicode/unistr.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf16.h"
30 #include "uelement.h"
31 #include "ustr_imp.h"
32 #include "umutex.h"
33 #include "uassert.h"
34
35 #if 0
36
37 #include <iostream>
38 using namespace std;
39
40 //DEBUGGING
41 void
42 print(const UnicodeString& s,
43 const char *name)
44 {
45 UChar c;
46 cout << name << ":|";
47 for(int i = 0; i < s.length(); ++i) {
48 c = s[i];
49 if(c>= 0x007E || c < 0x0020)
50 cout << "[0x" << hex << s[i] << "]";
51 else
52 cout << (char) s[i];
53 }
54 cout << '|' << endl;
55 }
56
57 void
58 print(const UChar *s,
59 int32_t len,
60 const char *name)
61 {
62 UChar c;
63 cout << name << ":|";
64 for(int i = 0; i < len; ++i) {
65 c = s[i];
66 if(c>= 0x007E || c < 0x0020)
67 cout << "[0x" << hex << s[i] << "]";
68 else
69 cout << (char) s[i];
70 }
71 cout << '|' << endl;
72 }
73 // END DEBUGGING
74 #endif
75
76 // Local function definitions for now
77
78 // need to copy areas that may overlap
79 static
80 inline void
us_arrayCopy(const UChar * src,int32_t srcStart,UChar * dst,int32_t dstStart,int32_t count)81 us_arrayCopy(const UChar *src, int32_t srcStart,
82 UChar *dst, int32_t dstStart, int32_t count)
83 {
84 if(count>0) {
85 uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
86 }
87 }
88
89 // u_unescapeAt() callback to get a UChar from a UnicodeString
90 U_CDECL_BEGIN
91 static UChar U_CALLCONV
UnicodeString_charAt(int32_t offset,void * context)92 UnicodeString_charAt(int32_t offset, void *context) {
93 return ((icu::UnicodeString*) context)->charAt(offset);
94 }
95 U_CDECL_END
96
97 U_NAMESPACE_BEGIN
98
99 /* The Replaceable virtual destructor can't be defined in the header
100 due to how AIX works with multiple definitions of virtual functions.
101 */
~Replaceable()102 Replaceable::~Replaceable() {}
103
// Expands the RTTI implementation macro for UnicodeString. The macro is
// defined outside this file - presumably it supplies the class-ID
// functions; confirm against its definition.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
105
106 UnicodeString U_EXPORT2
107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
108 return
109 UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
110 append(s1).
111 append(s2);
112 }
113
114 //========================================
115 // Reference Counting functions, put at top of file so that optimizing compilers
116 // have a chance to automatically inline.
117 //========================================
118
119 void
addRef()120 UnicodeString::addRef() {
121 umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
122 }
123
124 int32_t
removeRef()125 UnicodeString::removeRef() {
126 return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
127 }
128
129 int32_t
refCount() const130 UnicodeString::refCount() const {
131 return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
132 }
133
134 void
releaseArray()135 UnicodeString::releaseArray() {
136 if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
137 uprv_free((int32_t *)fUnion.fFields.fArray - 1);
138 }
139 }
140
141
142
143 //========================================
144 // Constructors
145 //========================================
146
147 // The default constructor is inline in unistr.h.
148
UnicodeString(int32_t capacity,UChar32 c,int32_t count)149 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
150 fUnion.fFields.fLengthAndFlags = 0;
151 if(count <= 0 || (uint32_t)c > 0x10ffff) {
152 // just allocate and do not do anything else
153 allocate(capacity);
154 } else {
155 // count > 0, allocate and fill the new string with count c's
156 int32_t unitCount = U16_LENGTH(c), length = count * unitCount;
157 if(capacity < length) {
158 capacity = length;
159 }
160 if(allocate(capacity)) {
161 UChar *array = getArrayStart();
162 int32_t i = 0;
163
164 // fill the new string with c
165 if(unitCount == 1) {
166 // fill with length UChars
167 while(i < length) {
168 array[i++] = (UChar)c;
169 }
170 } else {
171 // get the code units for c
172 UChar units[U16_MAX_LENGTH];
173 U16_APPEND_UNSAFE(units, i, c);
174
175 // now it must be i==unitCount
176 i = 0;
177
178 // for Unicode, unitCount can only be 1, 2, 3, or 4
179 // 1 is handled above
180 while(i < length) {
181 int32_t unitIdx = 0;
182 while(unitIdx < unitCount) {
183 array[i++]=units[unitIdx++];
184 }
185 }
186 }
187 }
188 setLength(length);
189 }
190 }
191
UnicodeString(UChar ch)192 UnicodeString::UnicodeString(UChar ch) {
193 fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
194 fUnion.fStackFields.fBuffer[0] = ch;
195 }
196
UnicodeString(UChar32 ch)197 UnicodeString::UnicodeString(UChar32 ch) {
198 fUnion.fFields.fLengthAndFlags = kShortString;
199 int32_t i = 0;
200 UBool isError = FALSE;
201 U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
202 // We test isError so that the compiler does not complain that we don't.
203 // If isError then i==0 which is what we want anyway.
204 if(!isError) {
205 setShortLength(i);
206 }
207 }
208
UnicodeString(const UChar * text)209 UnicodeString::UnicodeString(const UChar *text) {
210 fUnion.fFields.fLengthAndFlags = kShortString;
211 doReplace(0, 0, text, 0, -1);
212 }
213
UnicodeString(const UChar * text,int32_t textLength)214 UnicodeString::UnicodeString(const UChar *text,
215 int32_t textLength) {
216 fUnion.fFields.fLengthAndFlags = kShortString;
217 doReplace(0, 0, text, 0, textLength);
218 }
219
UnicodeString(UBool isTerminated,const UChar * text,int32_t textLength)220 UnicodeString::UnicodeString(UBool isTerminated,
221 const UChar *text,
222 int32_t textLength) {
223 fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
224 if(text == NULL) {
225 // treat as an empty string, do not alias
226 setToEmpty();
227 } else if(textLength < -1 ||
228 (textLength == -1 && !isTerminated) ||
229 (textLength >= 0 && isTerminated && text[textLength] != 0)
230 ) {
231 setToBogus();
232 } else {
233 if(textLength == -1) {
234 // text is terminated, or else it would have failed the above test
235 textLength = u_strlen(text);
236 }
237 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
238 }
239 }
240
UnicodeString(UChar * buff,int32_t buffLength,int32_t buffCapacity)241 UnicodeString::UnicodeString(UChar *buff,
242 int32_t buffLength,
243 int32_t buffCapacity) {
244 fUnion.fFields.fLengthAndFlags = kWritableAlias;
245 if(buff == NULL) {
246 // treat as an empty string, do not alias
247 setToEmpty();
248 } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
249 setToBogus();
250 } else {
251 if(buffLength == -1) {
252 // fLength = u_strlen(buff); but do not look beyond buffCapacity
253 const UChar *p = buff, *limit = buff + buffCapacity;
254 while(p != limit && *p != 0) {
255 ++p;
256 }
257 buffLength = (int32_t)(p - buff);
258 }
259 setArray(buff, buffLength, buffCapacity);
260 }
261 }
262
UnicodeString(const char * src,int32_t length,EInvariant)263 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
264 fUnion.fFields.fLengthAndFlags = kShortString;
265 if(src==NULL) {
266 // treat as an empty string
267 } else {
268 if(length<0) {
269 length=(int32_t)uprv_strlen(src);
270 }
271 if(cloneArrayIfNeeded(length, length, FALSE)) {
272 u_charsToUChars(src, getArrayStart(), length);
273 setLength(length);
274 } else {
275 setToBogus();
276 }
277 }
278 }
279
280 #if U_CHARSET_IS_UTF8
281
UnicodeString(const char * codepageData)282 UnicodeString::UnicodeString(const char *codepageData) {
283 fUnion.fFields.fLengthAndFlags = kShortString;
284 if(codepageData != 0) {
285 setToUTF8(codepageData);
286 }
287 }
288
UnicodeString(const char * codepageData,int32_t dataLength)289 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
290 fUnion.fFields.fLengthAndFlags = kShortString;
291 // if there's nothing to convert, do nothing
292 if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
293 return;
294 }
295 if(dataLength == -1) {
296 dataLength = (int32_t)uprv_strlen(codepageData);
297 }
298 setToUTF8(StringPiece(codepageData, dataLength));
299 }
300
301 // else see unistr_cnv.cpp
302 #endif
303
// Copy constructor: starts as an empty short string, then takes over the
// contents of the source through copyFrom().
UnicodeString::UnicodeString(const UnicodeString& that) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  const UnicodeString &source = that;
  copyFrom(source);
}
308
// Substring constructor: starts as an empty short string, then uses
// setTo() to take the part of that beginning at srcStart.
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  const UnicodeString &source = that;
  setTo(source, srcStart);
}
314
// Substring constructor: starts as an empty short string, then uses
// setTo() to take srcLength units of that beginning at srcStart.
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart,
                             int32_t srcLength) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  const UnicodeString &source = that;
  setTo(source, srcStart, srcLength);
}
321
322 // Replaceable base class clone() default implementation, does not clone
323 Replaceable *
clone() const324 Replaceable::clone() const {
325 return NULL;
326 }
327
328 // UnicodeString overrides clone() with a real implementation
329 Replaceable *
clone() const330 UnicodeString::clone() const {
331 return new UnicodeString(*this);
332 }
333
334 //========================================
335 // array allocation
336 //========================================
337
338 UBool
allocate(int32_t capacity)339 UnicodeString::allocate(int32_t capacity) {
340 if(capacity <= US_STACKBUF_SIZE) {
341 fUnion.fFields.fLengthAndFlags = kShortString;
342 } else {
343 // count bytes for the refCounter and the string capacity, and
344 // round up to a multiple of 16; then divide by 4 and allocate int32_t's
345 // to be safely aligned for the refCount
346 // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
347 int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
348 int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
349 if(array != 0) {
350 // set initial refCount and point behind the refCount
351 *array++ = 1;
352
353 // have fArray point to the first UChar
354 fUnion.fFields.fArray = (UChar *)array;
355 fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
356 fUnion.fFields.fLengthAndFlags = kLongString;
357 } else {
358 fUnion.fFields.fLengthAndFlags = kIsBogus;
359 fUnion.fFields.fArray = 0;
360 fUnion.fFields.fCapacity = 0;
361 return FALSE;
362 }
363 }
364 return TRUE;
365 }
366
367 //========================================
368 // Destructor
369 //========================================
~UnicodeString()370 UnicodeString::~UnicodeString()
371 {
372 releaseArray();
373 }
374
375 //========================================
376 // Factory methods
377 //========================================
378
fromUTF8(const StringPiece & utf8)379 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
380 UnicodeString result;
381 result.setToUTF8(utf8);
382 return result;
383 }
384
fromUTF32(const UChar32 * utf32,int32_t length)385 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
386 UnicodeString result;
387 int32_t capacity;
388 // Most UTF-32 strings will be BMP-only and result in a same-length
389 // UTF-16 string. We overestimate the capacity just slightly,
390 // just in case there are a few supplementary characters.
391 if(length <= US_STACKBUF_SIZE) {
392 capacity = US_STACKBUF_SIZE;
393 } else {
394 capacity = length + (length >> 4) + 4;
395 }
396 do {
397 UChar *utf16 = result.getBuffer(capacity);
398 int32_t length16;
399 UErrorCode errorCode = U_ZERO_ERROR;
400 u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
401 utf32, length,
402 0xfffd, // Substitution character.
403 NULL, // Don't care about number of substitutions.
404 &errorCode);
405 result.releaseBuffer(length16);
406 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
407 capacity = length16 + 1; // +1 for the terminating NUL.
408 continue;
409 } else if(U_FAILURE(errorCode)) {
410 result.setToBogus();
411 }
412 break;
413 } while(TRUE);
414 return result;
415 }
416
417 //========================================
418 // Assignment
419 //========================================
420
421 UnicodeString &
operator =(const UnicodeString & src)422 UnicodeString::operator=(const UnicodeString &src) {
423 return copyFrom(src);
424 }
425
426 UnicodeString &
fastCopyFrom(const UnicodeString & src)427 UnicodeString::fastCopyFrom(const UnicodeString &src) {
428 return copyFrom(src, TRUE);
429 }
430
// Makes this string a copy of src and returns *this.
//
// - Self-assignment is a no-op.
// - A bogus src makes this string bogus; an empty src makes it empty.
// - Short strings are copied unit by unit into the in-object buffer.
// - Reference-counted buffers are shared: the counter is incremented and
//   the array pointer, capacity and length are copied.
// - A read-only alias is kept as an alias only when fastCopy is set;
//   otherwise, and for writable aliases, a new buffer is allocated and the
//   contents are copied.
// - If that allocation fails, this string becomes bogus.
UnicodeString &
UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
  // if assigning to ourselves, do nothing
  if(this == &src) {
    return *this;
  }

  // is the right side bogus?
  if(src.isBogus()) {
    setToBogus();
    return *this;
  }

  // delete the current contents
  releaseArray();

  if(src.isEmpty()) {
    // empty string - use the stack buffer
    setToEmpty();
    return *this;
  }

  // fLength>0 and not an "open" src.getBuffer(minCapacity)
  // Take over src's flags and short length first; the storage-specific
  // fields are filled in per case below.
  fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
  switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
  case kShortString:
    // short string using the stack buffer, do the same
    uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
                getShortLength() * U_SIZEOF_UCHAR);
    break;
  case kLongString:
    // src uses a refCounted string buffer, use that buffer with refCount
    // src is const, use a cast - we don't actually change it
    ((UnicodeString &)src).addRef();
    // copy all fields, share the reference-counted buffer
    fUnion.fFields.fArray = src.fUnion.fFields.fArray;
    fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
    // fLength is only copied when the length is not held in the flags
    if(!hasShortLength()) {
      fUnion.fFields.fLength = src.fUnion.fFields.fLength;
    }
    break;
  case kReadonlyAlias:
    if(fastCopy) {
      // src is a readonly alias, do the same
      // -> maintain the readonly alias as such
      fUnion.fFields.fArray = src.fUnion.fFields.fArray;
      fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
      if(!hasShortLength()) {
        fUnion.fFields.fLength = src.fUnion.fFields.fLength;
      }
      break;
    }
    // else if(!fastCopy) fall through to case kWritableAlias
    // -> allocate a new buffer and copy the contents
  case kWritableAlias: {
    // src is a writable alias; we make a copy of that instead
    int32_t srcLength = src.length();
    if(allocate(srcLength)) {
      uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR);
      setLength(srcLength);
      break;
    }
    // if there is not enough memory, then fall through to setting to bogus
  }
  default:
    // Reached for an unrecognized storage type or after a failed
    // allocate() above (a bogus src was already handled at the top).
    // do not call setToBogus() here because fArray and flags are not consistent here
    fUnion.fFields.fLengthAndFlags = kIsBogus;
    fUnion.fFields.fArray = 0;
    fUnion.fFields.fCapacity = 0;
    break;
  }

  return *this;
}
506
507 //========================================
508 // Miscellaneous operations
509 //========================================
510
unescape() const511 UnicodeString UnicodeString::unescape() const {
512 UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
513 const UChar *array = getBuffer();
514 int32_t len = length();
515 int32_t prev = 0;
516 for (int32_t i=0;;) {
517 if (i == len) {
518 result.append(array, prev, len - prev);
519 break;
520 }
521 if (array[i++] == 0x5C /*'\\'*/) {
522 result.append(array, prev, (i - 1) - prev);
523 UChar32 c = unescapeAt(i); // advances i
524 if (c < 0) {
525 result.remove(); // return empty string
526 break; // invalid escape sequence
527 }
528 result.append(c);
529 prev = i;
530 }
531 }
532 return result;
533 }
534
unescapeAt(int32_t & offset) const535 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
536 return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
537 }
538
539 //========================================
540 // Read-only implementation
541 //========================================
542 UBool
doEquals(const UnicodeString & text,int32_t len) const543 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
544 // Requires: this & text not bogus and have same lengths.
545 // Byte-wise comparison works for equality regardless of endianness.
546 return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
547 }
548
549 int8_t
doCompare(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength) const550 UnicodeString::doCompare( int32_t start,
551 int32_t length,
552 const UChar *srcChars,
553 int32_t srcStart,
554 int32_t srcLength) const
555 {
556 // compare illegal string values
557 if(isBogus()) {
558 return -1;
559 }
560
561 // pin indices to legal values
562 pinIndices(start, length);
563
564 if(srcChars == NULL) {
565 // treat const UChar *srcChars==NULL as an empty string
566 return length == 0 ? 0 : 1;
567 }
568
569 // get the correct pointer
570 const UChar *chars = getArrayStart();
571
572 chars += start;
573 srcChars += srcStart;
574
575 int32_t minLength;
576 int8_t lengthResult;
577
578 // get the srcLength if necessary
579 if(srcLength < 0) {
580 srcLength = u_strlen(srcChars + srcStart);
581 }
582
583 // are we comparing different lengths?
584 if(length != srcLength) {
585 if(length < srcLength) {
586 minLength = length;
587 lengthResult = -1;
588 } else {
589 minLength = srcLength;
590 lengthResult = 1;
591 }
592 } else {
593 minLength = length;
594 lengthResult = 0;
595 }
596
597 /*
598 * note that uprv_memcmp() returns an int but we return an int8_t;
599 * we need to take care not to truncate the result -
600 * one way to do this is to right-shift the value to
601 * move the sign bit into the lower 8 bits and making sure that this
602 * does not become 0 itself
603 */
604
605 if(minLength > 0 && chars != srcChars) {
606 int32_t result;
607
608 # if U_IS_BIG_ENDIAN
609 // big-endian: byte comparison works
610 result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
611 if(result != 0) {
612 return (int8_t)(result >> 15 | 1);
613 }
614 # else
615 // little-endian: compare UChar units
616 do {
617 result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
618 if(result != 0) {
619 return (int8_t)(result >> 15 | 1);
620 }
621 } while(--minLength > 0);
622 # endif
623 }
624 return lengthResult;
625 }
626
627 /* String compare in code point order - doCompare() compares in code unit order. */
628 int8_t
doCompareCodePointOrder(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength) const629 UnicodeString::doCompareCodePointOrder(int32_t start,
630 int32_t length,
631 const UChar *srcChars,
632 int32_t srcStart,
633 int32_t srcLength) const
634 {
635 // compare illegal string values
636 // treat const UChar *srcChars==NULL as an empty string
637 if(isBogus()) {
638 return -1;
639 }
640
641 // pin indices to legal values
642 pinIndices(start, length);
643
644 if(srcChars == NULL) {
645 srcStart = srcLength = 0;
646 }
647
648 int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
649 /* translate the 32-bit result into an 8-bit one */
650 if(diff!=0) {
651 return (int8_t)(diff >> 15 | 1);
652 } else {
653 return 0;
654 }
655 }
656
657 int32_t
getLength() const658 UnicodeString::getLength() const {
659 return length();
660 }
661
662 UChar
getCharAt(int32_t offset) const663 UnicodeString::getCharAt(int32_t offset) const {
664 return charAt(offset);
665 }
666
667 UChar32
getChar32At(int32_t offset) const668 UnicodeString::getChar32At(int32_t offset) const {
669 return char32At(offset);
670 }
671
672 UChar32
char32At(int32_t offset) const673 UnicodeString::char32At(int32_t offset) const
674 {
675 int32_t len = length();
676 if((uint32_t)offset < (uint32_t)len) {
677 const UChar *array = getArrayStart();
678 UChar32 c;
679 U16_GET(array, 0, offset, len, c);
680 return c;
681 } else {
682 return kInvalidUChar;
683 }
684 }
685
686 int32_t
getChar32Start(int32_t offset) const687 UnicodeString::getChar32Start(int32_t offset) const {
688 if((uint32_t)offset < (uint32_t)length()) {
689 const UChar *array = getArrayStart();
690 U16_SET_CP_START(array, 0, offset);
691 return offset;
692 } else {
693 return 0;
694 }
695 }
696
697 int32_t
getChar32Limit(int32_t offset) const698 UnicodeString::getChar32Limit(int32_t offset) const {
699 int32_t len = length();
700 if((uint32_t)offset < (uint32_t)len) {
701 const UChar *array = getArrayStart();
702 U16_SET_CP_LIMIT(array, 0, offset, len);
703 return offset;
704 } else {
705 return len;
706 }
707 }
708
709 int32_t
countChar32(int32_t start,int32_t length) const710 UnicodeString::countChar32(int32_t start, int32_t length) const {
711 pinIndices(start, length);
712 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
713 return u_countChar32(getArrayStart()+start, length);
714 }
715
716 UBool
hasMoreChar32Than(int32_t start,int32_t length,int32_t number) const717 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
718 pinIndices(start, length);
719 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
720 return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
721 }
722
723 int32_t
moveIndex32(int32_t index,int32_t delta) const724 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
725 // pin index
726 int32_t len = length();
727 if(index<0) {
728 index=0;
729 } else if(index>len) {
730 index=len;
731 }
732
733 const UChar *array = getArrayStart();
734 if(delta>0) {
735 U16_FWD_N(array, index, len, delta);
736 } else {
737 U16_BACK_N(array, 0, index, -delta);
738 }
739
740 return index;
741 }
742
743 void
doExtract(int32_t start,int32_t length,UChar * dst,int32_t dstStart) const744 UnicodeString::doExtract(int32_t start,
745 int32_t length,
746 UChar *dst,
747 int32_t dstStart) const
748 {
749 // pin indices to legal values
750 pinIndices(start, length);
751
752 // do not copy anything if we alias dst itself
753 const UChar *array = getArrayStart();
754 if(array + start != dst + dstStart) {
755 us_arrayCopy(array, start, dst, dstStart, length);
756 }
757 }
758
759 int32_t
extract(UChar * dest,int32_t destCapacity,UErrorCode & errorCode) const760 UnicodeString::extract(UChar *dest, int32_t destCapacity,
761 UErrorCode &errorCode) const {
762 int32_t len = length();
763 if(U_SUCCESS(errorCode)) {
764 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
765 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
766 } else {
767 const UChar *array = getArrayStart();
768 if(len>0 && len<=destCapacity && array!=dest) {
769 uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR);
770 }
771 return u_terminateUChars(dest, destCapacity, len, &errorCode);
772 }
773 }
774
775 return len;
776 }
777
778 int32_t
extract(int32_t start,int32_t length,char * target,int32_t targetCapacity,enum EInvariant) const779 UnicodeString::extract(int32_t start,
780 int32_t length,
781 char *target,
782 int32_t targetCapacity,
783 enum EInvariant) const
784 {
785 // if the arguments are illegal, then do nothing
786 if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
787 return 0;
788 }
789
790 // pin the indices to legal values
791 pinIndices(start, length);
792
793 if(length <= targetCapacity) {
794 u_UCharsToChars(getArrayStart() + start, target, length);
795 }
796 UErrorCode status = U_ZERO_ERROR;
797 return u_terminateChars(target, targetCapacity, length, &status);
798 }
799
800 UnicodeString
tempSubString(int32_t start,int32_t len) const801 UnicodeString::tempSubString(int32_t start, int32_t len) const {
802 pinIndices(start, len);
803 const UChar *array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer
804 if(array==NULL) {
805 array=fUnion.fStackFields.fBuffer; // anything not NULL because that would make an empty string
806 len=-2; // bogus result string
807 }
808 return UnicodeString(FALSE, array + start, len);
809 }
810
811 int32_t
toUTF8(int32_t start,int32_t len,char * target,int32_t capacity) const812 UnicodeString::toUTF8(int32_t start, int32_t len,
813 char *target, int32_t capacity) const {
814 pinIndices(start, len);
815 int32_t length8;
816 UErrorCode errorCode = U_ZERO_ERROR;
817 u_strToUTF8WithSub(target, capacity, &length8,
818 getBuffer() + start, len,
819 0xFFFD, // Standard substitution character.
820 NULL, // Don't care about number of substitutions.
821 &errorCode);
822 return length8;
823 }
824
825 #if U_CHARSET_IS_UTF8
826
827 int32_t
extract(int32_t start,int32_t len,char * target,uint32_t dstSize) const828 UnicodeString::extract(int32_t start, int32_t len,
829 char *target, uint32_t dstSize) const {
830 // if the arguments are illegal, then do nothing
831 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
832 return 0;
833 }
834 return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
835 }
836
837 // else see unistr_cnv.cpp
838 #endif
839
840 void
extractBetween(int32_t start,int32_t limit,UnicodeString & target) const841 UnicodeString::extractBetween(int32_t start,
842 int32_t limit,
843 UnicodeString& target) const {
844 pinIndex(start);
845 pinIndex(limit);
846 doExtract(start, limit - start, target);
847 }
848
849 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
850 // as many bytes as the source has UChars.
851 // The "worst cases" are writing systems like Indic, Thai and CJK with
852 // 3:1 bytes:UChars.
853 void
toUTF8(ByteSink & sink) const854 UnicodeString::toUTF8(ByteSink &sink) const {
855 int32_t length16 = length();
856 if(length16 != 0) {
857 char stackBuffer[1024];
858 int32_t capacity = (int32_t)sizeof(stackBuffer);
859 UBool utf8IsOwned = FALSE;
860 char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
861 3*length16,
862 stackBuffer, capacity,
863 &capacity);
864 int32_t length8 = 0;
865 UErrorCode errorCode = U_ZERO_ERROR;
866 u_strToUTF8WithSub(utf8, capacity, &length8,
867 getBuffer(), length16,
868 0xFFFD, // Standard substitution character.
869 NULL, // Don't care about number of substitutions.
870 &errorCode);
871 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
872 utf8 = (char *)uprv_malloc(length8);
873 if(utf8 != NULL) {
874 utf8IsOwned = TRUE;
875 errorCode = U_ZERO_ERROR;
876 u_strToUTF8WithSub(utf8, length8, &length8,
877 getBuffer(), length16,
878 0xFFFD, // Standard substitution character.
879 NULL, // Don't care about number of substitutions.
880 &errorCode);
881 } else {
882 errorCode = U_MEMORY_ALLOCATION_ERROR;
883 }
884 }
885 if(U_SUCCESS(errorCode)) {
886 sink.Append(utf8, length8);
887 sink.Flush();
888 }
889 if(utf8IsOwned) {
890 uprv_free(utf8);
891 }
892 }
893 }
894
895 int32_t
toUTF32(UChar32 * utf32,int32_t capacity,UErrorCode & errorCode) const896 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
897 int32_t length32=0;
898 if(U_SUCCESS(errorCode)) {
899 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
900 u_strToUTF32WithSub(utf32, capacity, &length32,
901 getBuffer(), length(),
902 0xfffd, // Substitution character.
903 NULL, // Don't care about number of substitutions.
904 &errorCode);
905 }
906 return length32;
907 }
908
909 int32_t
indexOf(const UChar * srcChars,int32_t srcStart,int32_t srcLength,int32_t start,int32_t length) const910 UnicodeString::indexOf(const UChar *srcChars,
911 int32_t srcStart,
912 int32_t srcLength,
913 int32_t start,
914 int32_t length) const
915 {
916 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
917 return -1;
918 }
919
920 // UnicodeString does not find empty substrings
921 if(srcLength < 0 && srcChars[srcStart] == 0) {
922 return -1;
923 }
924
925 // get the indices within bounds
926 pinIndices(start, length);
927
928 // find the first occurrence of the substring
929 const UChar *array = getArrayStart();
930 const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
931 if(match == NULL) {
932 return -1;
933 } else {
934 return (int32_t)(match - array);
935 }
936 }
937
938 int32_t
doIndexOf(UChar c,int32_t start,int32_t length) const939 UnicodeString::doIndexOf(UChar c,
940 int32_t start,
941 int32_t length) const
942 {
943 // pin indices
944 pinIndices(start, length);
945
946 // find the first occurrence of c
947 const UChar *array = getArrayStart();
948 const UChar *match = u_memchr(array + start, c, length);
949 if(match == NULL) {
950 return -1;
951 } else {
952 return (int32_t)(match - array);
953 }
954 }
955
956 int32_t
doIndexOf(UChar32 c,int32_t start,int32_t length) const957 UnicodeString::doIndexOf(UChar32 c,
958 int32_t start,
959 int32_t length) const {
960 // pin indices
961 pinIndices(start, length);
962
963 // find the first occurrence of c
964 const UChar *array = getArrayStart();
965 const UChar *match = u_memchr32(array + start, c, length);
966 if(match == NULL) {
967 return -1;
968 } else {
969 return (int32_t)(match - array);
970 }
971 }
972
973 int32_t
lastIndexOf(const UChar * srcChars,int32_t srcStart,int32_t srcLength,int32_t start,int32_t length) const974 UnicodeString::lastIndexOf(const UChar *srcChars,
975 int32_t srcStart,
976 int32_t srcLength,
977 int32_t start,
978 int32_t length) const
979 {
980 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
981 return -1;
982 }
983
984 // UnicodeString does not find empty substrings
985 if(srcLength < 0 && srcChars[srcStart] == 0) {
986 return -1;
987 }
988
989 // get the indices within bounds
990 pinIndices(start, length);
991
992 // find the last occurrence of the substring
993 const UChar *array = getArrayStart();
994 const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
995 if(match == NULL) {
996 return -1;
997 } else {
998 return (int32_t)(match - array);
999 }
1000 }
1001
1002 int32_t
doLastIndexOf(UChar c,int32_t start,int32_t length) const1003 UnicodeString::doLastIndexOf(UChar c,
1004 int32_t start,
1005 int32_t length) const
1006 {
1007 if(isBogus()) {
1008 return -1;
1009 }
1010
1011 // pin indices
1012 pinIndices(start, length);
1013
1014 // find the last occurrence of c
1015 const UChar *array = getArrayStart();
1016 const UChar *match = u_memrchr(array + start, c, length);
1017 if(match == NULL) {
1018 return -1;
1019 } else {
1020 return (int32_t)(match - array);
1021 }
1022 }
1023
1024 int32_t
doLastIndexOf(UChar32 c,int32_t start,int32_t length) const1025 UnicodeString::doLastIndexOf(UChar32 c,
1026 int32_t start,
1027 int32_t length) const {
1028 // pin indices
1029 pinIndices(start, length);
1030
1031 // find the last occurrence of c
1032 const UChar *array = getArrayStart();
1033 const UChar *match = u_memrchr32(array + start, c, length);
1034 if(match == NULL) {
1035 return -1;
1036 } else {
1037 return (int32_t)(match - array);
1038 }
1039 }
1040
1041 //========================================
1042 // Write implementation
1043 //========================================
1044
1045 UnicodeString&
findAndReplace(int32_t start,int32_t length,const UnicodeString & oldText,int32_t oldStart,int32_t oldLength,const UnicodeString & newText,int32_t newStart,int32_t newLength)1046 UnicodeString::findAndReplace(int32_t start,
1047 int32_t length,
1048 const UnicodeString& oldText,
1049 int32_t oldStart,
1050 int32_t oldLength,
1051 const UnicodeString& newText,
1052 int32_t newStart,
1053 int32_t newLength)
1054 {
1055 if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1056 return *this;
1057 }
1058
1059 pinIndices(start, length);
1060 oldText.pinIndices(oldStart, oldLength);
1061 newText.pinIndices(newStart, newLength);
1062
1063 if(oldLength == 0) {
1064 return *this;
1065 }
1066
1067 while(length > 0 && length >= oldLength) {
1068 int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1069 if(pos < 0) {
1070 // no more oldText's here: done
1071 break;
1072 } else {
1073 // we found oldText, replace it by newText and go beyond it
1074 replace(pos, oldLength, newText, newStart, newLength);
1075 length -= pos + oldLength - start;
1076 start = pos + newLength;
1077 }
1078 }
1079
1080 return *this;
1081 }
1082
1083
1084 void
setToBogus()1085 UnicodeString::setToBogus()
1086 {
1087 releaseArray();
1088
1089 fUnion.fFields.fLengthAndFlags = kIsBogus;
1090 fUnion.fFields.fArray = 0;
1091 fUnion.fFields.fCapacity = 0;
1092 }
1093
1094 // turn a bogus string into an empty one
1095 void
unBogus()1096 UnicodeString::unBogus() {
1097 if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
1098 setToEmpty();
1099 }
1100 }
1101
1102 const UChar *
getTerminatedBuffer()1103 UnicodeString::getTerminatedBuffer() {
1104 if(!isWritable()) {
1105 return 0;
1106 }
1107 UChar *array = getArrayStart();
1108 int32_t len = length();
1109 if(len < getCapacity()) {
1110 if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
1111 // If len<capacity on a read-only alias, then array[len] is
1112 // either the original NUL (if constructed with (TRUE, s, length))
1113 // or one of the original string contents characters (if later truncated),
1114 // therefore we can assume that array[len] is initialized memory.
1115 if(array[len] == 0) {
1116 return array;
1117 }
1118 } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
1119 // kRefCounted: Do not write the NUL if the buffer is shared.
1120 // That is mostly safe, except when the length of one copy was modified
1121 // without copy-on-write, e.g., via truncate(newLength) or remove(void).
1122 // Then the NUL would be written into the middle of another copy's string.
1123
1124 // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
1125 // Do not test if there is a NUL already because it might be uninitialized memory.
1126 // (That would be safe, but tools like valgrind & Purify would complain.)
1127 array[len] = 0;
1128 return array;
1129 }
1130 }
1131 if(cloneArrayIfNeeded(len+1)) {
1132 array = getArrayStart();
1133 array[len] = 0;
1134 return array;
1135 } else {
1136 return NULL;
1137 }
1138 }
1139
1140 // setTo() analogous to the readonly-aliasing constructor with the same signature
1141 UnicodeString &
setTo(UBool isTerminated,const UChar * text,int32_t textLength)1142 UnicodeString::setTo(UBool isTerminated,
1143 const UChar *text,
1144 int32_t textLength)
1145 {
1146 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1147 // do not modify a string that has an "open" getBuffer(minCapacity)
1148 return *this;
1149 }
1150
1151 if(text == NULL) {
1152 // treat as an empty string, do not alias
1153 releaseArray();
1154 setToEmpty();
1155 return *this;
1156 }
1157
1158 if( textLength < -1 ||
1159 (textLength == -1 && !isTerminated) ||
1160 (textLength >= 0 && isTerminated && text[textLength] != 0)
1161 ) {
1162 setToBogus();
1163 return *this;
1164 }
1165
1166 releaseArray();
1167
1168 if(textLength == -1) {
1169 // text is terminated, or else it would have failed the above test
1170 textLength = u_strlen(text);
1171 }
1172 fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
1173 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1174 return *this;
1175 }
1176
1177 // setTo() analogous to the writable-aliasing constructor with the same signature
1178 UnicodeString &
setTo(UChar * buffer,int32_t buffLength,int32_t buffCapacity)1179 UnicodeString::setTo(UChar *buffer,
1180 int32_t buffLength,
1181 int32_t buffCapacity) {
1182 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1183 // do not modify a string that has an "open" getBuffer(minCapacity)
1184 return *this;
1185 }
1186
1187 if(buffer == NULL) {
1188 // treat as an empty string, do not alias
1189 releaseArray();
1190 setToEmpty();
1191 return *this;
1192 }
1193
1194 if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1195 setToBogus();
1196 return *this;
1197 } else if(buffLength == -1) {
1198 // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1199 const UChar *p = buffer, *limit = buffer + buffCapacity;
1200 while(p != limit && *p != 0) {
1201 ++p;
1202 }
1203 buffLength = (int32_t)(p - buffer);
1204 }
1205
1206 releaseArray();
1207
1208 fUnion.fFields.fLengthAndFlags = kWritableAlias;
1209 setArray(buffer, buffLength, buffCapacity);
1210 return *this;
1211 }
1212
setToUTF8(const StringPiece & utf8)1213 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) {
1214 unBogus();
1215 int32_t length = utf8.length();
1216 int32_t capacity;
1217 // The UTF-16 string will be at most as long as the UTF-8 string.
1218 if(length <= US_STACKBUF_SIZE) {
1219 capacity = US_STACKBUF_SIZE;
1220 } else {
1221 capacity = length + 1; // +1 for the terminating NUL.
1222 }
1223 UChar *utf16 = getBuffer(capacity);
1224 int32_t length16;
1225 UErrorCode errorCode = U_ZERO_ERROR;
1226 u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1227 utf8.data(), length,
1228 0xfffd, // Substitution character.
1229 NULL, // Don't care about number of substitutions.
1230 &errorCode);
1231 releaseBuffer(length16);
1232 if(U_FAILURE(errorCode)) {
1233 setToBogus();
1234 }
1235 return *this;
1236 }
1237
1238 UnicodeString&
setCharAt(int32_t offset,UChar c)1239 UnicodeString::setCharAt(int32_t offset,
1240 UChar c)
1241 {
1242 int32_t len = length();
1243 if(cloneArrayIfNeeded() && len > 0) {
1244 if(offset < 0) {
1245 offset = 0;
1246 } else if(offset >= len) {
1247 offset = len - 1;
1248 }
1249
1250 getArrayStart()[offset] = c;
1251 }
1252 return *this;
1253 }
1254
1255 UnicodeString&
replace(int32_t start,int32_t _length,UChar32 srcChar)1256 UnicodeString::replace(int32_t start,
1257 int32_t _length,
1258 UChar32 srcChar) {
1259 UChar buffer[U16_MAX_LENGTH];
1260 int32_t count = 0;
1261 UBool isError = FALSE;
1262 U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1263 // We test isError so that the compiler does not complain that we don't.
1264 // If isError (srcChar is not a valid code point) then count==0 which means
1265 // we remove the source segment rather than replacing it with srcChar.
1266 return doReplace(start, _length, buffer, 0, isError ? 0 : count);
1267 }
1268
1269 UnicodeString&
append(UChar32 srcChar)1270 UnicodeString::append(UChar32 srcChar) {
1271 UChar buffer[U16_MAX_LENGTH];
1272 int32_t _length = 0;
1273 UBool isError = FALSE;
1274 U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1275 // We test isError so that the compiler does not complain that we don't.
1276 // If isError then _length==0 which turns the doReplace() into a no-op anyway.
1277 return isError ? *this : doReplace(length(), 0, buffer, 0, _length);
1278 }
1279
1280 UnicodeString&
doReplace(int32_t start,int32_t length,const UnicodeString & src,int32_t srcStart,int32_t srcLength)1281 UnicodeString::doReplace( int32_t start,
1282 int32_t length,
1283 const UnicodeString& src,
1284 int32_t srcStart,
1285 int32_t srcLength)
1286 {
1287 if(!src.isBogus()) {
1288 // pin the indices to legal values
1289 src.pinIndices(srcStart, srcLength);
1290
1291 // get the characters from src
1292 // and replace the range in ourselves with them
1293 return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1294 } else {
1295 // remove the range
1296 return doReplace(start, length, 0, 0, 0);
1297 }
1298 }
1299
1300 UnicodeString&
doReplace(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength)1301 UnicodeString::doReplace(int32_t start,
1302 int32_t length,
1303 const UChar *srcChars,
1304 int32_t srcStart,
1305 int32_t srcLength)
1306 {
1307 if(!isWritable()) {
1308 return *this;
1309 }
1310
1311 int32_t oldLength = this->length();
1312
1313 // optimize (read-only alias).remove(0, start) and .remove(start, end)
1314 if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
1315 if(start == 0) {
1316 // remove prefix by adjusting the array pointer
1317 pinIndex(length);
1318 fUnion.fFields.fArray += length;
1319 fUnion.fFields.fCapacity -= length;
1320 setLength(oldLength - length);
1321 return *this;
1322 } else {
1323 pinIndex(start);
1324 if(length >= (oldLength - start)) {
1325 // remove suffix by reducing the length (like truncate())
1326 setLength(start);
1327 fUnion.fFields.fCapacity = start; // not NUL-terminated any more
1328 return *this;
1329 }
1330 }
1331 }
1332
1333 if(srcChars == 0) {
1334 srcStart = srcLength = 0;
1335 } else if(srcLength < 0) {
1336 // get the srcLength if necessary
1337 srcLength = u_strlen(srcChars + srcStart);
1338 }
1339
1340 // calculate the size of the string after the replace
1341 int32_t newLength;
1342
1343 // optimize append() onto a large-enough, owned string
1344 if(start >= oldLength) {
1345 if(srcLength == 0) {
1346 return *this;
1347 }
1348 newLength = oldLength + srcLength;
1349 if(newLength <= getCapacity() && isBufferWritable()) {
1350 UChar *oldArray = getArrayStart();
1351 // Do not copy characters when
1352 // UChar *buffer=str.getAppendBuffer(...);
1353 // is followed by
1354 // str.append(buffer, length);
1355 // or
1356 // str.appendString(buffer, length)
1357 // or similar.
1358 if(srcChars + srcStart != oldArray + start || start > oldLength) {
1359 us_arrayCopy(srcChars, srcStart, oldArray, oldLength, srcLength);
1360 }
1361 setLength(newLength);
1362 return *this;
1363 } else {
1364 // pin the indices to legal values
1365 start = oldLength;
1366 length = 0;
1367 }
1368 } else {
1369 // pin the indices to legal values
1370 pinIndices(start, length);
1371
1372 newLength = oldLength - length + srcLength;
1373 }
1374
1375 // the following may change fArray but will not copy the current contents;
1376 // therefore we need to keep the current fArray
1377 UChar oldStackBuffer[US_STACKBUF_SIZE];
1378 UChar *oldArray;
1379 if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
1380 // copy the stack buffer contents because it will be overwritten with
1381 // fUnion.fFields values
1382 u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength);
1383 oldArray = oldStackBuffer;
1384 } else {
1385 oldArray = getArrayStart();
1386 }
1387
1388 // clone our array and allocate a bigger array if needed
1389 int32_t *bufferToDelete = 0;
1390 if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize,
1391 FALSE, &bufferToDelete)
1392 ) {
1393 return *this;
1394 }
1395
1396 // now do the replace
1397
1398 UChar *newArray = getArrayStart();
1399 if(newArray != oldArray) {
1400 // if fArray changed, then we need to copy everything except what will change
1401 us_arrayCopy(oldArray, 0, newArray, 0, start);
1402 us_arrayCopy(oldArray, start + length,
1403 newArray, start + srcLength,
1404 oldLength - (start + length));
1405 } else if(length != srcLength) {
1406 // fArray did not change; copy only the portion that isn't changing, leaving a hole
1407 us_arrayCopy(oldArray, start + length,
1408 newArray, start + srcLength,
1409 oldLength - (start + length));
1410 }
1411
1412 // now fill in the hole with the new string
1413 us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
1414
1415 setLength(newLength);
1416
1417 // delayed delete in case srcChars == fArray when we started, and
1418 // to keep oldArray alive for the above operations
1419 if (bufferToDelete) {
1420 uprv_free(bufferToDelete);
1421 }
1422
1423 return *this;
1424 }
1425
1426 /**
1427 * Replaceable API
1428 */
1429 void
handleReplaceBetween(int32_t start,int32_t limit,const UnicodeString & text)1430 UnicodeString::handleReplaceBetween(int32_t start,
1431 int32_t limit,
1432 const UnicodeString& text) {
1433 replaceBetween(start, limit, text);
1434 }
1435
1436 /**
1437 * Replaceable API
1438 */
1439 void
copy(int32_t start,int32_t limit,int32_t dest)1440 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1441 if (limit <= start) {
1442 return; // Nothing to do; avoid bogus malloc call
1443 }
1444 UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1445 // Check to make sure text is not null.
1446 if (text != NULL) {
1447 extractBetween(start, limit, text, 0);
1448 insert(dest, text, 0, limit - start);
1449 uprv_free(text);
1450 }
1451 }
1452
1453 /**
1454 * Replaceable API
1455 *
1456 * NOTE: This is for the Replaceable class. There is no rep.cpp,
1457 * so we implement this function here.
1458 */
hasMetaData() const1459 UBool Replaceable::hasMetaData() const {
1460 return TRUE;
1461 }
1462
1463 /**
1464 * Replaceable API
1465 */
hasMetaData() const1466 UBool UnicodeString::hasMetaData() const {
1467 return FALSE;
1468 }
1469
1470 UnicodeString&
doReverse(int32_t start,int32_t length)1471 UnicodeString::doReverse(int32_t start, int32_t length) {
1472 if(length <= 1 || !cloneArrayIfNeeded()) {
1473 return *this;
1474 }
1475
1476 // pin the indices to legal values
1477 pinIndices(start, length);
1478 if(length <= 1) { // pinIndices() might have shrunk the length
1479 return *this;
1480 }
1481
1482 UChar *left = getArrayStart() + start;
1483 UChar *right = left + length - 1; // -1 for inclusive boundary (length>=2)
1484 UChar swap;
1485 UBool hasSupplementary = FALSE;
1486
1487 // Before the loop we know left<right because length>=2.
1488 do {
1489 hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
1490 hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
1491 *right-- = swap;
1492 } while(left < right);
1493 // Make sure to test the middle code unit of an odd-length string.
1494 // Redundant if the length is even.
1495 hasSupplementary |= (UBool)U16_IS_LEAD(*left);
1496
1497 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1498 if(hasSupplementary) {
1499 UChar swap2;
1500
1501 left = getArrayStart() + start;
1502 right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
1503 while(left < right) {
1504 if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
1505 *left++ = swap2;
1506 *left++ = swap;
1507 } else {
1508 ++left;
1509 }
1510 }
1511 }
1512
1513 return *this;
1514 }
1515
1516 UBool
padLeading(int32_t targetLength,UChar padChar)1517 UnicodeString::padLeading(int32_t targetLength,
1518 UChar padChar)
1519 {
1520 int32_t oldLength = length();
1521 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1522 return FALSE;
1523 } else {
1524 // move contents up by padding width
1525 UChar *array = getArrayStart();
1526 int32_t start = targetLength - oldLength;
1527 us_arrayCopy(array, 0, array, start, oldLength);
1528
1529 // fill in padding character
1530 while(--start >= 0) {
1531 array[start] = padChar;
1532 }
1533 setLength(targetLength);
1534 return TRUE;
1535 }
1536 }
1537
1538 UBool
padTrailing(int32_t targetLength,UChar padChar)1539 UnicodeString::padTrailing(int32_t targetLength,
1540 UChar padChar)
1541 {
1542 int32_t oldLength = length();
1543 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1544 return FALSE;
1545 } else {
1546 // fill in padding character
1547 UChar *array = getArrayStart();
1548 int32_t length = targetLength;
1549 while(--length >= oldLength) {
1550 array[length] = padChar;
1551 }
1552 setLength(targetLength);
1553 return TRUE;
1554 }
1555 }
1556
1557 //========================================
1558 // Hashing
1559 //========================================
1560 int32_t
doHashCode() const1561 UnicodeString::doHashCode() const
1562 {
1563 /* Delegate hash computation to uhash. This makes UnicodeString
1564 * hashing consistent with UChar* hashing. */
1565 int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1566 if (hashCode == kInvalidHashCode) {
1567 hashCode = kEmptyHashCode;
1568 }
1569 return hashCode;
1570 }
1571
1572 //========================================
1573 // External Buffer
1574 //========================================
1575
1576 UChar *
getBuffer(int32_t minCapacity)1577 UnicodeString::getBuffer(int32_t minCapacity) {
1578 if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1579 fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
1580 setZeroLength();
1581 return getArrayStart();
1582 } else {
1583 return 0;
1584 }
1585 }
1586
1587 void
releaseBuffer(int32_t newLength)1588 UnicodeString::releaseBuffer(int32_t newLength) {
1589 if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
1590 // set the new fLength
1591 int32_t capacity=getCapacity();
1592 if(newLength==-1) {
1593 // the new length is the string length, capped by fCapacity
1594 const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1595 while(p<limit && *p!=0) {
1596 ++p;
1597 }
1598 newLength=(int32_t)(p-array);
1599 } else if(newLength>capacity) {
1600 newLength=capacity;
1601 }
1602 setLength(newLength);
1603 fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
1604 }
1605 }
1606
1607 //========================================
1608 // Miscellaneous
1609 //========================================
1610 UBool
cloneArrayIfNeeded(int32_t newCapacity,int32_t growCapacity,UBool doCopyArray,int32_t ** pBufferToDelete,UBool forceClone)1611 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
1612 int32_t growCapacity,
1613 UBool doCopyArray,
1614 int32_t **pBufferToDelete,
1615 UBool forceClone) {
1616 // default parameters need to be static, therefore
1617 // the defaults are -1 to have convenience defaults
1618 if(newCapacity == -1) {
1619 newCapacity = getCapacity();
1620 }
1621
1622 // while a getBuffer(minCapacity) is "open",
1623 // prevent any modifications of the string by returning FALSE here
1624 // if the string is bogus, then only an assignment or similar can revive it
1625 if(!isWritable()) {
1626 return FALSE;
1627 }
1628
1629 /*
1630 * We need to make a copy of the array if
1631 * the buffer is read-only, or
1632 * the buffer is refCounted (shared), and refCount>1, or
1633 * the buffer is too small.
1634 * Return FALSE if memory could not be allocated.
1635 */
1636 if(forceClone ||
1637 fUnion.fFields.fLengthAndFlags & kBufferIsReadonly ||
1638 (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) ||
1639 newCapacity > getCapacity()
1640 ) {
1641 // check growCapacity for default value and use of the stack buffer
1642 if(growCapacity < 0) {
1643 growCapacity = newCapacity;
1644 } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
1645 growCapacity = US_STACKBUF_SIZE;
1646 }
1647
1648 // save old values
1649 UChar oldStackBuffer[US_STACKBUF_SIZE];
1650 UChar *oldArray;
1651 int32_t oldLength = length();
1652 int16_t flags = fUnion.fFields.fLengthAndFlags;
1653
1654 if(flags&kUsingStackBuffer) {
1655 U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */
1656 if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
1657 // copy the stack buffer contents because it will be overwritten with
1658 // fUnion.fFields values
1659 us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength);
1660 oldArray = oldStackBuffer;
1661 } else {
1662 oldArray = NULL; // no need to copy from the stack buffer to itself
1663 }
1664 } else {
1665 oldArray = fUnion.fFields.fArray;
1666 U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
1667 }
1668
1669 // allocate a new array
1670 if(allocate(growCapacity) ||
1671 (newCapacity < growCapacity && allocate(newCapacity))
1672 ) {
1673 if(doCopyArray) {
1674 // copy the contents
1675 // do not copy more than what fits - it may be smaller than before
1676 int32_t minLength = oldLength;
1677 newCapacity = getCapacity();
1678 if(newCapacity < minLength) {
1679 minLength = newCapacity;
1680 }
1681 if(oldArray != NULL) {
1682 us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
1683 }
1684 setLength(minLength);
1685 } else {
1686 setZeroLength();
1687 }
1688
1689 // release the old array
1690 if(flags & kRefCounted) {
1691 // the array is refCounted; decrement and release if 0
1692 u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
1693 if(umtx_atomic_dec(pRefCount) == 0) {
1694 if(pBufferToDelete == 0) {
1695 // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
1696 // is defined as volatile. (Volatile has useful non-standard behavior
1697 // with this compiler.)
1698 uprv_free((void *)pRefCount);
1699 } else {
1700 // the caller requested to delete it himself
1701 *pBufferToDelete = (int32_t *)pRefCount;
1702 }
1703 }
1704 }
1705 } else {
1706 // not enough memory for growCapacity and not even for the smaller newCapacity
1707 // reset the old values for setToBogus() to release the array
1708 if(!(flags&kUsingStackBuffer)) {
1709 fUnion.fFields.fArray = oldArray;
1710 }
1711 fUnion.fFields.fLengthAndFlags = flags;
1712 setToBogus();
1713 return FALSE;
1714 }
1715 }
1716 return TRUE;
1717 }
1718
1719 // UnicodeStringAppendable ------------------------------------------------- ***
1720
~UnicodeStringAppendable()1721 UnicodeStringAppendable::~UnicodeStringAppendable() {}
1722
1723 UBool
appendCodeUnit(UChar c)1724 UnicodeStringAppendable::appendCodeUnit(UChar c) {
1725 return str.doReplace(str.length(), 0, &c, 0, 1).isWritable();
1726 }
1727
1728 UBool
appendCodePoint(UChar32 c)1729 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1730 UChar buffer[U16_MAX_LENGTH];
1731 int32_t cLength = 0;
1732 UBool isError = FALSE;
1733 U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1734 return !isError && str.doReplace(str.length(), 0, buffer, 0, cLength).isWritable();
1735 }
1736
1737 UBool
appendString(const UChar * s,int32_t length)1738 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1739 return str.doReplace(str.length(), 0, s, 0, length).isWritable();
1740 }
1741
1742 UBool
reserveAppendCapacity(int32_t appendCapacity)1743 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1744 return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1745 }
1746
1747 UChar *
getAppendBuffer(int32_t minCapacity,int32_t desiredCapacityHint,UChar * scratch,int32_t scratchCapacity,int32_t * resultCapacity)1748 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1749 int32_t desiredCapacityHint,
1750 UChar *scratch, int32_t scratchCapacity,
1751 int32_t *resultCapacity) {
1752 if(minCapacity < 1 || scratchCapacity < minCapacity) {
1753 *resultCapacity = 0;
1754 return NULL;
1755 }
1756 int32_t oldLength = str.length();
1757 if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1758 *resultCapacity = str.getCapacity() - oldLength;
1759 return str.getArrayStart() + oldLength;
1760 }
1761 *resultCapacity = scratchCapacity;
1762 return scratch;
1763 }
1764
1765 U_NAMESPACE_END
1766
1767 U_NAMESPACE_USE
1768
1769 U_CAPI int32_t U_EXPORT2
uhash_hashUnicodeString(const UElement key)1770 uhash_hashUnicodeString(const UElement key) {
1771 const UnicodeString *str = (const UnicodeString*) key.pointer;
1772 return (str == NULL) ? 0 : str->hashCode();
1773 }
1774
1775 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1776 // does not depend on hashtable code.
1777 U_CAPI UBool U_EXPORT2
uhash_compareUnicodeString(const UElement key1,const UElement key2)1778 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1779 const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1780 const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1781 if (str1 == str2) {
1782 return TRUE;
1783 }
1784 if (str1 == NULL || str2 == NULL) {
1785 return FALSE;
1786 }
1787 return *str1 == *str2;
1788 }
1789
1790 #ifdef U_STATIC_IMPLEMENTATION
1791 /*
1792 This should never be called. It is defined here to make sure that the
1793 virtual vector deleting destructor is defined within unistr.cpp.
1794 The vector deleting destructor is already a part of UObject,
1795 but defining it here makes sure that it is included with this object file.
1796 This makes sure that static library dependencies are kept to a minimum.
1797 */
uprv_UnicodeStringDummy(void)1798 static void uprv_UnicodeStringDummy(void) {
1799 delete [] (new UnicodeString[2]);
1800 }
1801 #endif
1802