1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 * Copyright (C) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ******************************************************************************
8 *
9 * File unistr.cpp
10 *
11 * Modification History:
12 *
13 *   Date        Name        Description
14 *   09/25/98    stephen     Creation.
15 *   04/20/99    stephen     Overhauled per 4/16 code review.
16 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
17 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
18 *                           Replaceable.
19 *   06/25/01    grhoten     Removed the dependency on iostream
20 ******************************************************************************
21 */
22 
23 #include "unicode/utypes.h"
24 #include "unicode/appendable.h"
25 #include "unicode/putil.h"
26 #include "cstring.h"
27 #include "cmemory.h"
28 #include "unicode/ustring.h"
29 #include "unicode/unistr.h"
30 #include "unicode/utf.h"
31 #include "unicode/utf16.h"
32 #include "uelement.h"
33 #include "ustr_imp.h"
34 #include "umutex.h"
35 #include "uassert.h"
36 
37 #if 0
38 
39 #include <iostream>
40 using namespace std;
41 
42 //DEBUGGING
43 void
44 print(const UnicodeString& s,
45       const char *name)
46 {
47   UChar c;
48   cout << name << ":|";
49   for(int i = 0; i < s.length(); ++i) {
50     c = s[i];
51     if(c>= 0x007E || c < 0x0020)
52       cout << "[0x" << hex << s[i] << "]";
53     else
54       cout << (char) s[i];
55   }
56   cout << '|' << endl;
57 }
58 
59 void
60 print(const UChar *s,
61       int32_t len,
62       const char *name)
63 {
64   UChar c;
65   cout << name << ":|";
66   for(int i = 0; i < len; ++i) {
67     c = s[i];
68     if(c>= 0x007E || c < 0x0020)
69       cout << "[0x" << hex << s[i] << "]";
70     else
71       cout << (char) s[i];
72   }
73   cout << '|' << endl;
74 }
75 // END DEBUGGING
76 #endif
77 
78 // Local function definitions for now
79 
80 // need to copy areas that may overlap
81 static
82 inline void
us_arrayCopy(const UChar * src,int32_t srcStart,UChar * dst,int32_t dstStart,int32_t count)83 us_arrayCopy(const UChar *src, int32_t srcStart,
84          UChar *dst, int32_t dstStart, int32_t count)
85 {
86   if(count>0) {
87     uprv_memmove(dst+dstStart, src+srcStart, (size_t)count*sizeof(*src));
88   }
89 }
90 
91 // u_unescapeAt() callback to get a UChar from a UnicodeString
92 U_CDECL_BEGIN
93 static UChar U_CALLCONV
UnicodeString_charAt(int32_t offset,void * context)94 UnicodeString_charAt(int32_t offset, void *context) {
95     return ((icu::UnicodeString*) context)->charAt(offset);
96 }
97 U_CDECL_END
98 
99 U_NAMESPACE_BEGIN
100 
101 /* The Replaceable virtual destructor can't be defined in the header
102    due to how AIX works with multiple definitions of virtual functions.
103 */
~Replaceable()104 Replaceable::~Replaceable() {}
105 
106 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
107 
108 UnicodeString U_EXPORT2
109 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
110     return
111         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
112             append(s1).
113                 append(s2);
114 }
115 
116 //========================================
117 // Reference Counting functions, put at top of file so that optimizing compilers
118 //                               have a chance to automatically inline.
119 //========================================
120 
121 void
addRef()122 UnicodeString::addRef() {
123   umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
124 }
125 
126 int32_t
removeRef()127 UnicodeString::removeRef() {
128   return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
129 }
130 
131 int32_t
refCount() const132 UnicodeString::refCount() const {
133   return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
134 }
135 
136 void
releaseArray()137 UnicodeString::releaseArray() {
138   if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
139     uprv_free((int32_t *)fUnion.fFields.fArray - 1);
140   }
141 }
142 
143 
144 
145 //========================================
146 // Constructors
147 //========================================
148 
149 // The default constructor is inline in unistr.h.
150 
UnicodeString(int32_t capacity,UChar32 c,int32_t count)151 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
152   fUnion.fFields.fLengthAndFlags = 0;
153   if(count <= 0 || (uint32_t)c > 0x10ffff) {
154     // just allocate and do not do anything else
155     allocate(capacity);
156   } else if(c <= 0xffff) {
157     int32_t length = count;
158     if(capacity < length) {
159       capacity = length;
160     }
161     if(allocate(capacity)) {
162       UChar *array = getArrayStart();
163       UChar unit = (UChar)c;
164       for(int32_t i = 0; i < length; ++i) {
165         array[i] = unit;
166       }
167       setLength(length);
168     }
169   } else {  // supplementary code point, write surrogate pairs
170     if(count > (INT32_MAX / 2)) {
171       // We would get more than 2G UChars.
172       allocate(capacity);
173       return;
174     }
175     int32_t length = count * 2;
176     if(capacity < length) {
177       capacity = length;
178     }
179     if(allocate(capacity)) {
180       UChar *array = getArrayStart();
181       UChar lead = U16_LEAD(c);
182       UChar trail = U16_TRAIL(c);
183       for(int32_t i = 0; i < length; i += 2) {
184         array[i] = lead;
185         array[i + 1] = trail;
186       }
187       setLength(length);
188     }
189   }
190 }
191 
UnicodeString(UChar ch)192 UnicodeString::UnicodeString(UChar ch) {
193   fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
194   fUnion.fStackFields.fBuffer[0] = ch;
195 }
196 
UnicodeString(UChar32 ch)197 UnicodeString::UnicodeString(UChar32 ch) {
198   fUnion.fFields.fLengthAndFlags = kShortString;
199   int32_t i = 0;
200   UBool isError = FALSE;
201   U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
202   // We test isError so that the compiler does not complain that we don't.
203   // If isError then i==0 which is what we want anyway.
204   if(!isError) {
205     setShortLength(i);
206   }
207 }
208 
UnicodeString(const UChar * text)209 UnicodeString::UnicodeString(const UChar *text) {
210   fUnion.fFields.fLengthAndFlags = kShortString;
211   doAppend(text, 0, -1);
212 }
213 
UnicodeString(const UChar * text,int32_t textLength)214 UnicodeString::UnicodeString(const UChar *text,
215                              int32_t textLength) {
216   fUnion.fFields.fLengthAndFlags = kShortString;
217   doAppend(text, 0, textLength);
218 }
219 
UnicodeString(UBool isTerminated,ConstChar16Ptr textPtr,int32_t textLength)220 UnicodeString::UnicodeString(UBool isTerminated,
221                              ConstChar16Ptr textPtr,
222                              int32_t textLength) {
223   fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
224   const UChar *text = textPtr;
225   if(text == NULL) {
226     // treat as an empty string, do not alias
227     setToEmpty();
228   } else if(textLength < -1 ||
229             (textLength == -1 && !isTerminated) ||
230             (textLength >= 0 && isTerminated && text[textLength] != 0)
231   ) {
232     setToBogus();
233   } else {
234     if(textLength == -1) {
235       // text is terminated, or else it would have failed the above test
236       textLength = u_strlen(text);
237     }
238     setArray(const_cast<UChar *>(text), textLength,
239              isTerminated ? textLength + 1 : textLength);
240   }
241 }
242 
UnicodeString(UChar * buff,int32_t buffLength,int32_t buffCapacity)243 UnicodeString::UnicodeString(UChar *buff,
244                              int32_t buffLength,
245                              int32_t buffCapacity) {
246   fUnion.fFields.fLengthAndFlags = kWritableAlias;
247   if(buff == NULL) {
248     // treat as an empty string, do not alias
249     setToEmpty();
250   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
251     setToBogus();
252   } else {
253     if(buffLength == -1) {
254       // fLength = u_strlen(buff); but do not look beyond buffCapacity
255       const UChar *p = buff, *limit = buff + buffCapacity;
256       while(p != limit && *p != 0) {
257         ++p;
258       }
259       buffLength = (int32_t)(p - buff);
260     }
261     setArray(buff, buffLength, buffCapacity);
262   }
263 }
264 
UnicodeString(const char * src,int32_t length,EInvariant)265 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
266   fUnion.fFields.fLengthAndFlags = kShortString;
267   if(src==NULL) {
268     // treat as an empty string
269   } else {
270     if(length<0) {
271       length=(int32_t)uprv_strlen(src);
272     }
273     if(cloneArrayIfNeeded(length, length, FALSE)) {
274       u_charsToUChars(src, getArrayStart(), length);
275       setLength(length);
276     } else {
277       setToBogus();
278     }
279   }
280 }
281 
282 #if U_CHARSET_IS_UTF8
283 
UnicodeString(const char * codepageData)284 UnicodeString::UnicodeString(const char *codepageData) {
285   fUnion.fFields.fLengthAndFlags = kShortString;
286   if(codepageData != 0) {
287     setToUTF8(codepageData);
288   }
289 }
290 
UnicodeString(const char * codepageData,int32_t dataLength)291 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
292   fUnion.fFields.fLengthAndFlags = kShortString;
293   // if there's nothing to convert, do nothing
294   if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
295     return;
296   }
297   if(dataLength == -1) {
298     dataLength = (int32_t)uprv_strlen(codepageData);
299   }
300   setToUTF8(StringPiece(codepageData, dataLength));
301 }
302 
303 // else see unistr_cnv.cpp
304 #endif
305 
// Copy constructor: starts as an empty short string, then delegates to copyFrom().
UnicodeString::UnicodeString(const UnicodeString& that) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  copyFrom(that);
}
310 
UnicodeString(UnicodeString && src)311 UnicodeString::UnicodeString(UnicodeString &&src) U_NOEXCEPT {
312   fUnion.fFields.fLengthAndFlags = kShortString;
313   moveFrom(src);
314 }
315 
// Constructs a string via setTo(that, srcStart),
// starting from an empty short string.
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  setTo(that, srcStart);
}
321 
// Constructs a string via setTo(that, srcStart, srcLength),
// starting from an empty short string.
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart,
                             int32_t srcLength) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  setTo(that, srcStart, srcLength);
}
328 
329 // Replaceable base class clone() default implementation, does not clone
330 Replaceable *
clone() const331 Replaceable::clone() const {
332   return NULL;
333 }
334 
335 // UnicodeString overrides clone() with a real implementation
336 Replaceable *
clone() const337 UnicodeString::clone() const {
338   return new UnicodeString(*this);
339 }
340 
341 //========================================
342 // array allocation
343 //========================================
344 
namespace {

// Fixed part of the extra capacity added when a string grows.
const int32_t kGrowSize = 128;

// The number of bytes for one int32_t reference counter and capacity UChars
// must fit into a 32-bit size_t (at least when on a 32-bit platform).
// We also add one for the NUL terminator, to avoid reallocation in getTerminatedBuffer(),
// and round up to a multiple of 16 bytes.
// This means that capacity must be at most (0xfffffff0 - 4) / 2 - 1 = 0x7ffffff5.
// (With more complicated checks we could go up to 0x7ffffffd without rounding up,
// but that does not seem worth it.)
const int32_t kMaxCapacity = 0x7ffffff5;

// Returns newLength plus a quarter of newLength plus kGrowSize,
// limited to kMaxCapacity.
int32_t getGrowCapacity(int32_t newLength) {
  const int32_t extra = (newLength >> 2) + kGrowSize;
  const int32_t headroom = kMaxCapacity - newLength;
  if(extra > headroom) {
    return kMaxCapacity;
  }
  return newLength + extra;
}

}  // namespace
368 
369 UBool
allocate(int32_t capacity)370 UnicodeString::allocate(int32_t capacity) {
371   if(capacity <= US_STACKBUF_SIZE) {
372     fUnion.fFields.fLengthAndFlags = kShortString;
373     return TRUE;
374   }
375   if(capacity <= kMaxCapacity) {
376     ++capacity;  // for the NUL
377     // Switch to size_t which is unsigned so that we can allocate up to 4GB.
378     // Reference counter + UChars.
379     size_t numBytes = sizeof(int32_t) + (size_t)capacity * U_SIZEOF_UCHAR;
380     // Round up to a multiple of 16.
381     numBytes = (numBytes + 15) & ~15;
382     int32_t *array = (int32_t *) uprv_malloc(numBytes);
383     if(array != NULL) {
384       // set initial refCount and point behind the refCount
385       *array++ = 1;
386       numBytes -= sizeof(int32_t);
387 
388       // have fArray point to the first UChar
389       fUnion.fFields.fArray = (UChar *)array;
390       fUnion.fFields.fCapacity = (int32_t)(numBytes / U_SIZEOF_UCHAR);
391       fUnion.fFields.fLengthAndFlags = kLongString;
392       return TRUE;
393     }
394   }
395   fUnion.fFields.fLengthAndFlags = kIsBogus;
396   fUnion.fFields.fArray = 0;
397   fUnion.fFields.fCapacity = 0;
398   return FALSE;
399 }
400 
401 //========================================
402 // Destructor
403 //========================================
404 
405 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
406 static u_atomic_int32_t finalLengthCounts[0x400];  // UnicodeString::kMaxShortLength+1
407 static u_atomic_int32_t beyondCount(0);
408 
unistr_printLengths()409 U_CAPI void unistr_printLengths() {
410   int32_t i;
411   for(i = 0; i <= 59; ++i) {
412     printf("%2d,  %9d\n", i, (int32_t)finalLengthCounts[i]);
413   }
414   int32_t beyond = beyondCount;
415   for(; i < UPRV_LENGTHOF(finalLengthCounts); ++i) {
416     beyond += finalLengthCounts[i];
417   }
418   printf(">59, %9d\n", beyond);
419 }
420 #endif
421 
~UnicodeString()422 UnicodeString::~UnicodeString()
423 {
424 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
425   // Count lengths of strings at the end of their lifetime.
426   // Useful for discussion of a desirable stack buffer size.
427   // Count the contents length, not the optional NUL terminator nor further capacity.
428   // Ignore open-buffer strings and strings which alias external storage.
429   if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) {
430     if(hasShortLength()) {
431       umtx_atomic_inc(finalLengthCounts + getShortLength());
432     } else {
433       umtx_atomic_inc(&beyondCount);
434     }
435   }
436 #endif
437 
438   releaseArray();
439 }
440 
441 //========================================
442 // Factory methods
443 //========================================
444 
fromUTF8(StringPiece utf8)445 UnicodeString UnicodeString::fromUTF8(StringPiece utf8) {
446   UnicodeString result;
447   result.setToUTF8(utf8);
448   return result;
449 }
450 
fromUTF32(const UChar32 * utf32,int32_t length)451 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
452   UnicodeString result;
453   int32_t capacity;
454   // Most UTF-32 strings will be BMP-only and result in a same-length
455   // UTF-16 string. We overestimate the capacity just slightly,
456   // just in case there are a few supplementary characters.
457   if(length <= US_STACKBUF_SIZE) {
458     capacity = US_STACKBUF_SIZE;
459   } else {
460     capacity = length + (length >> 4) + 4;
461   }
462   do {
463     UChar *utf16 = result.getBuffer(capacity);
464     int32_t length16;
465     UErrorCode errorCode = U_ZERO_ERROR;
466     u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
467         utf32, length,
468         0xfffd,  // Substitution character.
469         NULL,    // Don't care about number of substitutions.
470         &errorCode);
471     result.releaseBuffer(length16);
472     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
473       capacity = length16 + 1;  // +1 for the terminating NUL.
474       continue;
475     } else if(U_FAILURE(errorCode)) {
476       result.setToBogus();
477     }
478     break;
479   } while(TRUE);
480   return result;
481 }
482 
483 //========================================
484 // Assignment
485 //========================================
486 
487 UnicodeString &
operator =(const UnicodeString & src)488 UnicodeString::operator=(const UnicodeString &src) {
489   return copyFrom(src);
490 }
491 
492 UnicodeString &
fastCopyFrom(const UnicodeString & src)493 UnicodeString::fastCopyFrom(const UnicodeString &src) {
494   return copyFrom(src, TRUE);
495 }
496 
497 UnicodeString &
copyFrom(const UnicodeString & src,UBool fastCopy)498 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
499   // if assigning to ourselves, do nothing
500   if(this == &src) {
501     return *this;
502   }
503 
504   // is the right side bogus?
505   if(src.isBogus()) {
506     setToBogus();
507     return *this;
508   }
509 
510   // delete the current contents
511   releaseArray();
512 
513   if(src.isEmpty()) {
514     // empty string - use the stack buffer
515     setToEmpty();
516     return *this;
517   }
518 
519   // fLength>0 and not an "open" src.getBuffer(minCapacity)
520   fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
521   switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
522   case kShortString:
523     // short string using the stack buffer, do the same
524     uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
525                 getShortLength() * U_SIZEOF_UCHAR);
526     break;
527   case kLongString:
528     // src uses a refCounted string buffer, use that buffer with refCount
529     // src is const, use a cast - we don't actually change it
530     ((UnicodeString &)src).addRef();
531     // copy all fields, share the reference-counted buffer
532     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
533     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
534     if(!hasShortLength()) {
535       fUnion.fFields.fLength = src.fUnion.fFields.fLength;
536     }
537     break;
538   case kReadonlyAlias:
539     if(fastCopy) {
540       // src is a readonly alias, do the same
541       // -> maintain the readonly alias as such
542       fUnion.fFields.fArray = src.fUnion.fFields.fArray;
543       fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
544       if(!hasShortLength()) {
545         fUnion.fFields.fLength = src.fUnion.fFields.fLength;
546       }
547       break;
548     }
549     // else if(!fastCopy) fall through to case kWritableAlias
550     // -> allocate a new buffer and copy the contents
551     U_FALLTHROUGH;
552   case kWritableAlias: {
553     // src is a writable alias; we make a copy of that instead
554     int32_t srcLength = src.length();
555     if(allocate(srcLength)) {
556       u_memcpy(getArrayStart(), src.getArrayStart(), srcLength);
557       setLength(srcLength);
558       break;
559     }
560     // if there is not enough memory, then fall through to setting to bogus
561     U_FALLTHROUGH;
562   }
563   default:
564     // if src is bogus, set ourselves to bogus
565     // do not call setToBogus() here because fArray and flags are not consistent here
566     fUnion.fFields.fLengthAndFlags = kIsBogus;
567     fUnion.fFields.fArray = 0;
568     fUnion.fFields.fCapacity = 0;
569     break;
570   }
571 
572   return *this;
573 }
574 
moveFrom(UnicodeString & src)575 UnicodeString &UnicodeString::moveFrom(UnicodeString &src) U_NOEXCEPT {
576   // No explicit check for self move assignment, consistent with standard library.
577   // Self move assignment causes no crash nor leak but might make the object bogus.
578   releaseArray();
579   copyFieldsFrom(src, TRUE);
580   return *this;
581 }
582 
583 // Same as moveFrom() except without memory management.
copyFieldsFrom(UnicodeString & src,UBool setSrcToBogus)584 void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) U_NOEXCEPT {
585   int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
586   if(lengthAndFlags & kUsingStackBuffer) {
587     // Short string using the stack buffer, copy the contents.
588     // Check for self assignment to prevent "overlap in memcpy" warnings,
589     // although it should be harmless to copy a buffer to itself exactly.
590     if(this != &src) {
591       uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
592                   getShortLength() * U_SIZEOF_UCHAR);
593     }
594   } else {
595     // In all other cases, copy all fields.
596     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
597     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
598     if(!hasShortLength()) {
599       fUnion.fFields.fLength = src.fUnion.fFields.fLength;
600     }
601     if(setSrcToBogus) {
602       // Set src to bogus without releasing any memory.
603       src.fUnion.fFields.fLengthAndFlags = kIsBogus;
604       src.fUnion.fFields.fArray = NULL;
605       src.fUnion.fFields.fCapacity = 0;
606     }
607   }
608 }
609 
swap(UnicodeString & other)610 void UnicodeString::swap(UnicodeString &other) U_NOEXCEPT {
611   UnicodeString temp;  // Empty short string: Known not to need releaseArray().
612   // Copy fields without resetting source values in between.
613   temp.copyFieldsFrom(*this, FALSE);
614   this->copyFieldsFrom(other, FALSE);
615   other.copyFieldsFrom(temp, FALSE);
616   // Set temp to an empty string so that other's memory is not released twice.
617   temp.fUnion.fFields.fLengthAndFlags = kShortString;
618 }
619 
620 //========================================
621 // Miscellaneous operations
622 //========================================
623 
unescape() const624 UnicodeString UnicodeString::unescape() const {
625     UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
626     if (result.isBogus()) {
627         return result;
628     }
629     const UChar *array = getBuffer();
630     int32_t len = length();
631     int32_t prev = 0;
632     for (int32_t i=0;;) {
633         if (i == len) {
634             result.append(array, prev, len - prev);
635             break;
636         }
637         if (array[i++] == 0x5C /*'\\'*/) {
638             result.append(array, prev, (i - 1) - prev);
639             UChar32 c = unescapeAt(i); // advances i
640             if (c < 0) {
641                 result.remove(); // return empty string
642                 break; // invalid escape sequence
643             }
644             result.append(c);
645             prev = i;
646         }
647     }
648     return result;
649 }
650 
unescapeAt(int32_t & offset) const651 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
652     return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
653 }
654 
655 //========================================
656 // Read-only implementation
657 //========================================
658 UBool
doEquals(const UnicodeString & text,int32_t len) const659 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
660   // Requires: this & text not bogus and have same lengths.
661   // Byte-wise comparison works for equality regardless of endianness.
662   return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
663 }
664 
665 int8_t
doCompare(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength) const666 UnicodeString::doCompare( int32_t start,
667               int32_t length,
668               const UChar *srcChars,
669               int32_t srcStart,
670               int32_t srcLength) const
671 {
672   // compare illegal string values
673   if(isBogus()) {
674     return -1;
675   }
676 
677   // pin indices to legal values
678   pinIndices(start, length);
679 
680   if(srcChars == NULL) {
681     // treat const UChar *srcChars==NULL as an empty string
682     return length == 0 ? 0 : 1;
683   }
684 
685   // get the correct pointer
686   const UChar *chars = getArrayStart();
687 
688   chars += start;
689   srcChars += srcStart;
690 
691   int32_t minLength;
692   int8_t lengthResult;
693 
694   // get the srcLength if necessary
695   if(srcLength < 0) {
696     srcLength = u_strlen(srcChars + srcStart);
697   }
698 
699   // are we comparing different lengths?
700   if(length != srcLength) {
701     if(length < srcLength) {
702       minLength = length;
703       lengthResult = -1;
704     } else {
705       minLength = srcLength;
706       lengthResult = 1;
707     }
708   } else {
709     minLength = length;
710     lengthResult = 0;
711   }
712 
713   /*
714    * note that uprv_memcmp() returns an int but we return an int8_t;
715    * we need to take care not to truncate the result -
716    * one way to do this is to right-shift the value to
717    * move the sign bit into the lower 8 bits and making sure that this
718    * does not become 0 itself
719    */
720 
721   if(minLength > 0 && chars != srcChars) {
722     int32_t result;
723 
724 #   if U_IS_BIG_ENDIAN
725       // big-endian: byte comparison works
726       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
727       if(result != 0) {
728         return (int8_t)(result >> 15 | 1);
729       }
730 #   else
731       // little-endian: compare UChar units
732       do {
733         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
734         if(result != 0) {
735           return (int8_t)(result >> 15 | 1);
736         }
737       } while(--minLength > 0);
738 #   endif
739   }
740   return lengthResult;
741 }
742 
743 /* String compare in code point order - doCompare() compares in code unit order. */
744 int8_t
doCompareCodePointOrder(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength) const745 UnicodeString::doCompareCodePointOrder(int32_t start,
746                                        int32_t length,
747                                        const UChar *srcChars,
748                                        int32_t srcStart,
749                                        int32_t srcLength) const
750 {
751   // compare illegal string values
752   // treat const UChar *srcChars==NULL as an empty string
753   if(isBogus()) {
754     return -1;
755   }
756 
757   // pin indices to legal values
758   pinIndices(start, length);
759 
760   if(srcChars == NULL) {
761     srcStart = srcLength = 0;
762   }
763 
764   int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
765   /* translate the 32-bit result into an 8-bit one */
766   if(diff!=0) {
767     return (int8_t)(diff >> 15 | 1);
768   } else {
769     return 0;
770   }
771 }
772 
773 int32_t
getLength() const774 UnicodeString::getLength() const {
775     return length();
776 }
777 
778 UChar
getCharAt(int32_t offset) const779 UnicodeString::getCharAt(int32_t offset) const {
780   return charAt(offset);
781 }
782 
783 UChar32
getChar32At(int32_t offset) const784 UnicodeString::getChar32At(int32_t offset) const {
785   return char32At(offset);
786 }
787 
788 UChar32
char32At(int32_t offset) const789 UnicodeString::char32At(int32_t offset) const
790 {
791   int32_t len = length();
792   if((uint32_t)offset < (uint32_t)len) {
793     const UChar *array = getArrayStart();
794     UChar32 c;
795     U16_GET(array, 0, offset, len, c);
796     return c;
797   } else {
798     return kInvalidUChar;
799   }
800 }
801 
802 int32_t
getChar32Start(int32_t offset) const803 UnicodeString::getChar32Start(int32_t offset) const {
804   if((uint32_t)offset < (uint32_t)length()) {
805     const UChar *array = getArrayStart();
806     U16_SET_CP_START(array, 0, offset);
807     return offset;
808   } else {
809     return 0;
810   }
811 }
812 
813 int32_t
getChar32Limit(int32_t offset) const814 UnicodeString::getChar32Limit(int32_t offset) const {
815   int32_t len = length();
816   if((uint32_t)offset < (uint32_t)len) {
817     const UChar *array = getArrayStart();
818     U16_SET_CP_LIMIT(array, 0, offset, len);
819     return offset;
820   } else {
821     return len;
822   }
823 }
824 
825 int32_t
countChar32(int32_t start,int32_t length) const826 UnicodeString::countChar32(int32_t start, int32_t length) const {
827   pinIndices(start, length);
828   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
829   return u_countChar32(getArrayStart()+start, length);
830 }
831 
832 UBool
hasMoreChar32Than(int32_t start,int32_t length,int32_t number) const833 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
834   pinIndices(start, length);
835   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
836   return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
837 }
838 
839 int32_t
moveIndex32(int32_t index,int32_t delta) const840 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
841   // pin index
842   int32_t len = length();
843   if(index<0) {
844     index=0;
845   } else if(index>len) {
846     index=len;
847   }
848 
849   const UChar *array = getArrayStart();
850   if(delta>0) {
851     U16_FWD_N(array, index, len, delta);
852   } else {
853     U16_BACK_N(array, 0, index, -delta);
854   }
855 
856   return index;
857 }
858 
859 void
doExtract(int32_t start,int32_t length,UChar * dst,int32_t dstStart) const860 UnicodeString::doExtract(int32_t start,
861              int32_t length,
862              UChar *dst,
863              int32_t dstStart) const
864 {
865   // pin indices to legal values
866   pinIndices(start, length);
867 
868   // do not copy anything if we alias dst itself
869   const UChar *array = getArrayStart();
870   if(array + start != dst + dstStart) {
871     us_arrayCopy(array, start, dst, dstStart, length);
872   }
873 }
874 
875 int32_t
extract(Char16Ptr dest,int32_t destCapacity,UErrorCode & errorCode) const876 UnicodeString::extract(Char16Ptr dest, int32_t destCapacity,
877                        UErrorCode &errorCode) const {
878   int32_t len = length();
879   if(U_SUCCESS(errorCode)) {
880     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
881       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
882     } else {
883       const UChar *array = getArrayStart();
884       if(len>0 && len<=destCapacity && array!=dest) {
885         u_memcpy(dest, array, len);
886       }
887       return u_terminateUChars(dest, destCapacity, len, &errorCode);
888     }
889   }
890 
891   return len;
892 }
893 
894 int32_t
extract(int32_t start,int32_t length,char * target,int32_t targetCapacity,enum EInvariant) const895 UnicodeString::extract(int32_t start,
896                        int32_t length,
897                        char *target,
898                        int32_t targetCapacity,
899                        enum EInvariant) const
900 {
901   // if the arguments are illegal, then do nothing
902   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
903     return 0;
904   }
905 
906   // pin the indices to legal values
907   pinIndices(start, length);
908 
909   if(length <= targetCapacity) {
910     u_UCharsToChars(getArrayStart() + start, target, length);
911   }
912   UErrorCode status = U_ZERO_ERROR;
913   return u_terminateChars(target, targetCapacity, length, &status);
914 }
915 
916 UnicodeString
tempSubString(int32_t start,int32_t len) const917 UnicodeString::tempSubString(int32_t start, int32_t len) const {
918   pinIndices(start, len);
919   const UChar *array = getBuffer();  // not getArrayStart() to check kIsBogus & kOpenGetBuffer
920   if(array==NULL) {
921     array=fUnion.fStackFields.fBuffer;  // anything not NULL because that would make an empty string
922     len=-2;  // bogus result string
923   }
924   return UnicodeString(FALSE, array + start, len);
925 }
926 
927 int32_t
toUTF8(int32_t start,int32_t len,char * target,int32_t capacity) const928 UnicodeString::toUTF8(int32_t start, int32_t len,
929                       char *target, int32_t capacity) const {
930   pinIndices(start, len);
931   int32_t length8;
932   UErrorCode errorCode = U_ZERO_ERROR;
933   u_strToUTF8WithSub(target, capacity, &length8,
934                      getBuffer() + start, len,
935                      0xFFFD,  // Standard substitution character.
936                      NULL,    // Don't care about number of substitutions.
937                      &errorCode);
938   return length8;
939 }
940 
941 #if U_CHARSET_IS_UTF8
942 
943 int32_t
extract(int32_t start,int32_t len,char * target,uint32_t dstSize) const944 UnicodeString::extract(int32_t start, int32_t len,
945                        char *target, uint32_t dstSize) const {
946   // if the arguments are illegal, then do nothing
947   if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
948     return 0;
949   }
950   return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
951 }
952 
953 // else see unistr_cnv.cpp
954 #endif
955 
956 void
extractBetween(int32_t start,int32_t limit,UnicodeString & target) const957 UnicodeString::extractBetween(int32_t start,
958                   int32_t limit,
959                   UnicodeString& target) const {
960   pinIndex(start);
961   pinIndex(limit);
962   doExtract(start, limit - start, target);
963 }
964 
965 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
966 // as many bytes as the source has UChars.
967 // The "worst cases" are writing systems like Indic, Thai and CJK with
968 // 3:1 bytes:UChars.
969 void
toUTF8(ByteSink & sink) const970 UnicodeString::toUTF8(ByteSink &sink) const {
971   int32_t length16 = length();
972   if(length16 != 0) {
973     char stackBuffer[1024];
974     int32_t capacity = (int32_t)sizeof(stackBuffer);
975     UBool utf8IsOwned = FALSE;
976     char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
977                                       3*length16,
978                                       stackBuffer, capacity,
979                                       &capacity);
980     int32_t length8 = 0;
981     UErrorCode errorCode = U_ZERO_ERROR;
982     u_strToUTF8WithSub(utf8, capacity, &length8,
983                        getBuffer(), length16,
984                        0xFFFD,  // Standard substitution character.
985                        NULL,    // Don't care about number of substitutions.
986                        &errorCode);
987     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
988       utf8 = (char *)uprv_malloc(length8);
989       if(utf8 != NULL) {
990         utf8IsOwned = TRUE;
991         errorCode = U_ZERO_ERROR;
992         u_strToUTF8WithSub(utf8, length8, &length8,
993                            getBuffer(), length16,
994                            0xFFFD,  // Standard substitution character.
995                            NULL,    // Don't care about number of substitutions.
996                            &errorCode);
997       } else {
998         errorCode = U_MEMORY_ALLOCATION_ERROR;
999       }
1000     }
1001     if(U_SUCCESS(errorCode)) {
1002       sink.Append(utf8, length8);
1003       sink.Flush();
1004     }
1005     if(utf8IsOwned) {
1006       uprv_free(utf8);
1007     }
1008   }
1009 }
1010 
1011 int32_t
toUTF32(UChar32 * utf32,int32_t capacity,UErrorCode & errorCode) const1012 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
1013   int32_t length32=0;
1014   if(U_SUCCESS(errorCode)) {
1015     // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
1016     u_strToUTF32WithSub(utf32, capacity, &length32,
1017         getBuffer(), length(),
1018         0xfffd,  // Substitution character.
1019         NULL,    // Don't care about number of substitutions.
1020         &errorCode);
1021   }
1022   return length32;
1023 }
1024 
1025 int32_t
indexOf(const UChar * srcChars,int32_t srcStart,int32_t srcLength,int32_t start,int32_t length) const1026 UnicodeString::indexOf(const UChar *srcChars,
1027                int32_t srcStart,
1028                int32_t srcLength,
1029                int32_t start,
1030                int32_t length) const
1031 {
1032   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1033     return -1;
1034   }
1035 
1036   // UnicodeString does not find empty substrings
1037   if(srcLength < 0 && srcChars[srcStart] == 0) {
1038     return -1;
1039   }
1040 
1041   // get the indices within bounds
1042   pinIndices(start, length);
1043 
1044   // find the first occurrence of the substring
1045   const UChar *array = getArrayStart();
1046   const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
1047   if(match == NULL) {
1048     return -1;
1049   } else {
1050     return (int32_t)(match - array);
1051   }
1052 }
1053 
1054 int32_t
doIndexOf(UChar c,int32_t start,int32_t length) const1055 UnicodeString::doIndexOf(UChar c,
1056              int32_t start,
1057              int32_t length) const
1058 {
1059   // pin indices
1060   pinIndices(start, length);
1061 
1062   // find the first occurrence of c
1063   const UChar *array = getArrayStart();
1064   const UChar *match = u_memchr(array + start, c, length);
1065   if(match == NULL) {
1066     return -1;
1067   } else {
1068     return (int32_t)(match - array);
1069   }
1070 }
1071 
1072 int32_t
doIndexOf(UChar32 c,int32_t start,int32_t length) const1073 UnicodeString::doIndexOf(UChar32 c,
1074                          int32_t start,
1075                          int32_t length) const {
1076   // pin indices
1077   pinIndices(start, length);
1078 
1079   // find the first occurrence of c
1080   const UChar *array = getArrayStart();
1081   const UChar *match = u_memchr32(array + start, c, length);
1082   if(match == NULL) {
1083     return -1;
1084   } else {
1085     return (int32_t)(match - array);
1086   }
1087 }
1088 
1089 int32_t
lastIndexOf(const UChar * srcChars,int32_t srcStart,int32_t srcLength,int32_t start,int32_t length) const1090 UnicodeString::lastIndexOf(const UChar *srcChars,
1091                int32_t srcStart,
1092                int32_t srcLength,
1093                int32_t start,
1094                int32_t length) const
1095 {
1096   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1097     return -1;
1098   }
1099 
1100   // UnicodeString does not find empty substrings
1101   if(srcLength < 0 && srcChars[srcStart] == 0) {
1102     return -1;
1103   }
1104 
1105   // get the indices within bounds
1106   pinIndices(start, length);
1107 
1108   // find the last occurrence of the substring
1109   const UChar *array = getArrayStart();
1110   const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
1111   if(match == NULL) {
1112     return -1;
1113   } else {
1114     return (int32_t)(match - array);
1115   }
1116 }
1117 
1118 int32_t
doLastIndexOf(UChar c,int32_t start,int32_t length) const1119 UnicodeString::doLastIndexOf(UChar c,
1120                  int32_t start,
1121                  int32_t length) const
1122 {
1123   if(isBogus()) {
1124     return -1;
1125   }
1126 
1127   // pin indices
1128   pinIndices(start, length);
1129 
1130   // find the last occurrence of c
1131   const UChar *array = getArrayStart();
1132   const UChar *match = u_memrchr(array + start, c, length);
1133   if(match == NULL) {
1134     return -1;
1135   } else {
1136     return (int32_t)(match - array);
1137   }
1138 }
1139 
1140 int32_t
doLastIndexOf(UChar32 c,int32_t start,int32_t length) const1141 UnicodeString::doLastIndexOf(UChar32 c,
1142                              int32_t start,
1143                              int32_t length) const {
1144   // pin indices
1145   pinIndices(start, length);
1146 
1147   // find the last occurrence of c
1148   const UChar *array = getArrayStart();
1149   const UChar *match = u_memrchr32(array + start, c, length);
1150   if(match == NULL) {
1151     return -1;
1152   } else {
1153     return (int32_t)(match - array);
1154   }
1155 }
1156 
1157 //========================================
1158 // Write implementation
1159 //========================================
1160 
1161 UnicodeString&
findAndReplace(int32_t start,int32_t length,const UnicodeString & oldText,int32_t oldStart,int32_t oldLength,const UnicodeString & newText,int32_t newStart,int32_t newLength)1162 UnicodeString::findAndReplace(int32_t start,
1163                   int32_t length,
1164                   const UnicodeString& oldText,
1165                   int32_t oldStart,
1166                   int32_t oldLength,
1167                   const UnicodeString& newText,
1168                   int32_t newStart,
1169                   int32_t newLength)
1170 {
1171   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1172     return *this;
1173   }
1174 
1175   pinIndices(start, length);
1176   oldText.pinIndices(oldStart, oldLength);
1177   newText.pinIndices(newStart, newLength);
1178 
1179   if(oldLength == 0) {
1180     return *this;
1181   }
1182 
1183   while(length > 0 && length >= oldLength) {
1184     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1185     if(pos < 0) {
1186       // no more oldText's here: done
1187       break;
1188     } else {
1189       // we found oldText, replace it by newText and go beyond it
1190       replace(pos, oldLength, newText, newStart, newLength);
1191       length -= pos + oldLength - start;
1192       start = pos + newLength;
1193     }
1194   }
1195 
1196   return *this;
1197 }
1198 
1199 
1200 void
setToBogus()1201 UnicodeString::setToBogus()
1202 {
1203   releaseArray();
1204 
1205   fUnion.fFields.fLengthAndFlags = kIsBogus;
1206   fUnion.fFields.fArray = 0;
1207   fUnion.fFields.fCapacity = 0;
1208 }
1209 
1210 // turn a bogus string into an empty one
1211 void
unBogus()1212 UnicodeString::unBogus() {
1213   if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
1214     setToEmpty();
1215   }
1216 }
1217 
1218 const char16_t *
getTerminatedBuffer()1219 UnicodeString::getTerminatedBuffer() {
1220   if(!isWritable()) {
1221     return nullptr;
1222   }
1223   UChar *array = getArrayStart();
1224   int32_t len = length();
1225   if(len < getCapacity()) {
1226     if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
1227       // If len<capacity on a read-only alias, then array[len] is
1228       // either the original NUL (if constructed with (TRUE, s, length))
1229       // or one of the original string contents characters (if later truncated),
1230       // therefore we can assume that array[len] is initialized memory.
1231       if(array[len] == 0) {
1232         return array;
1233       }
1234     } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
1235       // kRefCounted: Do not write the NUL if the buffer is shared.
1236       // That is mostly safe, except when the length of one copy was modified
1237       // without copy-on-write, e.g., via truncate(newLength) or remove(void).
1238       // Then the NUL would be written into the middle of another copy's string.
1239 
1240       // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
1241       // Do not test if there is a NUL already because it might be uninitialized memory.
1242       // (That would be safe, but tools like valgrind & Purify would complain.)
1243       array[len] = 0;
1244       return array;
1245     }
1246   }
1247   if(len<INT32_MAX && cloneArrayIfNeeded(len+1)) {
1248     array = getArrayStart();
1249     array[len] = 0;
1250     return array;
1251   } else {
1252     return nullptr;
1253   }
1254 }
1255 
1256 // setTo() analogous to the readonly-aliasing constructor with the same signature
1257 UnicodeString &
setTo(UBool isTerminated,ConstChar16Ptr textPtr,int32_t textLength)1258 UnicodeString::setTo(UBool isTerminated,
1259                      ConstChar16Ptr textPtr,
1260                      int32_t textLength)
1261 {
1262   if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1263     // do not modify a string that has an "open" getBuffer(minCapacity)
1264     return *this;
1265   }
1266 
1267   const UChar *text = textPtr;
1268   if(text == NULL) {
1269     // treat as an empty string, do not alias
1270     releaseArray();
1271     setToEmpty();
1272     return *this;
1273   }
1274 
1275   if( textLength < -1 ||
1276       (textLength == -1 && !isTerminated) ||
1277       (textLength >= 0 && isTerminated && text[textLength] != 0)
1278   ) {
1279     setToBogus();
1280     return *this;
1281   }
1282 
1283   releaseArray();
1284 
1285   if(textLength == -1) {
1286     // text is terminated, or else it would have failed the above test
1287     textLength = u_strlen(text);
1288   }
1289   fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
1290   setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1291   return *this;
1292 }
1293 
1294 // setTo() analogous to the writable-aliasing constructor with the same signature
1295 UnicodeString &
setTo(UChar * buffer,int32_t buffLength,int32_t buffCapacity)1296 UnicodeString::setTo(UChar *buffer,
1297                      int32_t buffLength,
1298                      int32_t buffCapacity) {
1299   if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1300     // do not modify a string that has an "open" getBuffer(minCapacity)
1301     return *this;
1302   }
1303 
1304   if(buffer == NULL) {
1305     // treat as an empty string, do not alias
1306     releaseArray();
1307     setToEmpty();
1308     return *this;
1309   }
1310 
1311   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1312     setToBogus();
1313     return *this;
1314   } else if(buffLength == -1) {
1315     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1316     const UChar *p = buffer, *limit = buffer + buffCapacity;
1317     while(p != limit && *p != 0) {
1318       ++p;
1319     }
1320     buffLength = (int32_t)(p - buffer);
1321   }
1322 
1323   releaseArray();
1324 
1325   fUnion.fFields.fLengthAndFlags = kWritableAlias;
1326   setArray(buffer, buffLength, buffCapacity);
1327   return *this;
1328 }
1329 
setToUTF8(StringPiece utf8)1330 UnicodeString &UnicodeString::setToUTF8(StringPiece utf8) {
1331   unBogus();
1332   int32_t length = utf8.length();
1333   int32_t capacity;
1334   // The UTF-16 string will be at most as long as the UTF-8 string.
1335   if(length <= US_STACKBUF_SIZE) {
1336     capacity = US_STACKBUF_SIZE;
1337   } else {
1338     capacity = length + 1;  // +1 for the terminating NUL.
1339   }
1340   UChar *utf16 = getBuffer(capacity);
1341   int32_t length16;
1342   UErrorCode errorCode = U_ZERO_ERROR;
1343   u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1344       utf8.data(), length,
1345       0xfffd,  // Substitution character.
1346       NULL,    // Don't care about number of substitutions.
1347       &errorCode);
1348   releaseBuffer(length16);
1349   if(U_FAILURE(errorCode)) {
1350     setToBogus();
1351   }
1352   return *this;
1353 }
1354 
1355 UnicodeString&
setCharAt(int32_t offset,UChar c)1356 UnicodeString::setCharAt(int32_t offset,
1357              UChar c)
1358 {
1359   int32_t len = length();
1360   if(cloneArrayIfNeeded() && len > 0) {
1361     if(offset < 0) {
1362       offset = 0;
1363     } else if(offset >= len) {
1364       offset = len - 1;
1365     }
1366 
1367     getArrayStart()[offset] = c;
1368   }
1369   return *this;
1370 }
1371 
1372 UnicodeString&
replace(int32_t start,int32_t _length,UChar32 srcChar)1373 UnicodeString::replace(int32_t start,
1374                int32_t _length,
1375                UChar32 srcChar) {
1376   UChar buffer[U16_MAX_LENGTH];
1377   int32_t count = 0;
1378   UBool isError = FALSE;
1379   U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1380   // We test isError so that the compiler does not complain that we don't.
1381   // If isError (srcChar is not a valid code point) then count==0 which means
1382   // we remove the source segment rather than replacing it with srcChar.
1383   return doReplace(start, _length, buffer, 0, isError ? 0 : count);
1384 }
1385 
1386 UnicodeString&
append(UChar32 srcChar)1387 UnicodeString::append(UChar32 srcChar) {
1388   UChar buffer[U16_MAX_LENGTH];
1389   int32_t _length = 0;
1390   UBool isError = FALSE;
1391   U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1392   // We test isError so that the compiler does not complain that we don't.
1393   // If isError then _length==0 which turns the doAppend() into a no-op anyway.
1394   return isError ? *this : doAppend(buffer, 0, _length);
1395 }
1396 
1397 UnicodeString&
doReplace(int32_t start,int32_t length,const UnicodeString & src,int32_t srcStart,int32_t srcLength)1398 UnicodeString::doReplace( int32_t start,
1399               int32_t length,
1400               const UnicodeString& src,
1401               int32_t srcStart,
1402               int32_t srcLength)
1403 {
1404   // pin the indices to legal values
1405   src.pinIndices(srcStart, srcLength);
1406 
1407   // get the characters from src
1408   // and replace the range in ourselves with them
1409   return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1410 }
1411 
// Core replace: substitutes [start, start+length) of this string with
// srcLength code units taken from srcChars + srcStart.
//  - Does nothing if this string is not writable.
//  - A NULL srcChars is treated as an empty source; a negative srcLength
//    means "NUL-terminated" and is measured with u_strlen().
//  - start == length() is forwarded to doAppend().
//  - The string becomes bogus if the resulting length would overflow int32_t
//    or if a temporary copy of an overlapping source cannot be made.
// Returns *this.
UnicodeString&
UnicodeString::doReplace(int32_t start,
             int32_t length,
             const UChar *srcChars,
             int32_t srcStart,
             int32_t srcLength)
{
  if(!isWritable()) {
    return *this;
  }

  int32_t oldLength = this->length();

  // optimize (read-only alias).remove(0, start) and .remove(start, end)
  // These cases only adjust the alias bookkeeping; no characters are copied.
  if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
    if(start == 0) {
      // remove prefix by adjusting the array pointer
      pinIndex(length);
      fUnion.fFields.fArray += length;
      fUnion.fFields.fCapacity -= length;
      setLength(oldLength - length);
      return *this;
    } else {
      pinIndex(start);
      if(length >= (oldLength - start)) {
        // remove suffix by reducing the length (like truncate())
        setLength(start);
        fUnion.fFields.fCapacity = start;  // not NUL-terminated any more
        return *this;
      }
      // otherwise: removal from the middle, handled by the general path below
    }
  }

  // Replacing at the very end is an append; doAppend() is optimized for that.
  if(start == oldLength) {
    return doAppend(srcChars, srcStart, srcLength);
  }

  if(srcChars == 0) {
    srcLength = 0;
  } else {
    // Perform all remaining operations relative to srcChars + srcStart.
    // From this point forward, do not use srcStart.
    srcChars += srcStart;
    if (srcLength < 0) {
      // get the srcLength if necessary
      srcLength = u_strlen(srcChars);
    }
  }

  // pin the indices to legal values
  pinIndices(start, length);

  // Calculate the size of the string after the replace.
  // Avoid int32_t overflow.
  int32_t newLength = oldLength - length;
  if(srcLength > (INT32_MAX - newLength)) {
    setToBogus();
    return *this;
  }
  newLength += srcLength;

  // Check for insertion into ourself
  // (the source range overlaps our own writable contents, which the moves
  // below would overwrite before they are read)
  const UChar *oldArray = getArrayStart();
  if (isBufferWritable() &&
      oldArray < srcChars + srcLength &&
      srcChars < oldArray + oldLength) {
    // Copy into a new UnicodeString and start over
    UnicodeString copy(srcChars, srcLength);
    if (copy.isBogus()) {
      setToBogus();
      return *this;
    }
    return doReplace(start, length, copy.getArrayStart(), 0, srcLength);
  }

  // cloneArrayIfNeeded(doCopyArray=FALSE) may change fArray but will not copy the current contents;
  // therefore we need to keep the current fArray
  UChar oldStackBuffer[US_STACKBUF_SIZE];
  if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
    // copy the stack buffer contents because it will be overwritten with
    // fUnion.fFields values
    u_memcpy(oldStackBuffer, oldArray, oldLength);
    oldArray = oldStackBuffer;
  }

  // clone our array and allocate a bigger array if needed
  // (bufferToDelete receives the old heap buffer, if any, so that it stays
  // readable until the copies below are done)
  int32_t *bufferToDelete = 0;
  if(!cloneArrayIfNeeded(newLength, getGrowCapacity(newLength),
                         FALSE, &bufferToDelete)
  ) {
    return *this;
  }

  // now do the replace

  UChar *newArray = getArrayStart();
  if(newArray != oldArray) {
    // if fArray changed, then we need to copy everything except what will change
    // (the prefix before start, then the tail after the replaced range)
    us_arrayCopy(oldArray, 0, newArray, 0, start);
    us_arrayCopy(oldArray, start + length,
                 newArray, start + srcLength,
                 oldLength - (start + length));
  } else if(length != srcLength) {
    // fArray did not change; copy only the portion that isn't changing, leaving a hole
    us_arrayCopy(oldArray, start + length,
                 newArray, start + srcLength,
                 oldLength - (start + length));
  }

  // now fill in the hole with the new string
  us_arrayCopy(srcChars, 0, newArray, start, srcLength);

  setLength(newLength);

  // delayed delete in case srcChars == fArray when we started, and
  // to keep oldArray alive for the above operations
  if (bufferToDelete) {
    uprv_free(bufferToDelete);
  }

  return *this;
}
1534 
1535 // Versions of doReplace() only for append() variants.
1536 // doReplace() and doAppend() optimize for different cases.
1537 
1538 UnicodeString&
doAppend(const UnicodeString & src,int32_t srcStart,int32_t srcLength)1539 UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) {
1540   if(srcLength == 0) {
1541     return *this;
1542   }
1543 
1544   // pin the indices to legal values
1545   src.pinIndices(srcStart, srcLength);
1546   return doAppend(src.getArrayStart(), srcStart, srcLength);
1547 }
1548 
1549 UnicodeString&
doAppend(const UChar * srcChars,int32_t srcStart,int32_t srcLength)1550 UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLength) {
1551   if(!isWritable() || srcLength == 0 || srcChars == NULL) {
1552     return *this;
1553   }
1554 
1555   // Perform all remaining operations relative to srcChars + srcStart.
1556   // From this point forward, do not use srcStart.
1557   srcChars += srcStart;
1558 
1559   if(srcLength < 0) {
1560     // get the srcLength if necessary
1561     if((srcLength = u_strlen(srcChars)) == 0) {
1562       return *this;
1563     }
1564   }
1565 
1566   int32_t oldLength = length();
1567   int32_t newLength = oldLength + srcLength;
1568 
1569   // Check for append onto ourself
1570   const UChar* oldArray = getArrayStart();
1571   if (isBufferWritable() &&
1572       oldArray < srcChars + srcLength &&
1573       srcChars < oldArray + oldLength) {
1574     // Copy into a new UnicodeString and start over
1575     UnicodeString copy(srcChars, srcLength);
1576     if (copy.isBogus()) {
1577       setToBogus();
1578       return *this;
1579     }
1580     return doAppend(copy.getArrayStart(), 0, srcLength);
1581   }
1582 
1583   // optimize append() onto a large-enough, owned string
1584   if((newLength <= getCapacity() && isBufferWritable()) ||
1585       cloneArrayIfNeeded(newLength, getGrowCapacity(newLength))) {
1586     UChar *newArray = getArrayStart();
1587     // Do not copy characters when
1588     //   UChar *buffer=str.getAppendBuffer(...);
1589     // is followed by
1590     //   str.append(buffer, length);
1591     // or
1592     //   str.appendString(buffer, length)
1593     // or similar.
1594     if(srcChars != newArray + oldLength) {
1595       us_arrayCopy(srcChars, 0, newArray, oldLength, srcLength);
1596     }
1597     setLength(newLength);
1598   }
1599   return *this;
1600 }
1601 
/**
 * Replaceable API
 *
 * Forwards directly to replaceBetween(start, limit, text).
 */
void
UnicodeString::handleReplaceBetween(int32_t start,
                                    int32_t limit,
                                    const UnicodeString& text) {
    replaceBetween(start, limit, text);
}
1611 
1612 /**
1613  * Replaceable API
1614  */
1615 void
copy(int32_t start,int32_t limit,int32_t dest)1616 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1617     if (limit <= start) {
1618         return; // Nothing to do; avoid bogus malloc call
1619     }
1620     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1621     // Check to make sure text is not null.
1622     if (text != NULL) {
1623 	    extractBetween(start, limit, text, 0);
1624 	    insert(dest, text, 0, limit - start);
1625 	    uprv_free(text);
1626     }
1627 }
1628 
/**
 * Replaceable API
 *
 * NOTE: This is for the Replaceable class.  There is no rep.cpp,
 * so we implement this function here.
 *
 * Base-class implementation: always returns TRUE.
 */
UBool Replaceable::hasMetaData() const {
    return TRUE;
}
1638 
/**
 * Replaceable API
 *
 * UnicodeString implementation: always returns FALSE.
 */
UBool UnicodeString::hasMetaData() const {
    return FALSE;
}
1645 
1646 UnicodeString&
doReverse(int32_t start,int32_t length)1647 UnicodeString::doReverse(int32_t start, int32_t length) {
1648   if(length <= 1 || !cloneArrayIfNeeded()) {
1649     return *this;
1650   }
1651 
1652   // pin the indices to legal values
1653   pinIndices(start, length);
1654   if(length <= 1) {  // pinIndices() might have shrunk the length
1655     return *this;
1656   }
1657 
1658   UChar *left = getArrayStart() + start;
1659   UChar *right = left + length - 1;  // -1 for inclusive boundary (length>=2)
1660   UChar swap;
1661   UBool hasSupplementary = FALSE;
1662 
1663   // Before the loop we know left<right because length>=2.
1664   do {
1665     hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
1666     hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
1667     *right-- = swap;
1668   } while(left < right);
1669   // Make sure to test the middle code unit of an odd-length string.
1670   // Redundant if the length is even.
1671   hasSupplementary |= (UBool)U16_IS_LEAD(*left);
1672 
1673   /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1674   if(hasSupplementary) {
1675     UChar swap2;
1676 
1677     left = getArrayStart() + start;
1678     right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
1679     while(left < right) {
1680       if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
1681         *left++ = swap2;
1682         *left++ = swap;
1683       } else {
1684         ++left;
1685       }
1686     }
1687   }
1688 
1689   return *this;
1690 }
1691 
1692 UBool
padLeading(int32_t targetLength,UChar padChar)1693 UnicodeString::padLeading(int32_t targetLength,
1694                           UChar padChar)
1695 {
1696   int32_t oldLength = length();
1697   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1698     return FALSE;
1699   } else {
1700     // move contents up by padding width
1701     UChar *array = getArrayStart();
1702     int32_t start = targetLength - oldLength;
1703     us_arrayCopy(array, 0, array, start, oldLength);
1704 
1705     // fill in padding character
1706     while(--start >= 0) {
1707       array[start] = padChar;
1708     }
1709     setLength(targetLength);
1710     return TRUE;
1711   }
1712 }
1713 
1714 UBool
padTrailing(int32_t targetLength,UChar padChar)1715 UnicodeString::padTrailing(int32_t targetLength,
1716                            UChar padChar)
1717 {
1718   int32_t oldLength = length();
1719   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1720     return FALSE;
1721   } else {
1722     // fill in padding character
1723     UChar *array = getArrayStart();
1724     int32_t length = targetLength;
1725     while(--length >= oldLength) {
1726       array[length] = padChar;
1727     }
1728     setLength(targetLength);
1729     return TRUE;
1730   }
1731 }
1732 
1733 //========================================
1734 // Hashing
1735 //========================================
1736 int32_t
doHashCode() const1737 UnicodeString::doHashCode() const
1738 {
1739     /* Delegate hash computation to uhash.  This makes UnicodeString
1740      * hashing consistent with UChar* hashing.  */
1741     int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1742     if (hashCode == kInvalidHashCode) {
1743         hashCode = kEmptyHashCode;
1744     }
1745     return hashCode;
1746 }
1747 
1748 //========================================
1749 // External Buffer
1750 //========================================
1751 
1752 char16_t *
getBuffer(int32_t minCapacity)1753 UnicodeString::getBuffer(int32_t minCapacity) {
1754   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1755     fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
1756     setZeroLength();
1757     return getArrayStart();
1758   } else {
1759     return nullptr;
1760   }
1761 }
1762 
1763 void
releaseBuffer(int32_t newLength)1764 UnicodeString::releaseBuffer(int32_t newLength) {
1765   if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
1766     // set the new fLength
1767     int32_t capacity=getCapacity();
1768     if(newLength==-1) {
1769       // the new length is the string length, capped by fCapacity
1770       const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1771       while(p<limit && *p!=0) {
1772         ++p;
1773       }
1774       newLength=(int32_t)(p-array);
1775     } else if(newLength>capacity) {
1776       newLength=capacity;
1777     }
1778     setLength(newLength);
1779     fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
1780   }
1781 }
1782 
1783 //========================================
1784 // Miscellaneous
1785 //========================================
1786 UBool
cloneArrayIfNeeded(int32_t newCapacity,int32_t growCapacity,UBool doCopyArray,int32_t ** pBufferToDelete,UBool forceClone)1787 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
1788                                   int32_t growCapacity,
1789                                   UBool doCopyArray,
1790                                   int32_t **pBufferToDelete,
1791                                   UBool forceClone) {
1792   // default parameters need to be static, therefore
1793   // the defaults are -1 to have convenience defaults
1794   if(newCapacity == -1) {
1795     newCapacity = getCapacity();
1796   }
1797 
1798   // while a getBuffer(minCapacity) is "open",
1799   // prevent any modifications of the string by returning FALSE here
1800   // if the string is bogus, then only an assignment or similar can revive it
1801   if(!isWritable()) {
1802     return FALSE;
1803   }
1804 
1805   /*
1806    * We need to make a copy of the array if
1807    * the buffer is read-only, or
1808    * the buffer is refCounted (shared), and refCount>1, or
1809    * the buffer is too small.
1810    * Return FALSE if memory could not be allocated.
1811    */
1812   if(forceClone ||
1813      fUnion.fFields.fLengthAndFlags & kBufferIsReadonly ||
1814      (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) ||
1815      newCapacity > getCapacity()
1816   ) {
1817     // check growCapacity for default value and use of the stack buffer
1818     if(growCapacity < 0) {
1819       growCapacity = newCapacity;
1820     } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
1821       growCapacity = US_STACKBUF_SIZE;
1822     }
1823 
1824     // save old values
1825     UChar oldStackBuffer[US_STACKBUF_SIZE];
1826     UChar *oldArray;
1827     int32_t oldLength = length();
1828     int16_t flags = fUnion.fFields.fLengthAndFlags;
1829 
1830     if(flags&kUsingStackBuffer) {
1831       U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */
1832       if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
1833         // copy the stack buffer contents because it will be overwritten with
1834         // fUnion.fFields values
1835         us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength);
1836         oldArray = oldStackBuffer;
1837       } else {
1838         oldArray = NULL; // no need to copy from the stack buffer to itself
1839       }
1840     } else {
1841       oldArray = fUnion.fFields.fArray;
1842       U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
1843     }
1844 
1845     // allocate a new array
1846     if(allocate(growCapacity) ||
1847        (newCapacity < growCapacity && allocate(newCapacity))
1848     ) {
1849       if(doCopyArray) {
1850         // copy the contents
1851         // do not copy more than what fits - it may be smaller than before
1852         int32_t minLength = oldLength;
1853         newCapacity = getCapacity();
1854         if(newCapacity < minLength) {
1855           minLength = newCapacity;
1856         }
1857         if(oldArray != NULL) {
1858           us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
1859         }
1860         setLength(minLength);
1861       } else {
1862         setZeroLength();
1863       }
1864 
1865       // release the old array
1866       if(flags & kRefCounted) {
1867         // the array is refCounted; decrement and release if 0
1868         u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
1869         if(umtx_atomic_dec(pRefCount) == 0) {
1870           if(pBufferToDelete == 0) {
1871               // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
1872               // is defined as volatile. (Volatile has useful non-standard behavior
1873               //   with this compiler.)
1874             uprv_free((void *)pRefCount);
1875           } else {
1876             // the caller requested to delete it himself
1877             *pBufferToDelete = (int32_t *)pRefCount;
1878           }
1879         }
1880       }
1881     } else {
1882       // not enough memory for growCapacity and not even for the smaller newCapacity
1883       // reset the old values for setToBogus() to release the array
1884       if(!(flags&kUsingStackBuffer)) {
1885         fUnion.fFields.fArray = oldArray;
1886       }
1887       fUnion.fFields.fLengthAndFlags = flags;
1888       setToBogus();
1889       return FALSE;
1890     }
1891   }
1892   return TRUE;
1893 }
1894 
1895 // UnicodeStringAppendable ------------------------------------------------- ***
1896 
~UnicodeStringAppendable()1897 UnicodeStringAppendable::~UnicodeStringAppendable() {}
1898 
1899 UBool
appendCodeUnit(UChar c)1900 UnicodeStringAppendable::appendCodeUnit(UChar c) {
1901   return str.doAppend(&c, 0, 1).isWritable();
1902 }
1903 
1904 UBool
appendCodePoint(UChar32 c)1905 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1906   UChar buffer[U16_MAX_LENGTH];
1907   int32_t cLength = 0;
1908   UBool isError = FALSE;
1909   U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1910   return !isError && str.doAppend(buffer, 0, cLength).isWritable();
1911 }
1912 
1913 UBool
appendString(const UChar * s,int32_t length)1914 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1915   return str.doAppend(s, 0, length).isWritable();
1916 }
1917 
1918 UBool
reserveAppendCapacity(int32_t appendCapacity)1919 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1920   return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1921 }
1922 
1923 UChar *
getAppendBuffer(int32_t minCapacity,int32_t desiredCapacityHint,UChar * scratch,int32_t scratchCapacity,int32_t * resultCapacity)1924 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1925                                          int32_t desiredCapacityHint,
1926                                          UChar *scratch, int32_t scratchCapacity,
1927                                          int32_t *resultCapacity) {
1928   if(minCapacity < 1 || scratchCapacity < minCapacity) {
1929     *resultCapacity = 0;
1930     return NULL;
1931   }
1932   int32_t oldLength = str.length();
1933   if(minCapacity <= (kMaxCapacity - oldLength) &&
1934       desiredCapacityHint <= (kMaxCapacity - oldLength) &&
1935       str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1936     *resultCapacity = str.getCapacity() - oldLength;
1937     return str.getArrayStart() + oldLength;
1938   }
1939   *resultCapacity = scratchCapacity;
1940   return scratch;
1941 }
1942 
1943 U_NAMESPACE_END
1944 
1945 U_NAMESPACE_USE
1946 
1947 U_CAPI int32_t U_EXPORT2
uhash_hashUnicodeString(const UElement key)1948 uhash_hashUnicodeString(const UElement key) {
1949     const UnicodeString *str = (const UnicodeString*) key.pointer;
1950     return (str == NULL) ? 0 : str->hashCode();
1951 }
1952 
1953 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1954 // does not depend on hashtable code.
1955 U_CAPI UBool U_EXPORT2
uhash_compareUnicodeString(const UElement key1,const UElement key2)1956 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1957     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1958     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1959     if (str1 == str2) {
1960         return TRUE;
1961     }
1962     if (str1 == NULL || str2 == NULL) {
1963         return FALSE;
1964     }
1965     return *str1 == *str2;
1966 }
1967 
1968 #ifdef U_STATIC_IMPLEMENTATION
1969 /*
1970 This should never be called. It is defined here to make sure that the
1971 virtual vector deleting destructor is defined within unistr.cpp.
1972 The vector deleting destructor is already a part of UObject,
1973 but defining it here makes sure that it is included with this object file.
1974 This makes sure that static library dependencies are kept to a minimum.
1975 */
uprv_UnicodeStringDummy(void)1976 static void uprv_UnicodeStringDummy(void) {
1977     delete [] (new UnicodeString[2]);
1978 }
1979 #endif
1980