1 /*
2 ******************************************************************************
3 * Copyright (C) 1999-2014, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ******************************************************************************
6 *
7 * File unistr.cpp
8 *
9 * Modification History:
10 *
11 *   Date        Name        Description
12 *   09/25/98    stephen     Creation.
13 *   04/20/99    stephen     Overhauled per 4/16 code review.
14 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
15 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
16 *                           Replaceable.
17 *   06/25/01    grhoten     Removed the dependency on iostream
18 ******************************************************************************
19 */
20 
21 #include "unicode/utypes.h"
22 #include "unicode/appendable.h"
23 #include "unicode/putil.h"
24 #include "cstring.h"
25 #include "cmemory.h"
26 #include "unicode/ustring.h"
27 #include "unicode/unistr.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf16.h"
30 #include "uelement.h"
31 #include "ustr_imp.h"
32 #include "umutex.h"
33 #include "uassert.h"
34 
35 #if 0
36 
37 #include <iostream>
38 using namespace std;
39 
40 //DEBUGGING
41 void
42 print(const UnicodeString& s,
43       const char *name)
44 {
45   UChar c;
46   cout << name << ":|";
47   for(int i = 0; i < s.length(); ++i) {
48     c = s[i];
49     if(c>= 0x007E || c < 0x0020)
50       cout << "[0x" << hex << s[i] << "]";
51     else
52       cout << (char) s[i];
53   }
54   cout << '|' << endl;
55 }
56 
57 void
58 print(const UChar *s,
59       int32_t len,
60       const char *name)
61 {
62   UChar c;
63   cout << name << ":|";
64   for(int i = 0; i < len; ++i) {
65     c = s[i];
66     if(c>= 0x007E || c < 0x0020)
67       cout << "[0x" << hex << s[i] << "]";
68     else
69       cout << (char) s[i];
70   }
71   cout << '|' << endl;
72 }
73 // END DEBUGGING
74 #endif
75 
76 // Local function definitions for now
77 
78 // need to copy areas that may overlap
79 static
80 inline void
us_arrayCopy(const UChar * src,int32_t srcStart,UChar * dst,int32_t dstStart,int32_t count)81 us_arrayCopy(const UChar *src, int32_t srcStart,
82          UChar *dst, int32_t dstStart, int32_t count)
83 {
84   if(count>0) {
85     uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
86   }
87 }
88 
89 // u_unescapeAt() callback to get a UChar from a UnicodeString
90 U_CDECL_BEGIN
91 static UChar U_CALLCONV
UnicodeString_charAt(int32_t offset,void * context)92 UnicodeString_charAt(int32_t offset, void *context) {
93     return ((icu::UnicodeString*) context)->charAt(offset);
94 }
95 U_CDECL_END
96 
97 U_NAMESPACE_BEGIN
98 
99 /* The Replaceable virtual destructor can't be defined in the header
100    due to how AIX works with multiple definitions of virtual functions.
101 */
~Replaceable()102 Replaceable::~Replaceable() {}
103 
104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
105 
106 UnicodeString U_EXPORT2
107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
108     return
109         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
110             append(s1).
111                 append(s2);
112 }
113 
114 //========================================
115 // Reference Counting functions, put at top of file so that optimizing compilers
116 //                               have a chance to automatically inline.
117 //========================================
118 
119 void
addRef()120 UnicodeString::addRef() {
121   umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
122 }
123 
124 int32_t
removeRef()125 UnicodeString::removeRef() {
126   return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
127 }
128 
129 int32_t
refCount() const130 UnicodeString::refCount() const {
131   return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
132 }
133 
134 void
releaseArray()135 UnicodeString::releaseArray() {
136   if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
137     uprv_free((int32_t *)fUnion.fFields.fArray - 1);
138   }
139 }
140 
141 
142 
143 //========================================
144 // Constructors
145 //========================================
146 
147 // The default constructor is inline in unistr.h.
148 
UnicodeString(int32_t capacity,UChar32 c,int32_t count)149 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
150   fUnion.fFields.fLengthAndFlags = 0;
151   if(count <= 0 || (uint32_t)c > 0x10ffff) {
152     // just allocate and do not do anything else
153     allocate(capacity);
154   } else {
155     // count > 0, allocate and fill the new string with count c's
156     int32_t unitCount = U16_LENGTH(c), length = count * unitCount;
157     if(capacity < length) {
158       capacity = length;
159     }
160     if(allocate(capacity)) {
161       UChar *array = getArrayStart();
162       int32_t i = 0;
163 
164       // fill the new string with c
165       if(unitCount == 1) {
166         // fill with length UChars
167         while(i < length) {
168           array[i++] = (UChar)c;
169         }
170       } else {
171         // get the code units for c
172         UChar units[U16_MAX_LENGTH];
173         U16_APPEND_UNSAFE(units, i, c);
174 
175         // now it must be i==unitCount
176         i = 0;
177 
178         // for Unicode, unitCount can only be 1, 2, 3, or 4
179         // 1 is handled above
180         while(i < length) {
181           int32_t unitIdx = 0;
182           while(unitIdx < unitCount) {
183             array[i++]=units[unitIdx++];
184           }
185         }
186       }
187     }
188     setLength(length);
189   }
190 }
191 
UnicodeString(UChar ch)192 UnicodeString::UnicodeString(UChar ch) {
193   fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
194   fUnion.fStackFields.fBuffer[0] = ch;
195 }
196 
UnicodeString(UChar32 ch)197 UnicodeString::UnicodeString(UChar32 ch) {
198   fUnion.fFields.fLengthAndFlags = kShortString;
199   int32_t i = 0;
200   UBool isError = FALSE;
201   U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
202   // We test isError so that the compiler does not complain that we don't.
203   // If isError then i==0 which is what we want anyway.
204   if(!isError) {
205     setShortLength(i);
206   }
207 }
208 
UnicodeString(const UChar * text)209 UnicodeString::UnicodeString(const UChar *text) {
210   fUnion.fFields.fLengthAndFlags = kShortString;
211   doReplace(0, 0, text, 0, -1);
212 }
213 
UnicodeString(const UChar * text,int32_t textLength)214 UnicodeString::UnicodeString(const UChar *text,
215                              int32_t textLength) {
216   fUnion.fFields.fLengthAndFlags = kShortString;
217   doReplace(0, 0, text, 0, textLength);
218 }
219 
UnicodeString(UBool isTerminated,const UChar * text,int32_t textLength)220 UnicodeString::UnicodeString(UBool isTerminated,
221                              const UChar *text,
222                              int32_t textLength) {
223   fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
224   if(text == NULL) {
225     // treat as an empty string, do not alias
226     setToEmpty();
227   } else if(textLength < -1 ||
228             (textLength == -1 && !isTerminated) ||
229             (textLength >= 0 && isTerminated && text[textLength] != 0)
230   ) {
231     setToBogus();
232   } else {
233     if(textLength == -1) {
234       // text is terminated, or else it would have failed the above test
235       textLength = u_strlen(text);
236     }
237     setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
238   }
239 }
240 
UnicodeString(UChar * buff,int32_t buffLength,int32_t buffCapacity)241 UnicodeString::UnicodeString(UChar *buff,
242                              int32_t buffLength,
243                              int32_t buffCapacity) {
244   fUnion.fFields.fLengthAndFlags = kWritableAlias;
245   if(buff == NULL) {
246     // treat as an empty string, do not alias
247     setToEmpty();
248   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
249     setToBogus();
250   } else {
251     if(buffLength == -1) {
252       // fLength = u_strlen(buff); but do not look beyond buffCapacity
253       const UChar *p = buff, *limit = buff + buffCapacity;
254       while(p != limit && *p != 0) {
255         ++p;
256       }
257       buffLength = (int32_t)(p - buff);
258     }
259     setArray(buff, buffLength, buffCapacity);
260   }
261 }
262 
UnicodeString(const char * src,int32_t length,EInvariant)263 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
264   fUnion.fFields.fLengthAndFlags = kShortString;
265   if(src==NULL) {
266     // treat as an empty string
267   } else {
268     if(length<0) {
269       length=(int32_t)uprv_strlen(src);
270     }
271     if(cloneArrayIfNeeded(length, length, FALSE)) {
272       u_charsToUChars(src, getArrayStart(), length);
273       setLength(length);
274     } else {
275       setToBogus();
276     }
277   }
278 }
279 
280 #if U_CHARSET_IS_UTF8
281 
UnicodeString(const char * codepageData)282 UnicodeString::UnicodeString(const char *codepageData) {
283   fUnion.fFields.fLengthAndFlags = kShortString;
284   if(codepageData != 0) {
285     setToUTF8(codepageData);
286   }
287 }
288 
UnicodeString(const char * codepageData,int32_t dataLength)289 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
290   fUnion.fFields.fLengthAndFlags = kShortString;
291   // if there's nothing to convert, do nothing
292   if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
293     return;
294   }
295   if(dataLength == -1) {
296     dataLength = (int32_t)uprv_strlen(codepageData);
297   }
298   setToUTF8(StringPiece(codepageData, dataLength));
299 }
300 
301 // else see unistr_cnv.cpp
302 #endif
303 
// Copy constructor: starts as an empty short string, then copyFrom(that).
UnicodeString::UnicodeString(const UnicodeString& that) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  copyFrom(that);
}
308 
// Substring constructor: starts as an empty short string, then
// setTo(that, srcStart).
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  setTo(that, srcStart);
}
314 
// Substring constructor: starts as an empty short string, then
// setTo(that, srcStart, srcLength).
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart,
                             int32_t srcLength) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  setTo(that, srcStart, srcLength);
}
321 
322 // Replaceable base class clone() default implementation, does not clone
323 Replaceable *
clone() const324 Replaceable::clone() const {
325   return NULL;
326 }
327 
328 // UnicodeString overrides clone() with a real implementation
329 Replaceable *
clone() const330 UnicodeString::clone() const {
331   return new UnicodeString(*this);
332 }
333 
334 //========================================
335 // array allocation
336 //========================================
337 
338 UBool
allocate(int32_t capacity)339 UnicodeString::allocate(int32_t capacity) {
340   if(capacity <= US_STACKBUF_SIZE) {
341     fUnion.fFields.fLengthAndFlags = kShortString;
342   } else {
343     // count bytes for the refCounter and the string capacity, and
344     // round up to a multiple of 16; then divide by 4 and allocate int32_t's
345     // to be safely aligned for the refCount
346     // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
347     int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
348     int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
349     if(array != 0) {
350       // set initial refCount and point behind the refCount
351       *array++ = 1;
352 
353       // have fArray point to the first UChar
354       fUnion.fFields.fArray = (UChar *)array;
355       fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
356       fUnion.fFields.fLengthAndFlags = kLongString;
357     } else {
358       fUnion.fFields.fLengthAndFlags = kIsBogus;
359       fUnion.fFields.fArray = 0;
360       fUnion.fFields.fCapacity = 0;
361       return FALSE;
362     }
363   }
364   return TRUE;
365 }
366 
367 //========================================
368 // Destructor
369 //========================================
~UnicodeString()370 UnicodeString::~UnicodeString()
371 {
372   releaseArray();
373 }
374 
375 //========================================
376 // Factory methods
377 //========================================
378 
fromUTF8(const StringPiece & utf8)379 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
380   UnicodeString result;
381   result.setToUTF8(utf8);
382   return result;
383 }
384 
fromUTF32(const UChar32 * utf32,int32_t length)385 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
386   UnicodeString result;
387   int32_t capacity;
388   // Most UTF-32 strings will be BMP-only and result in a same-length
389   // UTF-16 string. We overestimate the capacity just slightly,
390   // just in case there are a few supplementary characters.
391   if(length <= US_STACKBUF_SIZE) {
392     capacity = US_STACKBUF_SIZE;
393   } else {
394     capacity = length + (length >> 4) + 4;
395   }
396   do {
397     UChar *utf16 = result.getBuffer(capacity);
398     int32_t length16;
399     UErrorCode errorCode = U_ZERO_ERROR;
400     u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
401         utf32, length,
402         0xfffd,  // Substitution character.
403         NULL,    // Don't care about number of substitutions.
404         &errorCode);
405     result.releaseBuffer(length16);
406     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
407       capacity = length16 + 1;  // +1 for the terminating NUL.
408       continue;
409     } else if(U_FAILURE(errorCode)) {
410       result.setToBogus();
411     }
412     break;
413   } while(TRUE);
414   return result;
415 }
416 
417 //========================================
418 // Assignment
419 //========================================
420 
421 UnicodeString &
operator =(const UnicodeString & src)422 UnicodeString::operator=(const UnicodeString &src) {
423   return copyFrom(src);
424 }
425 
426 UnicodeString &
fastCopyFrom(const UnicodeString & src)427 UnicodeString::fastCopyFrom(const UnicodeString &src) {
428   return copyFrom(src, TRUE);
429 }
430 
431 UnicodeString &
copyFrom(const UnicodeString & src,UBool fastCopy)432 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
433   // if assigning to ourselves, do nothing
434   if(this == &src) {
435     return *this;
436   }
437 
438   // is the right side bogus?
439   if(src.isBogus()) {
440     setToBogus();
441     return *this;
442   }
443 
444   // delete the current contents
445   releaseArray();
446 
447   if(src.isEmpty()) {
448     // empty string - use the stack buffer
449     setToEmpty();
450     return *this;
451   }
452 
453   // fLength>0 and not an "open" src.getBuffer(minCapacity)
454   fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
455   switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
456   case kShortString:
457     // short string using the stack buffer, do the same
458     uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
459                 getShortLength() * U_SIZEOF_UCHAR);
460     break;
461   case kLongString:
462     // src uses a refCounted string buffer, use that buffer with refCount
463     // src is const, use a cast - we don't actually change it
464     ((UnicodeString &)src).addRef();
465     // copy all fields, share the reference-counted buffer
466     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
467     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
468     if(!hasShortLength()) {
469       fUnion.fFields.fLength = src.fUnion.fFields.fLength;
470     }
471     break;
472   case kReadonlyAlias:
473     if(fastCopy) {
474       // src is a readonly alias, do the same
475       // -> maintain the readonly alias as such
476       fUnion.fFields.fArray = src.fUnion.fFields.fArray;
477       fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
478       if(!hasShortLength()) {
479         fUnion.fFields.fLength = src.fUnion.fFields.fLength;
480       }
481       break;
482     }
483     // else if(!fastCopy) fall through to case kWritableAlias
484     // -> allocate a new buffer and copy the contents
485   case kWritableAlias: {
486     // src is a writable alias; we make a copy of that instead
487     int32_t srcLength = src.length();
488     if(allocate(srcLength)) {
489       uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR);
490       setLength(srcLength);
491       break;
492     }
493     // if there is not enough memory, then fall through to setting to bogus
494   }
495   default:
496     // if src is bogus, set ourselves to bogus
497     // do not call setToBogus() here because fArray and flags are not consistent here
498     fUnion.fFields.fLengthAndFlags = kIsBogus;
499     fUnion.fFields.fArray = 0;
500     fUnion.fFields.fCapacity = 0;
501     break;
502   }
503 
504   return *this;
505 }
506 
507 //========================================
508 // Miscellaneous operations
509 //========================================
510 
unescape() const511 UnicodeString UnicodeString::unescape() const {
512     UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
513     const UChar *array = getBuffer();
514     int32_t len = length();
515     int32_t prev = 0;
516     for (int32_t i=0;;) {
517         if (i == len) {
518             result.append(array, prev, len - prev);
519             break;
520         }
521         if (array[i++] == 0x5C /*'\\'*/) {
522             result.append(array, prev, (i - 1) - prev);
523             UChar32 c = unescapeAt(i); // advances i
524             if (c < 0) {
525                 result.remove(); // return empty string
526                 break; // invalid escape sequence
527             }
528             result.append(c);
529             prev = i;
530         }
531     }
532     return result;
533 }
534 
unescapeAt(int32_t & offset) const535 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
536     return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
537 }
538 
539 //========================================
540 // Read-only implementation
541 //========================================
542 UBool
doEquals(const UnicodeString & text,int32_t len) const543 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
544   // Requires: this & text not bogus and have same lengths.
545   // Byte-wise comparison works for equality regardless of endianness.
546   return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
547 }
548 
549 int8_t
doCompare(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength) const550 UnicodeString::doCompare( int32_t start,
551               int32_t length,
552               const UChar *srcChars,
553               int32_t srcStart,
554               int32_t srcLength) const
555 {
556   // compare illegal string values
557   if(isBogus()) {
558     return -1;
559   }
560 
561   // pin indices to legal values
562   pinIndices(start, length);
563 
564   if(srcChars == NULL) {
565     // treat const UChar *srcChars==NULL as an empty string
566     return length == 0 ? 0 : 1;
567   }
568 
569   // get the correct pointer
570   const UChar *chars = getArrayStart();
571 
572   chars += start;
573   srcChars += srcStart;
574 
575   int32_t minLength;
576   int8_t lengthResult;
577 
578   // get the srcLength if necessary
579   if(srcLength < 0) {
580     srcLength = u_strlen(srcChars + srcStart);
581   }
582 
583   // are we comparing different lengths?
584   if(length != srcLength) {
585     if(length < srcLength) {
586       minLength = length;
587       lengthResult = -1;
588     } else {
589       minLength = srcLength;
590       lengthResult = 1;
591     }
592   } else {
593     minLength = length;
594     lengthResult = 0;
595   }
596 
597   /*
598    * note that uprv_memcmp() returns an int but we return an int8_t;
599    * we need to take care not to truncate the result -
600    * one way to do this is to right-shift the value to
601    * move the sign bit into the lower 8 bits and making sure that this
602    * does not become 0 itself
603    */
604 
605   if(minLength > 0 && chars != srcChars) {
606     int32_t result;
607 
608 #   if U_IS_BIG_ENDIAN
609       // big-endian: byte comparison works
610       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
611       if(result != 0) {
612         return (int8_t)(result >> 15 | 1);
613       }
614 #   else
615       // little-endian: compare UChar units
616       do {
617         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
618         if(result != 0) {
619           return (int8_t)(result >> 15 | 1);
620         }
621       } while(--minLength > 0);
622 #   endif
623   }
624   return lengthResult;
625 }
626 
627 /* String compare in code point order - doCompare() compares in code unit order. */
628 int8_t
doCompareCodePointOrder(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength) const629 UnicodeString::doCompareCodePointOrder(int32_t start,
630                                        int32_t length,
631                                        const UChar *srcChars,
632                                        int32_t srcStart,
633                                        int32_t srcLength) const
634 {
635   // compare illegal string values
636   // treat const UChar *srcChars==NULL as an empty string
637   if(isBogus()) {
638     return -1;
639   }
640 
641   // pin indices to legal values
642   pinIndices(start, length);
643 
644   if(srcChars == NULL) {
645     srcStart = srcLength = 0;
646   }
647 
648   int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
649   /* translate the 32-bit result into an 8-bit one */
650   if(diff!=0) {
651     return (int8_t)(diff >> 15 | 1);
652   } else {
653     return 0;
654   }
655 }
656 
657 int32_t
getLength() const658 UnicodeString::getLength() const {
659     return length();
660 }
661 
662 UChar
getCharAt(int32_t offset) const663 UnicodeString::getCharAt(int32_t offset) const {
664   return charAt(offset);
665 }
666 
667 UChar32
getChar32At(int32_t offset) const668 UnicodeString::getChar32At(int32_t offset) const {
669   return char32At(offset);
670 }
671 
672 UChar32
char32At(int32_t offset) const673 UnicodeString::char32At(int32_t offset) const
674 {
675   int32_t len = length();
676   if((uint32_t)offset < (uint32_t)len) {
677     const UChar *array = getArrayStart();
678     UChar32 c;
679     U16_GET(array, 0, offset, len, c);
680     return c;
681   } else {
682     return kInvalidUChar;
683   }
684 }
685 
686 int32_t
getChar32Start(int32_t offset) const687 UnicodeString::getChar32Start(int32_t offset) const {
688   if((uint32_t)offset < (uint32_t)length()) {
689     const UChar *array = getArrayStart();
690     U16_SET_CP_START(array, 0, offset);
691     return offset;
692   } else {
693     return 0;
694   }
695 }
696 
697 int32_t
getChar32Limit(int32_t offset) const698 UnicodeString::getChar32Limit(int32_t offset) const {
699   int32_t len = length();
700   if((uint32_t)offset < (uint32_t)len) {
701     const UChar *array = getArrayStart();
702     U16_SET_CP_LIMIT(array, 0, offset, len);
703     return offset;
704   } else {
705     return len;
706   }
707 }
708 
709 int32_t
countChar32(int32_t start,int32_t length) const710 UnicodeString::countChar32(int32_t start, int32_t length) const {
711   pinIndices(start, length);
712   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
713   return u_countChar32(getArrayStart()+start, length);
714 }
715 
716 UBool
hasMoreChar32Than(int32_t start,int32_t length,int32_t number) const717 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
718   pinIndices(start, length);
719   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
720   return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
721 }
722 
723 int32_t
moveIndex32(int32_t index,int32_t delta) const724 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
725   // pin index
726   int32_t len = length();
727   if(index<0) {
728     index=0;
729   } else if(index>len) {
730     index=len;
731   }
732 
733   const UChar *array = getArrayStart();
734   if(delta>0) {
735     U16_FWD_N(array, index, len, delta);
736   } else {
737     U16_BACK_N(array, 0, index, -delta);
738   }
739 
740   return index;
741 }
742 
743 void
doExtract(int32_t start,int32_t length,UChar * dst,int32_t dstStart) const744 UnicodeString::doExtract(int32_t start,
745              int32_t length,
746              UChar *dst,
747              int32_t dstStart) const
748 {
749   // pin indices to legal values
750   pinIndices(start, length);
751 
752   // do not copy anything if we alias dst itself
753   const UChar *array = getArrayStart();
754   if(array + start != dst + dstStart) {
755     us_arrayCopy(array, start, dst, dstStart, length);
756   }
757 }
758 
759 int32_t
extract(UChar * dest,int32_t destCapacity,UErrorCode & errorCode) const760 UnicodeString::extract(UChar *dest, int32_t destCapacity,
761                        UErrorCode &errorCode) const {
762   int32_t len = length();
763   if(U_SUCCESS(errorCode)) {
764     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
765       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
766     } else {
767       const UChar *array = getArrayStart();
768       if(len>0 && len<=destCapacity && array!=dest) {
769         uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR);
770       }
771       return u_terminateUChars(dest, destCapacity, len, &errorCode);
772     }
773   }
774 
775   return len;
776 }
777 
778 int32_t
extract(int32_t start,int32_t length,char * target,int32_t targetCapacity,enum EInvariant) const779 UnicodeString::extract(int32_t start,
780                        int32_t length,
781                        char *target,
782                        int32_t targetCapacity,
783                        enum EInvariant) const
784 {
785   // if the arguments are illegal, then do nothing
786   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
787     return 0;
788   }
789 
790   // pin the indices to legal values
791   pinIndices(start, length);
792 
793   if(length <= targetCapacity) {
794     u_UCharsToChars(getArrayStart() + start, target, length);
795   }
796   UErrorCode status = U_ZERO_ERROR;
797   return u_terminateChars(target, targetCapacity, length, &status);
798 }
799 
800 UnicodeString
tempSubString(int32_t start,int32_t len) const801 UnicodeString::tempSubString(int32_t start, int32_t len) const {
802   pinIndices(start, len);
803   const UChar *array = getBuffer();  // not getArrayStart() to check kIsBogus & kOpenGetBuffer
804   if(array==NULL) {
805     array=fUnion.fStackFields.fBuffer;  // anything not NULL because that would make an empty string
806     len=-2;  // bogus result string
807   }
808   return UnicodeString(FALSE, array + start, len);
809 }
810 
811 int32_t
toUTF8(int32_t start,int32_t len,char * target,int32_t capacity) const812 UnicodeString::toUTF8(int32_t start, int32_t len,
813                       char *target, int32_t capacity) const {
814   pinIndices(start, len);
815   int32_t length8;
816   UErrorCode errorCode = U_ZERO_ERROR;
817   u_strToUTF8WithSub(target, capacity, &length8,
818                      getBuffer() + start, len,
819                      0xFFFD,  // Standard substitution character.
820                      NULL,    // Don't care about number of substitutions.
821                      &errorCode);
822   return length8;
823 }
824 
825 #if U_CHARSET_IS_UTF8
826 
827 int32_t
extract(int32_t start,int32_t len,char * target,uint32_t dstSize) const828 UnicodeString::extract(int32_t start, int32_t len,
829                        char *target, uint32_t dstSize) const {
830   // if the arguments are illegal, then do nothing
831   if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
832     return 0;
833   }
834   return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
835 }
836 
837 // else see unistr_cnv.cpp
838 #endif
839 
840 void
extractBetween(int32_t start,int32_t limit,UnicodeString & target) const841 UnicodeString::extractBetween(int32_t start,
842                   int32_t limit,
843                   UnicodeString& target) const {
844   pinIndex(start);
845   pinIndex(limit);
846   doExtract(start, limit - start, target);
847 }
848 
849 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
850 // as many bytes as the source has UChars.
851 // The "worst cases" are writing systems like Indic, Thai and CJK with
852 // 3:1 bytes:UChars.
853 void
toUTF8(ByteSink & sink) const854 UnicodeString::toUTF8(ByteSink &sink) const {
855   int32_t length16 = length();
856   if(length16 != 0) {
857     char stackBuffer[1024];
858     int32_t capacity = (int32_t)sizeof(stackBuffer);
859     UBool utf8IsOwned = FALSE;
860     char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
861                                       3*length16,
862                                       stackBuffer, capacity,
863                                       &capacity);
864     int32_t length8 = 0;
865     UErrorCode errorCode = U_ZERO_ERROR;
866     u_strToUTF8WithSub(utf8, capacity, &length8,
867                        getBuffer(), length16,
868                        0xFFFD,  // Standard substitution character.
869                        NULL,    // Don't care about number of substitutions.
870                        &errorCode);
871     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
872       utf8 = (char *)uprv_malloc(length8);
873       if(utf8 != NULL) {
874         utf8IsOwned = TRUE;
875         errorCode = U_ZERO_ERROR;
876         u_strToUTF8WithSub(utf8, length8, &length8,
877                            getBuffer(), length16,
878                            0xFFFD,  // Standard substitution character.
879                            NULL,    // Don't care about number of substitutions.
880                            &errorCode);
881       } else {
882         errorCode = U_MEMORY_ALLOCATION_ERROR;
883       }
884     }
885     if(U_SUCCESS(errorCode)) {
886       sink.Append(utf8, length8);
887       sink.Flush();
888     }
889     if(utf8IsOwned) {
890       uprv_free(utf8);
891     }
892   }
893 }
894 
895 int32_t
toUTF32(UChar32 * utf32,int32_t capacity,UErrorCode & errorCode) const896 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
897   int32_t length32=0;
898   if(U_SUCCESS(errorCode)) {
899     // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
900     u_strToUTF32WithSub(utf32, capacity, &length32,
901         getBuffer(), length(),
902         0xfffd,  // Substitution character.
903         NULL,    // Don't care about number of substitutions.
904         &errorCode);
905   }
906   return length32;
907 }
908 
909 int32_t
indexOf(const UChar * srcChars,int32_t srcStart,int32_t srcLength,int32_t start,int32_t length) const910 UnicodeString::indexOf(const UChar *srcChars,
911                int32_t srcStart,
912                int32_t srcLength,
913                int32_t start,
914                int32_t length) const
915 {
916   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
917     return -1;
918   }
919 
920   // UnicodeString does not find empty substrings
921   if(srcLength < 0 && srcChars[srcStart] == 0) {
922     return -1;
923   }
924 
925   // get the indices within bounds
926   pinIndices(start, length);
927 
928   // find the first occurrence of the substring
929   const UChar *array = getArrayStart();
930   const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
931   if(match == NULL) {
932     return -1;
933   } else {
934     return (int32_t)(match - array);
935   }
936 }
937 
938 int32_t
doIndexOf(UChar c,int32_t start,int32_t length) const939 UnicodeString::doIndexOf(UChar c,
940              int32_t start,
941              int32_t length) const
942 {
943   // pin indices
944   pinIndices(start, length);
945 
946   // find the first occurrence of c
947   const UChar *array = getArrayStart();
948   const UChar *match = u_memchr(array + start, c, length);
949   if(match == NULL) {
950     return -1;
951   } else {
952     return (int32_t)(match - array);
953   }
954 }
955 
956 int32_t
doIndexOf(UChar32 c,int32_t start,int32_t length) const957 UnicodeString::doIndexOf(UChar32 c,
958                          int32_t start,
959                          int32_t length) const {
960   // pin indices
961   pinIndices(start, length);
962 
963   // find the first occurrence of c
964   const UChar *array = getArrayStart();
965   const UChar *match = u_memchr32(array + start, c, length);
966   if(match == NULL) {
967     return -1;
968   } else {
969     return (int32_t)(match - array);
970   }
971 }
972 
973 int32_t
lastIndexOf(const UChar * srcChars,int32_t srcStart,int32_t srcLength,int32_t start,int32_t length) const974 UnicodeString::lastIndexOf(const UChar *srcChars,
975                int32_t srcStart,
976                int32_t srcLength,
977                int32_t start,
978                int32_t length) const
979 {
980   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
981     return -1;
982   }
983 
984   // UnicodeString does not find empty substrings
985   if(srcLength < 0 && srcChars[srcStart] == 0) {
986     return -1;
987   }
988 
989   // get the indices within bounds
990   pinIndices(start, length);
991 
992   // find the last occurrence of the substring
993   const UChar *array = getArrayStart();
994   const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
995   if(match == NULL) {
996     return -1;
997   } else {
998     return (int32_t)(match - array);
999   }
1000 }
1001 
1002 int32_t
doLastIndexOf(UChar c,int32_t start,int32_t length) const1003 UnicodeString::doLastIndexOf(UChar c,
1004                  int32_t start,
1005                  int32_t length) const
1006 {
1007   if(isBogus()) {
1008     return -1;
1009   }
1010 
1011   // pin indices
1012   pinIndices(start, length);
1013 
1014   // find the last occurrence of c
1015   const UChar *array = getArrayStart();
1016   const UChar *match = u_memrchr(array + start, c, length);
1017   if(match == NULL) {
1018     return -1;
1019   } else {
1020     return (int32_t)(match - array);
1021   }
1022 }
1023 
1024 int32_t
doLastIndexOf(UChar32 c,int32_t start,int32_t length) const1025 UnicodeString::doLastIndexOf(UChar32 c,
1026                              int32_t start,
1027                              int32_t length) const {
1028   // pin indices
1029   pinIndices(start, length);
1030 
1031   // find the last occurrence of c
1032   const UChar *array = getArrayStart();
1033   const UChar *match = u_memrchr32(array + start, c, length);
1034   if(match == NULL) {
1035     return -1;
1036   } else {
1037     return (int32_t)(match - array);
1038   }
1039 }
1040 
1041 //========================================
1042 // Write implementation
1043 //========================================
1044 
1045 UnicodeString&
findAndReplace(int32_t start,int32_t length,const UnicodeString & oldText,int32_t oldStart,int32_t oldLength,const UnicodeString & newText,int32_t newStart,int32_t newLength)1046 UnicodeString::findAndReplace(int32_t start,
1047                   int32_t length,
1048                   const UnicodeString& oldText,
1049                   int32_t oldStart,
1050                   int32_t oldLength,
1051                   const UnicodeString& newText,
1052                   int32_t newStart,
1053                   int32_t newLength)
1054 {
1055   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1056     return *this;
1057   }
1058 
1059   pinIndices(start, length);
1060   oldText.pinIndices(oldStart, oldLength);
1061   newText.pinIndices(newStart, newLength);
1062 
1063   if(oldLength == 0) {
1064     return *this;
1065   }
1066 
1067   while(length > 0 && length >= oldLength) {
1068     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1069     if(pos < 0) {
1070       // no more oldText's here: done
1071       break;
1072     } else {
1073       // we found oldText, replace it by newText and go beyond it
1074       replace(pos, oldLength, newText, newStart, newLength);
1075       length -= pos + oldLength - start;
1076       start = pos + newLength;
1077     }
1078   }
1079 
1080   return *this;
1081 }
1082 
1083 
1084 void
setToBogus()1085 UnicodeString::setToBogus()
1086 {
1087   releaseArray();
1088 
1089   fUnion.fFields.fLengthAndFlags = kIsBogus;
1090   fUnion.fFields.fArray = 0;
1091   fUnion.fFields.fCapacity = 0;
1092 }
1093 
1094 // turn a bogus string into an empty one
1095 void
unBogus()1096 UnicodeString::unBogus() {
1097   if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
1098     setToEmpty();
1099   }
1100 }
1101 
1102 const UChar *
getTerminatedBuffer()1103 UnicodeString::getTerminatedBuffer() {
1104   if(!isWritable()) {
1105     return 0;
1106   }
1107   UChar *array = getArrayStart();
1108   int32_t len = length();
1109   if(len < getCapacity()) {
1110     if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
1111       // If len<capacity on a read-only alias, then array[len] is
1112       // either the original NUL (if constructed with (TRUE, s, length))
1113       // or one of the original string contents characters (if later truncated),
1114       // therefore we can assume that array[len] is initialized memory.
1115       if(array[len] == 0) {
1116         return array;
1117       }
1118     } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
1119       // kRefCounted: Do not write the NUL if the buffer is shared.
1120       // That is mostly safe, except when the length of one copy was modified
1121       // without copy-on-write, e.g., via truncate(newLength) or remove(void).
1122       // Then the NUL would be written into the middle of another copy's string.
1123 
1124       // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
1125       // Do not test if there is a NUL already because it might be uninitialized memory.
1126       // (That would be safe, but tools like valgrind & Purify would complain.)
1127       array[len] = 0;
1128       return array;
1129     }
1130   }
1131   if(cloneArrayIfNeeded(len+1)) {
1132     array = getArrayStart();
1133     array[len] = 0;
1134     return array;
1135   } else {
1136     return NULL;
1137   }
1138 }
1139 
1140 // setTo() analogous to the readonly-aliasing constructor with the same signature
1141 UnicodeString &
setTo(UBool isTerminated,const UChar * text,int32_t textLength)1142 UnicodeString::setTo(UBool isTerminated,
1143                      const UChar *text,
1144                      int32_t textLength)
1145 {
1146   if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1147     // do not modify a string that has an "open" getBuffer(minCapacity)
1148     return *this;
1149   }
1150 
1151   if(text == NULL) {
1152     // treat as an empty string, do not alias
1153     releaseArray();
1154     setToEmpty();
1155     return *this;
1156   }
1157 
1158   if( textLength < -1 ||
1159       (textLength == -1 && !isTerminated) ||
1160       (textLength >= 0 && isTerminated && text[textLength] != 0)
1161   ) {
1162     setToBogus();
1163     return *this;
1164   }
1165 
1166   releaseArray();
1167 
1168   if(textLength == -1) {
1169     // text is terminated, or else it would have failed the above test
1170     textLength = u_strlen(text);
1171   }
1172   fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
1173   setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1174   return *this;
1175 }
1176 
1177 // setTo() analogous to the writable-aliasing constructor with the same signature
1178 UnicodeString &
setTo(UChar * buffer,int32_t buffLength,int32_t buffCapacity)1179 UnicodeString::setTo(UChar *buffer,
1180                      int32_t buffLength,
1181                      int32_t buffCapacity) {
1182   if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1183     // do not modify a string that has an "open" getBuffer(minCapacity)
1184     return *this;
1185   }
1186 
1187   if(buffer == NULL) {
1188     // treat as an empty string, do not alias
1189     releaseArray();
1190     setToEmpty();
1191     return *this;
1192   }
1193 
1194   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1195     setToBogus();
1196     return *this;
1197   } else if(buffLength == -1) {
1198     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1199     const UChar *p = buffer, *limit = buffer + buffCapacity;
1200     while(p != limit && *p != 0) {
1201       ++p;
1202     }
1203     buffLength = (int32_t)(p - buffer);
1204   }
1205 
1206   releaseArray();
1207 
1208   fUnion.fFields.fLengthAndFlags = kWritableAlias;
1209   setArray(buffer, buffLength, buffCapacity);
1210   return *this;
1211 }
1212 
setToUTF8(const StringPiece & utf8)1213 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) {
1214   unBogus();
1215   int32_t length = utf8.length();
1216   int32_t capacity;
1217   // The UTF-16 string will be at most as long as the UTF-8 string.
1218   if(length <= US_STACKBUF_SIZE) {
1219     capacity = US_STACKBUF_SIZE;
1220   } else {
1221     capacity = length + 1;  // +1 for the terminating NUL.
1222   }
1223   UChar *utf16 = getBuffer(capacity);
1224   int32_t length16;
1225   UErrorCode errorCode = U_ZERO_ERROR;
1226   u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1227       utf8.data(), length,
1228       0xfffd,  // Substitution character.
1229       NULL,    // Don't care about number of substitutions.
1230       &errorCode);
1231   releaseBuffer(length16);
1232   if(U_FAILURE(errorCode)) {
1233     setToBogus();
1234   }
1235   return *this;
1236 }
1237 
1238 UnicodeString&
setCharAt(int32_t offset,UChar c)1239 UnicodeString::setCharAt(int32_t offset,
1240              UChar c)
1241 {
1242   int32_t len = length();
1243   if(cloneArrayIfNeeded() && len > 0) {
1244     if(offset < 0) {
1245       offset = 0;
1246     } else if(offset >= len) {
1247       offset = len - 1;
1248     }
1249 
1250     getArrayStart()[offset] = c;
1251   }
1252   return *this;
1253 }
1254 
1255 UnicodeString&
replace(int32_t start,int32_t _length,UChar32 srcChar)1256 UnicodeString::replace(int32_t start,
1257                int32_t _length,
1258                UChar32 srcChar) {
1259   UChar buffer[U16_MAX_LENGTH];
1260   int32_t count = 0;
1261   UBool isError = FALSE;
1262   U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1263   // We test isError so that the compiler does not complain that we don't.
1264   // If isError (srcChar is not a valid code point) then count==0 which means
1265   // we remove the source segment rather than replacing it with srcChar.
1266   return doReplace(start, _length, buffer, 0, isError ? 0 : count);
1267 }
1268 
1269 UnicodeString&
append(UChar32 srcChar)1270 UnicodeString::append(UChar32 srcChar) {
1271   UChar buffer[U16_MAX_LENGTH];
1272   int32_t _length = 0;
1273   UBool isError = FALSE;
1274   U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1275   // We test isError so that the compiler does not complain that we don't.
1276   // If isError then _length==0 which turns the doReplace() into a no-op anyway.
1277   return isError ? *this : doReplace(length(), 0, buffer, 0, _length);
1278 }
1279 
1280 UnicodeString&
doReplace(int32_t start,int32_t length,const UnicodeString & src,int32_t srcStart,int32_t srcLength)1281 UnicodeString::doReplace( int32_t start,
1282               int32_t length,
1283               const UnicodeString& src,
1284               int32_t srcStart,
1285               int32_t srcLength)
1286 {
1287   if(!src.isBogus()) {
1288     // pin the indices to legal values
1289     src.pinIndices(srcStart, srcLength);
1290 
1291     // get the characters from src
1292     // and replace the range in ourselves with them
1293     return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1294   } else {
1295     // remove the range
1296     return doReplace(start, length, 0, 0, 0);
1297   }
1298 }
1299 
1300 UnicodeString&
doReplace(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength)1301 UnicodeString::doReplace(int32_t start,
1302              int32_t length,
1303              const UChar *srcChars,
1304              int32_t srcStart,
1305              int32_t srcLength)
1306 {
1307   if(!isWritable()) {
1308     return *this;
1309   }
1310 
1311   int32_t oldLength = this->length();
1312 
1313   // optimize (read-only alias).remove(0, start) and .remove(start, end)
1314   if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
1315     if(start == 0) {
1316       // remove prefix by adjusting the array pointer
1317       pinIndex(length);
1318       fUnion.fFields.fArray += length;
1319       fUnion.fFields.fCapacity -= length;
1320       setLength(oldLength - length);
1321       return *this;
1322     } else {
1323       pinIndex(start);
1324       if(length >= (oldLength - start)) {
1325         // remove suffix by reducing the length (like truncate())
1326         setLength(start);
1327         fUnion.fFields.fCapacity = start;  // not NUL-terminated any more
1328         return *this;
1329       }
1330     }
1331   }
1332 
1333   if(srcChars == 0) {
1334     srcStart = srcLength = 0;
1335   } else if(srcLength < 0) {
1336     // get the srcLength if necessary
1337     srcLength = u_strlen(srcChars + srcStart);
1338   }
1339 
1340   // calculate the size of the string after the replace
1341   int32_t newLength;
1342 
1343   // optimize append() onto a large-enough, owned string
1344   if(start >= oldLength) {
1345     if(srcLength == 0) {
1346       return *this;
1347     }
1348     newLength = oldLength + srcLength;
1349     if(newLength <= getCapacity() && isBufferWritable()) {
1350       UChar *oldArray = getArrayStart();
1351       // Do not copy characters when
1352       //   UChar *buffer=str.getAppendBuffer(...);
1353       // is followed by
1354       //   str.append(buffer, length);
1355       // or
1356       //   str.appendString(buffer, length)
1357       // or similar.
1358       if(srcChars + srcStart != oldArray + start || start > oldLength) {
1359         us_arrayCopy(srcChars, srcStart, oldArray, oldLength, srcLength);
1360       }
1361       setLength(newLength);
1362       return *this;
1363     } else {
1364       // pin the indices to legal values
1365       start = oldLength;
1366       length = 0;
1367     }
1368   } else {
1369     // pin the indices to legal values
1370     pinIndices(start, length);
1371 
1372     newLength = oldLength - length + srcLength;
1373   }
1374 
1375   // the following may change fArray but will not copy the current contents;
1376   // therefore we need to keep the current fArray
1377   UChar oldStackBuffer[US_STACKBUF_SIZE];
1378   UChar *oldArray;
1379   if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
1380     // copy the stack buffer contents because it will be overwritten with
1381     // fUnion.fFields values
1382     u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength);
1383     oldArray = oldStackBuffer;
1384   } else {
1385     oldArray = getArrayStart();
1386   }
1387 
1388   // clone our array and allocate a bigger array if needed
1389   int32_t *bufferToDelete = 0;
1390   if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize,
1391                          FALSE, &bufferToDelete)
1392   ) {
1393     return *this;
1394   }
1395 
1396   // now do the replace
1397 
1398   UChar *newArray = getArrayStart();
1399   if(newArray != oldArray) {
1400     // if fArray changed, then we need to copy everything except what will change
1401     us_arrayCopy(oldArray, 0, newArray, 0, start);
1402     us_arrayCopy(oldArray, start + length,
1403                  newArray, start + srcLength,
1404                  oldLength - (start + length));
1405   } else if(length != srcLength) {
1406     // fArray did not change; copy only the portion that isn't changing, leaving a hole
1407     us_arrayCopy(oldArray, start + length,
1408                  newArray, start + srcLength,
1409                  oldLength - (start + length));
1410   }
1411 
1412   // now fill in the hole with the new string
1413   us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
1414 
1415   setLength(newLength);
1416 
1417   // delayed delete in case srcChars == fArray when we started, and
1418   // to keep oldArray alive for the above operations
1419   if (bufferToDelete) {
1420     uprv_free(bufferToDelete);
1421   }
1422 
1423   return *this;
1424 }
1425 
1426 /**
1427  * Replaceable API
1428  */
1429 void
handleReplaceBetween(int32_t start,int32_t limit,const UnicodeString & text)1430 UnicodeString::handleReplaceBetween(int32_t start,
1431                                     int32_t limit,
1432                                     const UnicodeString& text) {
1433     replaceBetween(start, limit, text);
1434 }
1435 
1436 /**
1437  * Replaceable API
1438  */
1439 void
copy(int32_t start,int32_t limit,int32_t dest)1440 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1441     if (limit <= start) {
1442         return; // Nothing to do; avoid bogus malloc call
1443     }
1444     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1445     // Check to make sure text is not null.
1446     if (text != NULL) {
1447 	    extractBetween(start, limit, text, 0);
1448 	    insert(dest, text, 0, limit - start);
1449 	    uprv_free(text);
1450     }
1451 }
1452 
1453 /**
1454  * Replaceable API
1455  *
1456  * NOTE: This is for the Replaceable class.  There is no rep.cpp,
1457  * so we implement this function here.
1458  */
hasMetaData() const1459 UBool Replaceable::hasMetaData() const {
1460     return TRUE;
1461 }
1462 
1463 /**
1464  * Replaceable API
1465  */
hasMetaData() const1466 UBool UnicodeString::hasMetaData() const {
1467     return FALSE;
1468 }
1469 
1470 UnicodeString&
doReverse(int32_t start,int32_t length)1471 UnicodeString::doReverse(int32_t start, int32_t length) {
1472   if(length <= 1 || !cloneArrayIfNeeded()) {
1473     return *this;
1474   }
1475 
1476   // pin the indices to legal values
1477   pinIndices(start, length);
1478   if(length <= 1) {  // pinIndices() might have shrunk the length
1479     return *this;
1480   }
1481 
1482   UChar *left = getArrayStart() + start;
1483   UChar *right = left + length - 1;  // -1 for inclusive boundary (length>=2)
1484   UChar swap;
1485   UBool hasSupplementary = FALSE;
1486 
1487   // Before the loop we know left<right because length>=2.
1488   do {
1489     hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
1490     hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
1491     *right-- = swap;
1492   } while(left < right);
1493   // Make sure to test the middle code unit of an odd-length string.
1494   // Redundant if the length is even.
1495   hasSupplementary |= (UBool)U16_IS_LEAD(*left);
1496 
1497   /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1498   if(hasSupplementary) {
1499     UChar swap2;
1500 
1501     left = getArrayStart() + start;
1502     right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
1503     while(left < right) {
1504       if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
1505         *left++ = swap2;
1506         *left++ = swap;
1507       } else {
1508         ++left;
1509       }
1510     }
1511   }
1512 
1513   return *this;
1514 }
1515 
1516 UBool
padLeading(int32_t targetLength,UChar padChar)1517 UnicodeString::padLeading(int32_t targetLength,
1518                           UChar padChar)
1519 {
1520   int32_t oldLength = length();
1521   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1522     return FALSE;
1523   } else {
1524     // move contents up by padding width
1525     UChar *array = getArrayStart();
1526     int32_t start = targetLength - oldLength;
1527     us_arrayCopy(array, 0, array, start, oldLength);
1528 
1529     // fill in padding character
1530     while(--start >= 0) {
1531       array[start] = padChar;
1532     }
1533     setLength(targetLength);
1534     return TRUE;
1535   }
1536 }
1537 
1538 UBool
padTrailing(int32_t targetLength,UChar padChar)1539 UnicodeString::padTrailing(int32_t targetLength,
1540                            UChar padChar)
1541 {
1542   int32_t oldLength = length();
1543   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1544     return FALSE;
1545   } else {
1546     // fill in padding character
1547     UChar *array = getArrayStart();
1548     int32_t length = targetLength;
1549     while(--length >= oldLength) {
1550       array[length] = padChar;
1551     }
1552     setLength(targetLength);
1553     return TRUE;
1554   }
1555 }
1556 
1557 //========================================
1558 // Hashing
1559 //========================================
1560 int32_t
doHashCode() const1561 UnicodeString::doHashCode() const
1562 {
1563     /* Delegate hash computation to uhash.  This makes UnicodeString
1564      * hashing consistent with UChar* hashing.  */
1565     int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1566     if (hashCode == kInvalidHashCode) {
1567         hashCode = kEmptyHashCode;
1568     }
1569     return hashCode;
1570 }
1571 
1572 //========================================
1573 // External Buffer
1574 //========================================
1575 
1576 UChar *
getBuffer(int32_t minCapacity)1577 UnicodeString::getBuffer(int32_t minCapacity) {
1578   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1579     fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
1580     setZeroLength();
1581     return getArrayStart();
1582   } else {
1583     return 0;
1584   }
1585 }
1586 
1587 void
releaseBuffer(int32_t newLength)1588 UnicodeString::releaseBuffer(int32_t newLength) {
1589   if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
1590     // set the new fLength
1591     int32_t capacity=getCapacity();
1592     if(newLength==-1) {
1593       // the new length is the string length, capped by fCapacity
1594       const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1595       while(p<limit && *p!=0) {
1596         ++p;
1597       }
1598       newLength=(int32_t)(p-array);
1599     } else if(newLength>capacity) {
1600       newLength=capacity;
1601     }
1602     setLength(newLength);
1603     fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
1604   }
1605 }
1606 
1607 //========================================
1608 // Miscellaneous
1609 //========================================
1610 UBool
cloneArrayIfNeeded(int32_t newCapacity,int32_t growCapacity,UBool doCopyArray,int32_t ** pBufferToDelete,UBool forceClone)1611 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
1612                                   int32_t growCapacity,
1613                                   UBool doCopyArray,
1614                                   int32_t **pBufferToDelete,
1615                                   UBool forceClone) {
1616   // default parameters need to be static, therefore
1617   // the defaults are -1 to have convenience defaults
1618   if(newCapacity == -1) {
1619     newCapacity = getCapacity();
1620   }
1621 
1622   // while a getBuffer(minCapacity) is "open",
1623   // prevent any modifications of the string by returning FALSE here
1624   // if the string is bogus, then only an assignment or similar can revive it
1625   if(!isWritable()) {
1626     return FALSE;
1627   }
1628 
1629   /*
1630    * We need to make a copy of the array if
1631    * the buffer is read-only, or
1632    * the buffer is refCounted (shared), and refCount>1, or
1633    * the buffer is too small.
1634    * Return FALSE if memory could not be allocated.
1635    */
1636   if(forceClone ||
1637      fUnion.fFields.fLengthAndFlags & kBufferIsReadonly ||
1638      (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) ||
1639      newCapacity > getCapacity()
1640   ) {
1641     // check growCapacity for default value and use of the stack buffer
1642     if(growCapacity < 0) {
1643       growCapacity = newCapacity;
1644     } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
1645       growCapacity = US_STACKBUF_SIZE;
1646     }
1647 
1648     // save old values
1649     UChar oldStackBuffer[US_STACKBUF_SIZE];
1650     UChar *oldArray;
1651     int32_t oldLength = length();
1652     int16_t flags = fUnion.fFields.fLengthAndFlags;
1653 
1654     if(flags&kUsingStackBuffer) {
1655       U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */
1656       if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
1657         // copy the stack buffer contents because it will be overwritten with
1658         // fUnion.fFields values
1659         us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength);
1660         oldArray = oldStackBuffer;
1661       } else {
1662         oldArray = NULL; // no need to copy from the stack buffer to itself
1663       }
1664     } else {
1665       oldArray = fUnion.fFields.fArray;
1666       U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
1667     }
1668 
1669     // allocate a new array
1670     if(allocate(growCapacity) ||
1671        (newCapacity < growCapacity && allocate(newCapacity))
1672     ) {
1673       if(doCopyArray) {
1674         // copy the contents
1675         // do not copy more than what fits - it may be smaller than before
1676         int32_t minLength = oldLength;
1677         newCapacity = getCapacity();
1678         if(newCapacity < minLength) {
1679           minLength = newCapacity;
1680         }
1681         if(oldArray != NULL) {
1682           us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
1683         }
1684         setLength(minLength);
1685       } else {
1686         setZeroLength();
1687       }
1688 
1689       // release the old array
1690       if(flags & kRefCounted) {
1691         // the array is refCounted; decrement and release if 0
1692         u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
1693         if(umtx_atomic_dec(pRefCount) == 0) {
1694           if(pBufferToDelete == 0) {
1695               // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
1696               // is defined as volatile. (Volatile has useful non-standard behavior
1697               //   with this compiler.)
1698             uprv_free((void *)pRefCount);
1699           } else {
1700             // the caller requested to delete it himself
1701             *pBufferToDelete = (int32_t *)pRefCount;
1702           }
1703         }
1704       }
1705     } else {
1706       // not enough memory for growCapacity and not even for the smaller newCapacity
1707       // reset the old values for setToBogus() to release the array
1708       if(!(flags&kUsingStackBuffer)) {
1709         fUnion.fFields.fArray = oldArray;
1710       }
1711       fUnion.fFields.fLengthAndFlags = flags;
1712       setToBogus();
1713       return FALSE;
1714     }
1715   }
1716   return TRUE;
1717 }
1718 
1719 // UnicodeStringAppendable ------------------------------------------------- ***
1720 
~UnicodeStringAppendable()1721 UnicodeStringAppendable::~UnicodeStringAppendable() {}
1722 
1723 UBool
appendCodeUnit(UChar c)1724 UnicodeStringAppendable::appendCodeUnit(UChar c) {
1725   return str.doReplace(str.length(), 0, &c, 0, 1).isWritable();
1726 }
1727 
1728 UBool
appendCodePoint(UChar32 c)1729 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1730   UChar buffer[U16_MAX_LENGTH];
1731   int32_t cLength = 0;
1732   UBool isError = FALSE;
1733   U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1734   return !isError && str.doReplace(str.length(), 0, buffer, 0, cLength).isWritable();
1735 }
1736 
1737 UBool
appendString(const UChar * s,int32_t length)1738 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1739   return str.doReplace(str.length(), 0, s, 0, length).isWritable();
1740 }
1741 
1742 UBool
reserveAppendCapacity(int32_t appendCapacity)1743 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1744   return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1745 }
1746 
1747 UChar *
getAppendBuffer(int32_t minCapacity,int32_t desiredCapacityHint,UChar * scratch,int32_t scratchCapacity,int32_t * resultCapacity)1748 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1749                                          int32_t desiredCapacityHint,
1750                                          UChar *scratch, int32_t scratchCapacity,
1751                                          int32_t *resultCapacity) {
1752   if(minCapacity < 1 || scratchCapacity < minCapacity) {
1753     *resultCapacity = 0;
1754     return NULL;
1755   }
1756   int32_t oldLength = str.length();
1757   if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1758     *resultCapacity = str.getCapacity() - oldLength;
1759     return str.getArrayStart() + oldLength;
1760   }
1761   *resultCapacity = scratchCapacity;
1762   return scratch;
1763 }
1764 
1765 U_NAMESPACE_END
1766 
1767 U_NAMESPACE_USE
1768 
1769 U_CAPI int32_t U_EXPORT2
uhash_hashUnicodeString(const UElement key)1770 uhash_hashUnicodeString(const UElement key) {
1771     const UnicodeString *str = (const UnicodeString*) key.pointer;
1772     return (str == NULL) ? 0 : str->hashCode();
1773 }
1774 
1775 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1776 // does not depend on hashtable code.
1777 U_CAPI UBool U_EXPORT2
uhash_compareUnicodeString(const UElement key1,const UElement key2)1778 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1779     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1780     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1781     if (str1 == str2) {
1782         return TRUE;
1783     }
1784     if (str1 == NULL || str2 == NULL) {
1785         return FALSE;
1786     }
1787     return *str1 == *str2;
1788 }
1789 
#ifdef U_STATIC_IMPLEMENTATION
/*
This function is never meant to be called. It exists only so that the
virtual vector deleting destructor gets defined within unistr.cpp.
UObject already has a vector deleting destructor, but defining one here
makes sure that it is included with this object file, which keeps
static library dependencies to a minimum.
*/
static void uprv_UnicodeStringDummy(void) {
    UnicodeString *pair = new UnicodeString[2];
    delete [] pair;
}
#endif
1802