1 /*
2 ******************************************************************************
3 * Copyright (C) 1999-2015, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ******************************************************************************
6 *
7 * File unistr.cpp
8 *
9 * Modification History:
10 *
11 *   Date        Name        Description
12 *   09/25/98    stephen     Creation.
13 *   04/20/99    stephen     Overhauled per 4/16 code review.
14 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
15 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
16 *                           Replaceable.
17 *   06/25/01    grhoten     Removed the dependency on iostream
18 ******************************************************************************
19 */
20 
21 #include "unicode/utypes.h"
22 #include "unicode/appendable.h"
23 #include "unicode/putil.h"
24 #include "cstring.h"
25 #include "cmemory.h"
26 #include "unicode/ustring.h"
27 #include "unicode/unistr.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf16.h"
30 #include "uelement.h"
31 #include "ustr_imp.h"
32 #include "umutex.h"
33 #include "uassert.h"
34 
35 #if 0
36 
37 #include <iostream>
38 using namespace std;
39 
40 //DEBUGGING
41 void
42 print(const UnicodeString& s,
43       const char *name)
44 {
45   UChar c;
46   cout << name << ":|";
47   for(int i = 0; i < s.length(); ++i) {
48     c = s[i];
49     if(c>= 0x007E || c < 0x0020)
50       cout << "[0x" << hex << s[i] << "]";
51     else
52       cout << (char) s[i];
53   }
54   cout << '|' << endl;
55 }
56 
57 void
58 print(const UChar *s,
59       int32_t len,
60       const char *name)
61 {
62   UChar c;
63   cout << name << ":|";
64   for(int i = 0; i < len; ++i) {
65     c = s[i];
66     if(c>= 0x007E || c < 0x0020)
67       cout << "[0x" << hex << s[i] << "]";
68     else
69       cout << (char) s[i];
70   }
71   cout << '|' << endl;
72 }
73 // END DEBUGGING
74 #endif
75 
76 // Local function definitions for now
77 
78 // need to copy areas that may overlap
79 static
80 inline void
us_arrayCopy(const UChar * src,int32_t srcStart,UChar * dst,int32_t dstStart,int32_t count)81 us_arrayCopy(const UChar *src, int32_t srcStart,
82          UChar *dst, int32_t dstStart, int32_t count)
83 {
84   if(count>0) {
85     uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
86   }
87 }
88 
89 // u_unescapeAt() callback to get a UChar from a UnicodeString
90 U_CDECL_BEGIN
91 static UChar U_CALLCONV
UnicodeString_charAt(int32_t offset,void * context)92 UnicodeString_charAt(int32_t offset, void *context) {
93     return ((icu::UnicodeString*) context)->charAt(offset);
94 }
95 U_CDECL_END
96 
97 U_NAMESPACE_BEGIN
98 
99 /* The Replaceable virtual destructor can't be defined in the header
100    due to how AIX works with multiple definitions of virtual functions.
101 */
~Replaceable()102 Replaceable::~Replaceable() {}
103 
104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
105 
106 UnicodeString U_EXPORT2
107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
108     return
109         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
110             append(s1).
111                 append(s2);
112 }
113 
114 //========================================
115 // Reference Counting functions, put at top of file so that optimizing compilers
116 //                               have a chance to automatically inline.
117 //========================================
118 
119 void
addRef()120 UnicodeString::addRef() {
121   umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
122 }
123 
124 int32_t
removeRef()125 UnicodeString::removeRef() {
126   return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
127 }
128 
129 int32_t
refCount() const130 UnicodeString::refCount() const {
131   return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
132 }
133 
134 void
releaseArray()135 UnicodeString::releaseArray() {
136   if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
137     uprv_free((int32_t *)fUnion.fFields.fArray - 1);
138   }
139 }
140 
141 
142 
143 //========================================
144 // Constructors
145 //========================================
146 
147 // The default constructor is inline in unistr.h.
148 
UnicodeString(int32_t capacity,UChar32 c,int32_t count)149 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
150   fUnion.fFields.fLengthAndFlags = 0;
151   if(count <= 0 || (uint32_t)c > 0x10ffff) {
152     // just allocate and do not do anything else
153     allocate(capacity);
154   } else {
155     // count > 0, allocate and fill the new string with count c's
156     int32_t unitCount = U16_LENGTH(c), length = count * unitCount;
157     if(capacity < length) {
158       capacity = length;
159     }
160     if(allocate(capacity)) {
161       UChar *array = getArrayStart();
162       int32_t i = 0;
163 
164       // fill the new string with c
165       if(unitCount == 1) {
166         // fill with length UChars
167         while(i < length) {
168           array[i++] = (UChar)c;
169         }
170       } else {
171         // get the code units for c
172         UChar units[U16_MAX_LENGTH];
173         U16_APPEND_UNSAFE(units, i, c);
174 
175         // now it must be i==unitCount
176         i = 0;
177 
178         // for Unicode, unitCount can only be 1, 2, 3, or 4
179         // 1 is handled above
180         while(i < length) {
181           int32_t unitIdx = 0;
182           while(unitIdx < unitCount) {
183             array[i++]=units[unitIdx++];
184           }
185         }
186       }
187     }
188     setLength(length);
189   }
190 }
191 
UnicodeString(UChar ch)192 UnicodeString::UnicodeString(UChar ch) {
193   fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
194   fUnion.fStackFields.fBuffer[0] = ch;
195 }
196 
UnicodeString(UChar32 ch)197 UnicodeString::UnicodeString(UChar32 ch) {
198   fUnion.fFields.fLengthAndFlags = kShortString;
199   int32_t i = 0;
200   UBool isError = FALSE;
201   U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
202   // We test isError so that the compiler does not complain that we don't.
203   // If isError then i==0 which is what we want anyway.
204   if(!isError) {
205     setShortLength(i);
206   }
207 }
208 
UnicodeString(const UChar * text)209 UnicodeString::UnicodeString(const UChar *text) {
210   fUnion.fFields.fLengthAndFlags = kShortString;
211   doAppend(text, 0, -1);
212 }
213 
UnicodeString(const UChar * text,int32_t textLength)214 UnicodeString::UnicodeString(const UChar *text,
215                              int32_t textLength) {
216   fUnion.fFields.fLengthAndFlags = kShortString;
217   doAppend(text, 0, textLength);
218 }
219 
UnicodeString(UBool isTerminated,const UChar * text,int32_t textLength)220 UnicodeString::UnicodeString(UBool isTerminated,
221                              const UChar *text,
222                              int32_t textLength) {
223   fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
224   if(text == NULL) {
225     // treat as an empty string, do not alias
226     setToEmpty();
227   } else if(textLength < -1 ||
228             (textLength == -1 && !isTerminated) ||
229             (textLength >= 0 && isTerminated && text[textLength] != 0)
230   ) {
231     setToBogus();
232   } else {
233     if(textLength == -1) {
234       // text is terminated, or else it would have failed the above test
235       textLength = u_strlen(text);
236     }
237     setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
238   }
239 }
240 
UnicodeString(UChar * buff,int32_t buffLength,int32_t buffCapacity)241 UnicodeString::UnicodeString(UChar *buff,
242                              int32_t buffLength,
243                              int32_t buffCapacity) {
244   fUnion.fFields.fLengthAndFlags = kWritableAlias;
245   if(buff == NULL) {
246     // treat as an empty string, do not alias
247     setToEmpty();
248   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
249     setToBogus();
250   } else {
251     if(buffLength == -1) {
252       // fLength = u_strlen(buff); but do not look beyond buffCapacity
253       const UChar *p = buff, *limit = buff + buffCapacity;
254       while(p != limit && *p != 0) {
255         ++p;
256       }
257       buffLength = (int32_t)(p - buff);
258     }
259     setArray(buff, buffLength, buffCapacity);
260   }
261 }
262 
UnicodeString(const char * src,int32_t length,EInvariant)263 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
264   fUnion.fFields.fLengthAndFlags = kShortString;
265   if(src==NULL) {
266     // treat as an empty string
267   } else {
268     if(length<0) {
269       length=(int32_t)uprv_strlen(src);
270     }
271     if(cloneArrayIfNeeded(length, length, FALSE)) {
272       u_charsToUChars(src, getArrayStart(), length);
273       setLength(length);
274     } else {
275       setToBogus();
276     }
277   }
278 }
279 
280 #if U_CHARSET_IS_UTF8
281 
UnicodeString(const char * codepageData)282 UnicodeString::UnicodeString(const char *codepageData) {
283   fUnion.fFields.fLengthAndFlags = kShortString;
284   if(codepageData != 0) {
285     setToUTF8(codepageData);
286   }
287 }
288 
UnicodeString(const char * codepageData,int32_t dataLength)289 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
290   fUnion.fFields.fLengthAndFlags = kShortString;
291   // if there's nothing to convert, do nothing
292   if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
293     return;
294   }
295   if(dataLength == -1) {
296     dataLength = (int32_t)uprv_strlen(codepageData);
297   }
298   setToUTF8(StringPiece(codepageData, dataLength));
299 }
300 
301 // else see unistr_cnv.cpp
302 #endif
303 
// Copy constructor: starts out as an empty short string, which gives
// copyFrom() a consistent object to overwrite.
UnicodeString::UnicodeString(const UnicodeString& that)
{
  fUnion.fFields.fLengthAndFlags = kShortString;
  copyFrom(that);
}
308 
309 #if U_HAVE_RVALUE_REFERENCES
UnicodeString(UnicodeString && src)310 UnicodeString::UnicodeString(UnicodeString &&src) U_NOEXCEPT {
311   fUnion.fFields.fLengthAndFlags = kShortString;
312   moveFrom(src);
313 }
314 #endif
315 
// Constructs a string from the part of that beginning at srcStart:
// starts out as an empty short string and delegates to setTo().
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart)
{
  fUnion.fFields.fLengthAndFlags = kShortString;
  setTo(that, srcStart);
}
321 
// Constructs a string from srcLength code units of that beginning at
// srcStart: starts out as an empty short string and delegates to setTo().
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart,
                             int32_t srcLength)
{
  fUnion.fFields.fLengthAndFlags = kShortString;
  setTo(that, srcStart, srcLength);
}
328 
329 // Replaceable base class clone() default implementation, does not clone
330 Replaceable *
clone() const331 Replaceable::clone() const {
332   return NULL;
333 }
334 
335 // UnicodeString overrides clone() with a real implementation
336 Replaceable *
clone() const337 UnicodeString::clone() const {
338   return new UnicodeString(*this);
339 }
340 
341 //========================================
342 // array allocation
343 //========================================
344 
345 UBool
allocate(int32_t capacity)346 UnicodeString::allocate(int32_t capacity) {
347   if(capacity <= US_STACKBUF_SIZE) {
348     fUnion.fFields.fLengthAndFlags = kShortString;
349   } else {
350     // count bytes for the refCounter and the string capacity, and
351     // round up to a multiple of 16; then divide by 4 and allocate int32_t's
352     // to be safely aligned for the refCount
353     // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
354     int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
355     int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
356     if(array != 0) {
357       // set initial refCount and point behind the refCount
358       *array++ = 1;
359 
360       // have fArray point to the first UChar
361       fUnion.fFields.fArray = (UChar *)array;
362       fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
363       fUnion.fFields.fLengthAndFlags = kLongString;
364     } else {
365       fUnion.fFields.fLengthAndFlags = kIsBogus;
366       fUnion.fFields.fArray = 0;
367       fUnion.fFields.fCapacity = 0;
368       return FALSE;
369     }
370   }
371   return TRUE;
372 }
373 
374 //========================================
375 // Destructor
376 //========================================
377 
378 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
379 static u_atomic_int32_t finalLengthCounts[0x400];  // UnicodeString::kMaxShortLength+1
380 static u_atomic_int32_t beyondCount(0);
381 
unistr_printLengths()382 U_CAPI void unistr_printLengths() {
383   int32_t i;
384   for(i = 0; i <= 59; ++i) {
385     printf("%2d,  %9d\n", i, (int32_t)finalLengthCounts[i]);
386   }
387   int32_t beyond = beyondCount;
388   for(; i < UPRV_LENGTHOF(finalLengthCounts); ++i) {
389     beyond += finalLengthCounts[i];
390   }
391   printf(">59, %9d\n", beyond);
392 }
393 #endif
394 
~UnicodeString()395 UnicodeString::~UnicodeString()
396 {
397 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
398   // Count lengths of strings at the end of their lifetime.
399   // Useful for discussion of a desirable stack buffer size.
400   // Count the contents length, not the optional NUL terminator nor further capacity.
401   // Ignore open-buffer strings and strings which alias external storage.
402   if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) {
403     if(hasShortLength()) {
404       umtx_atomic_inc(finalLengthCounts + getShortLength());
405     } else {
406       umtx_atomic_inc(&beyondCount);
407     }
408   }
409 #endif
410 
411   releaseArray();
412 }
413 
414 //========================================
415 // Factory methods
416 //========================================
417 
fromUTF8(const StringPiece & utf8)418 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
419   UnicodeString result;
420   result.setToUTF8(utf8);
421   return result;
422 }
423 
fromUTF32(const UChar32 * utf32,int32_t length)424 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
425   UnicodeString result;
426   int32_t capacity;
427   // Most UTF-32 strings will be BMP-only and result in a same-length
428   // UTF-16 string. We overestimate the capacity just slightly,
429   // just in case there are a few supplementary characters.
430   if(length <= US_STACKBUF_SIZE) {
431     capacity = US_STACKBUF_SIZE;
432   } else {
433     capacity = length + (length >> 4) + 4;
434   }
435   do {
436     UChar *utf16 = result.getBuffer(capacity);
437     int32_t length16;
438     UErrorCode errorCode = U_ZERO_ERROR;
439     u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
440         utf32, length,
441         0xfffd,  // Substitution character.
442         NULL,    // Don't care about number of substitutions.
443         &errorCode);
444     result.releaseBuffer(length16);
445     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
446       capacity = length16 + 1;  // +1 for the terminating NUL.
447       continue;
448     } else if(U_FAILURE(errorCode)) {
449       result.setToBogus();
450     }
451     break;
452   } while(TRUE);
453   return result;
454 }
455 
456 //========================================
457 // Assignment
458 //========================================
459 
460 UnicodeString &
operator =(const UnicodeString & src)461 UnicodeString::operator=(const UnicodeString &src) {
462   return copyFrom(src);
463 }
464 
465 UnicodeString &
fastCopyFrom(const UnicodeString & src)466 UnicodeString::fastCopyFrom(const UnicodeString &src) {
467   return copyFrom(src, TRUE);
468 }
469 
470 UnicodeString &
copyFrom(const UnicodeString & src,UBool fastCopy)471 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
472   // if assigning to ourselves, do nothing
473   if(this == &src) {
474     return *this;
475   }
476 
477   // is the right side bogus?
478   if(src.isBogus()) {
479     setToBogus();
480     return *this;
481   }
482 
483   // delete the current contents
484   releaseArray();
485 
486   if(src.isEmpty()) {
487     // empty string - use the stack buffer
488     setToEmpty();
489     return *this;
490   }
491 
492   // fLength>0 and not an "open" src.getBuffer(minCapacity)
493   fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
494   switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
495   case kShortString:
496     // short string using the stack buffer, do the same
497     uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
498                 getShortLength() * U_SIZEOF_UCHAR);
499     break;
500   case kLongString:
501     // src uses a refCounted string buffer, use that buffer with refCount
502     // src is const, use a cast - we don't actually change it
503     ((UnicodeString &)src).addRef();
504     // copy all fields, share the reference-counted buffer
505     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
506     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
507     if(!hasShortLength()) {
508       fUnion.fFields.fLength = src.fUnion.fFields.fLength;
509     }
510     break;
511   case kReadonlyAlias:
512     if(fastCopy) {
513       // src is a readonly alias, do the same
514       // -> maintain the readonly alias as such
515       fUnion.fFields.fArray = src.fUnion.fFields.fArray;
516       fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
517       if(!hasShortLength()) {
518         fUnion.fFields.fLength = src.fUnion.fFields.fLength;
519       }
520       break;
521     }
522     // else if(!fastCopy) fall through to case kWritableAlias
523     // -> allocate a new buffer and copy the contents
524   case kWritableAlias: {
525     // src is a writable alias; we make a copy of that instead
526     int32_t srcLength = src.length();
527     if(allocate(srcLength)) {
528       uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR);
529       setLength(srcLength);
530       break;
531     }
532     // if there is not enough memory, then fall through to setting to bogus
533   }
534   default:
535     // if src is bogus, set ourselves to bogus
536     // do not call setToBogus() here because fArray and flags are not consistent here
537     fUnion.fFields.fLengthAndFlags = kIsBogus;
538     fUnion.fFields.fArray = 0;
539     fUnion.fFields.fCapacity = 0;
540     break;
541   }
542 
543   return *this;
544 }
545 
moveFrom(UnicodeString & src)546 UnicodeString &UnicodeString::moveFrom(UnicodeString &src) U_NOEXCEPT {
547   // No explicit check for self move assignment, consistent with standard library.
548   // Self move assignment causes no crash nor leak but might make the object bogus.
549   releaseArray();
550   copyFieldsFrom(src, TRUE);
551   return *this;
552 }
553 
554 // Same as moveFrom() except without memory management.
copyFieldsFrom(UnicodeString & src,UBool setSrcToBogus)555 void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) U_NOEXCEPT {
556   int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
557   if(lengthAndFlags & kUsingStackBuffer) {
558     // Short string using the stack buffer, copy the contents.
559     // Check for self assignment to prevent "overlap in memcpy" warnings,
560     // although it should be harmless to copy a buffer to itself exactly.
561     if(this != &src) {
562       uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
563                   getShortLength() * U_SIZEOF_UCHAR);
564     }
565   } else {
566     // In all other cases, copy all fields.
567     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
568     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
569     if(!hasShortLength()) {
570       fUnion.fFields.fLength = src.fUnion.fFields.fLength;
571     }
572     if(setSrcToBogus) {
573       // Set src to bogus without releasing any memory.
574       src.fUnion.fFields.fLengthAndFlags = kIsBogus;
575       src.fUnion.fFields.fArray = NULL;
576       src.fUnion.fFields.fCapacity = 0;
577     }
578   }
579 }
580 
swap(UnicodeString & other)581 void UnicodeString::swap(UnicodeString &other) U_NOEXCEPT {
582   UnicodeString temp;  // Empty short string: Known not to need releaseArray().
583   // Copy fields without resetting source values in between.
584   temp.copyFieldsFrom(*this, FALSE);
585   this->copyFieldsFrom(other, FALSE);
586   other.copyFieldsFrom(temp, FALSE);
587   // Set temp to an empty string so that other's memory is not released twice.
588   temp.fUnion.fFields.fLengthAndFlags = kShortString;
589 }
590 
591 //========================================
592 // Miscellaneous operations
593 //========================================
594 
unescape() const595 UnicodeString UnicodeString::unescape() const {
596     UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
597     if (result.isBogus()) {
598         return result;
599     }
600     const UChar *array = getBuffer();
601     int32_t len = length();
602     int32_t prev = 0;
603     for (int32_t i=0;;) {
604         if (i == len) {
605             result.append(array, prev, len - prev);
606             break;
607         }
608         if (array[i++] == 0x5C /*'\\'*/) {
609             result.append(array, prev, (i - 1) - prev);
610             UChar32 c = unescapeAt(i); // advances i
611             if (c < 0) {
612                 result.remove(); // return empty string
613                 break; // invalid escape sequence
614             }
615             result.append(c);
616             prev = i;
617         }
618     }
619     return result;
620 }
621 
unescapeAt(int32_t & offset) const622 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
623     return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
624 }
625 
626 //========================================
627 // Read-only implementation
628 //========================================
629 UBool
doEquals(const UnicodeString & text,int32_t len) const630 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
631   // Requires: this & text not bogus and have same lengths.
632   // Byte-wise comparison works for equality regardless of endianness.
633   return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
634 }
635 
636 int8_t
doCompare(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength) const637 UnicodeString::doCompare( int32_t start,
638               int32_t length,
639               const UChar *srcChars,
640               int32_t srcStart,
641               int32_t srcLength) const
642 {
643   // compare illegal string values
644   if(isBogus()) {
645     return -1;
646   }
647 
648   // pin indices to legal values
649   pinIndices(start, length);
650 
651   if(srcChars == NULL) {
652     // treat const UChar *srcChars==NULL as an empty string
653     return length == 0 ? 0 : 1;
654   }
655 
656   // get the correct pointer
657   const UChar *chars = getArrayStart();
658 
659   chars += start;
660   srcChars += srcStart;
661 
662   int32_t minLength;
663   int8_t lengthResult;
664 
665   // get the srcLength if necessary
666   if(srcLength < 0) {
667     srcLength = u_strlen(srcChars + srcStart);
668   }
669 
670   // are we comparing different lengths?
671   if(length != srcLength) {
672     if(length < srcLength) {
673       minLength = length;
674       lengthResult = -1;
675     } else {
676       minLength = srcLength;
677       lengthResult = 1;
678     }
679   } else {
680     minLength = length;
681     lengthResult = 0;
682   }
683 
684   /*
685    * note that uprv_memcmp() returns an int but we return an int8_t;
686    * we need to take care not to truncate the result -
687    * one way to do this is to right-shift the value to
688    * move the sign bit into the lower 8 bits and making sure that this
689    * does not become 0 itself
690    */
691 
692   if(minLength > 0 && chars != srcChars) {
693     int32_t result;
694 
695 #   if U_IS_BIG_ENDIAN
696       // big-endian: byte comparison works
697       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
698       if(result != 0) {
699         return (int8_t)(result >> 15 | 1);
700       }
701 #   else
702       // little-endian: compare UChar units
703       do {
704         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
705         if(result != 0) {
706           return (int8_t)(result >> 15 | 1);
707         }
708       } while(--minLength > 0);
709 #   endif
710   }
711   return lengthResult;
712 }
713 
714 /* String compare in code point order - doCompare() compares in code unit order. */
715 int8_t
doCompareCodePointOrder(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength) const716 UnicodeString::doCompareCodePointOrder(int32_t start,
717                                        int32_t length,
718                                        const UChar *srcChars,
719                                        int32_t srcStart,
720                                        int32_t srcLength) const
721 {
722   // compare illegal string values
723   // treat const UChar *srcChars==NULL as an empty string
724   if(isBogus()) {
725     return -1;
726   }
727 
728   // pin indices to legal values
729   pinIndices(start, length);
730 
731   if(srcChars == NULL) {
732     srcStart = srcLength = 0;
733   }
734 
735   int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
736   /* translate the 32-bit result into an 8-bit one */
737   if(diff!=0) {
738     return (int8_t)(diff >> 15 | 1);
739   } else {
740     return 0;
741   }
742 }
743 
744 int32_t
getLength() const745 UnicodeString::getLength() const {
746     return length();
747 }
748 
749 UChar
getCharAt(int32_t offset) const750 UnicodeString::getCharAt(int32_t offset) const {
751   return charAt(offset);
752 }
753 
754 UChar32
getChar32At(int32_t offset) const755 UnicodeString::getChar32At(int32_t offset) const {
756   return char32At(offset);
757 }
758 
759 UChar32
char32At(int32_t offset) const760 UnicodeString::char32At(int32_t offset) const
761 {
762   int32_t len = length();
763   if((uint32_t)offset < (uint32_t)len) {
764     const UChar *array = getArrayStart();
765     UChar32 c;
766     U16_GET(array, 0, offset, len, c);
767     return c;
768   } else {
769     return kInvalidUChar;
770   }
771 }
772 
773 int32_t
getChar32Start(int32_t offset) const774 UnicodeString::getChar32Start(int32_t offset) const {
775   if((uint32_t)offset < (uint32_t)length()) {
776     const UChar *array = getArrayStart();
777     U16_SET_CP_START(array, 0, offset);
778     return offset;
779   } else {
780     return 0;
781   }
782 }
783 
784 int32_t
getChar32Limit(int32_t offset) const785 UnicodeString::getChar32Limit(int32_t offset) const {
786   int32_t len = length();
787   if((uint32_t)offset < (uint32_t)len) {
788     const UChar *array = getArrayStart();
789     U16_SET_CP_LIMIT(array, 0, offset, len);
790     return offset;
791   } else {
792     return len;
793   }
794 }
795 
796 int32_t
countChar32(int32_t start,int32_t length) const797 UnicodeString::countChar32(int32_t start, int32_t length) const {
798   pinIndices(start, length);
799   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
800   return u_countChar32(getArrayStart()+start, length);
801 }
802 
803 UBool
hasMoreChar32Than(int32_t start,int32_t length,int32_t number) const804 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
805   pinIndices(start, length);
806   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
807   return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
808 }
809 
810 int32_t
moveIndex32(int32_t index,int32_t delta) const811 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
812   // pin index
813   int32_t len = length();
814   if(index<0) {
815     index=0;
816   } else if(index>len) {
817     index=len;
818   }
819 
820   const UChar *array = getArrayStart();
821   if(delta>0) {
822     U16_FWD_N(array, index, len, delta);
823   } else {
824     U16_BACK_N(array, 0, index, -delta);
825   }
826 
827   return index;
828 }
829 
830 void
doExtract(int32_t start,int32_t length,UChar * dst,int32_t dstStart) const831 UnicodeString::doExtract(int32_t start,
832              int32_t length,
833              UChar *dst,
834              int32_t dstStart) const
835 {
836   // pin indices to legal values
837   pinIndices(start, length);
838 
839   // do not copy anything if we alias dst itself
840   const UChar *array = getArrayStart();
841   if(array + start != dst + dstStart) {
842     us_arrayCopy(array, start, dst, dstStart, length);
843   }
844 }
845 
846 int32_t
extract(UChar * dest,int32_t destCapacity,UErrorCode & errorCode) const847 UnicodeString::extract(UChar *dest, int32_t destCapacity,
848                        UErrorCode &errorCode) const {
849   int32_t len = length();
850   if(U_SUCCESS(errorCode)) {
851     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
852       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
853     } else {
854       const UChar *array = getArrayStart();
855       if(len>0 && len<=destCapacity && array!=dest) {
856         uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR);
857       }
858       return u_terminateUChars(dest, destCapacity, len, &errorCode);
859     }
860   }
861 
862   return len;
863 }
864 
865 int32_t
extract(int32_t start,int32_t length,char * target,int32_t targetCapacity,enum EInvariant) const866 UnicodeString::extract(int32_t start,
867                        int32_t length,
868                        char *target,
869                        int32_t targetCapacity,
870                        enum EInvariant) const
871 {
872   // if the arguments are illegal, then do nothing
873   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
874     return 0;
875   }
876 
877   // pin the indices to legal values
878   pinIndices(start, length);
879 
880   if(length <= targetCapacity) {
881     u_UCharsToChars(getArrayStart() + start, target, length);
882   }
883   UErrorCode status = U_ZERO_ERROR;
884   return u_terminateChars(target, targetCapacity, length, &status);
885 }
886 
887 UnicodeString
tempSubString(int32_t start,int32_t len) const888 UnicodeString::tempSubString(int32_t start, int32_t len) const {
889   pinIndices(start, len);
890   const UChar *array = getBuffer();  // not getArrayStart() to check kIsBogus & kOpenGetBuffer
891   if(array==NULL) {
892     array=fUnion.fStackFields.fBuffer;  // anything not NULL because that would make an empty string
893     len=-2;  // bogus result string
894   }
895   return UnicodeString(FALSE, array + start, len);
896 }
897 
898 int32_t
toUTF8(int32_t start,int32_t len,char * target,int32_t capacity) const899 UnicodeString::toUTF8(int32_t start, int32_t len,
900                       char *target, int32_t capacity) const {
901   pinIndices(start, len);
902   int32_t length8;
903   UErrorCode errorCode = U_ZERO_ERROR;
904   u_strToUTF8WithSub(target, capacity, &length8,
905                      getBuffer() + start, len,
906                      0xFFFD,  // Standard substitution character.
907                      NULL,    // Don't care about number of substitutions.
908                      &errorCode);
909   return length8;
910 }
911 
912 #if U_CHARSET_IS_UTF8
913 
914 int32_t
extract(int32_t start,int32_t len,char * target,uint32_t dstSize) const915 UnicodeString::extract(int32_t start, int32_t len,
916                        char *target, uint32_t dstSize) const {
917   // if the arguments are illegal, then do nothing
918   if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
919     return 0;
920   }
921   return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
922 }
923 
924 // else see unistr_cnv.cpp
925 #endif
926 
927 void
extractBetween(int32_t start,int32_t limit,UnicodeString & target) const928 UnicodeString::extractBetween(int32_t start,
929                   int32_t limit,
930                   UnicodeString& target) const {
931   pinIndex(start);
932   pinIndex(limit);
933   doExtract(start, limit - start, target);
934 }
935 
936 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
937 // as many bytes as the source has UChars.
938 // The "worst cases" are writing systems like Indic, Thai and CJK with
939 // 3:1 bytes:UChars.
940 void
toUTF8(ByteSink & sink) const941 UnicodeString::toUTF8(ByteSink &sink) const {
942   int32_t length16 = length();
943   if(length16 != 0) {
944     char stackBuffer[1024];
945     int32_t capacity = (int32_t)sizeof(stackBuffer);
946     UBool utf8IsOwned = FALSE;
947     char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
948                                       3*length16,
949                                       stackBuffer, capacity,
950                                       &capacity);
951     int32_t length8 = 0;
952     UErrorCode errorCode = U_ZERO_ERROR;
953     u_strToUTF8WithSub(utf8, capacity, &length8,
954                        getBuffer(), length16,
955                        0xFFFD,  // Standard substitution character.
956                        NULL,    // Don't care about number of substitutions.
957                        &errorCode);
958     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
959       utf8 = (char *)uprv_malloc(length8);
960       if(utf8 != NULL) {
961         utf8IsOwned = TRUE;
962         errorCode = U_ZERO_ERROR;
963         u_strToUTF8WithSub(utf8, length8, &length8,
964                            getBuffer(), length16,
965                            0xFFFD,  // Standard substitution character.
966                            NULL,    // Don't care about number of substitutions.
967                            &errorCode);
968       } else {
969         errorCode = U_MEMORY_ALLOCATION_ERROR;
970       }
971     }
972     if(U_SUCCESS(errorCode)) {
973       sink.Append(utf8, length8);
974       sink.Flush();
975     }
976     if(utf8IsOwned) {
977       uprv_free(utf8);
978     }
979   }
980 }
981 
982 int32_t
toUTF32(UChar32 * utf32,int32_t capacity,UErrorCode & errorCode) const983 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
984   int32_t length32=0;
985   if(U_SUCCESS(errorCode)) {
986     // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
987     u_strToUTF32WithSub(utf32, capacity, &length32,
988         getBuffer(), length(),
989         0xfffd,  // Substitution character.
990         NULL,    // Don't care about number of substitutions.
991         &errorCode);
992   }
993   return length32;
994 }
995 
996 int32_t
indexOf(const UChar * srcChars,int32_t srcStart,int32_t srcLength,int32_t start,int32_t length) const997 UnicodeString::indexOf(const UChar *srcChars,
998                int32_t srcStart,
999                int32_t srcLength,
1000                int32_t start,
1001                int32_t length) const
1002 {
1003   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1004     return -1;
1005   }
1006 
1007   // UnicodeString does not find empty substrings
1008   if(srcLength < 0 && srcChars[srcStart] == 0) {
1009     return -1;
1010   }
1011 
1012   // get the indices within bounds
1013   pinIndices(start, length);
1014 
1015   // find the first occurrence of the substring
1016   const UChar *array = getArrayStart();
1017   const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
1018   if(match == NULL) {
1019     return -1;
1020   } else {
1021     return (int32_t)(match - array);
1022   }
1023 }
1024 
1025 int32_t
doIndexOf(UChar c,int32_t start,int32_t length) const1026 UnicodeString::doIndexOf(UChar c,
1027              int32_t start,
1028              int32_t length) const
1029 {
1030   // pin indices
1031   pinIndices(start, length);
1032 
1033   // find the first occurrence of c
1034   const UChar *array = getArrayStart();
1035   const UChar *match = u_memchr(array + start, c, length);
1036   if(match == NULL) {
1037     return -1;
1038   } else {
1039     return (int32_t)(match - array);
1040   }
1041 }
1042 
1043 int32_t
doIndexOf(UChar32 c,int32_t start,int32_t length) const1044 UnicodeString::doIndexOf(UChar32 c,
1045                          int32_t start,
1046                          int32_t length) const {
1047   // pin indices
1048   pinIndices(start, length);
1049 
1050   // find the first occurrence of c
1051   const UChar *array = getArrayStart();
1052   const UChar *match = u_memchr32(array + start, c, length);
1053   if(match == NULL) {
1054     return -1;
1055   } else {
1056     return (int32_t)(match - array);
1057   }
1058 }
1059 
1060 int32_t
lastIndexOf(const UChar * srcChars,int32_t srcStart,int32_t srcLength,int32_t start,int32_t length) const1061 UnicodeString::lastIndexOf(const UChar *srcChars,
1062                int32_t srcStart,
1063                int32_t srcLength,
1064                int32_t start,
1065                int32_t length) const
1066 {
1067   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1068     return -1;
1069   }
1070 
1071   // UnicodeString does not find empty substrings
1072   if(srcLength < 0 && srcChars[srcStart] == 0) {
1073     return -1;
1074   }
1075 
1076   // get the indices within bounds
1077   pinIndices(start, length);
1078 
1079   // find the last occurrence of the substring
1080   const UChar *array = getArrayStart();
1081   const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
1082   if(match == NULL) {
1083     return -1;
1084   } else {
1085     return (int32_t)(match - array);
1086   }
1087 }
1088 
1089 int32_t
doLastIndexOf(UChar c,int32_t start,int32_t length) const1090 UnicodeString::doLastIndexOf(UChar c,
1091                  int32_t start,
1092                  int32_t length) const
1093 {
1094   if(isBogus()) {
1095     return -1;
1096   }
1097 
1098   // pin indices
1099   pinIndices(start, length);
1100 
1101   // find the last occurrence of c
1102   const UChar *array = getArrayStart();
1103   const UChar *match = u_memrchr(array + start, c, length);
1104   if(match == NULL) {
1105     return -1;
1106   } else {
1107     return (int32_t)(match - array);
1108   }
1109 }
1110 
1111 int32_t
doLastIndexOf(UChar32 c,int32_t start,int32_t length) const1112 UnicodeString::doLastIndexOf(UChar32 c,
1113                              int32_t start,
1114                              int32_t length) const {
1115   // pin indices
1116   pinIndices(start, length);
1117 
1118   // find the last occurrence of c
1119   const UChar *array = getArrayStart();
1120   const UChar *match = u_memrchr32(array + start, c, length);
1121   if(match == NULL) {
1122     return -1;
1123   } else {
1124     return (int32_t)(match - array);
1125   }
1126 }
1127 
1128 //========================================
1129 // Write implementation
1130 //========================================
1131 
1132 UnicodeString&
findAndReplace(int32_t start,int32_t length,const UnicodeString & oldText,int32_t oldStart,int32_t oldLength,const UnicodeString & newText,int32_t newStart,int32_t newLength)1133 UnicodeString::findAndReplace(int32_t start,
1134                   int32_t length,
1135                   const UnicodeString& oldText,
1136                   int32_t oldStart,
1137                   int32_t oldLength,
1138                   const UnicodeString& newText,
1139                   int32_t newStart,
1140                   int32_t newLength)
1141 {
1142   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1143     return *this;
1144   }
1145 
1146   pinIndices(start, length);
1147   oldText.pinIndices(oldStart, oldLength);
1148   newText.pinIndices(newStart, newLength);
1149 
1150   if(oldLength == 0) {
1151     return *this;
1152   }
1153 
1154   while(length > 0 && length >= oldLength) {
1155     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1156     if(pos < 0) {
1157       // no more oldText's here: done
1158       break;
1159     } else {
1160       // we found oldText, replace it by newText and go beyond it
1161       replace(pos, oldLength, newText, newStart, newLength);
1162       length -= pos + oldLength - start;
1163       start = pos + newLength;
1164     }
1165   }
1166 
1167   return *this;
1168 }
1169 
1170 
1171 void
setToBogus()1172 UnicodeString::setToBogus()
1173 {
1174   releaseArray();
1175 
1176   fUnion.fFields.fLengthAndFlags = kIsBogus;
1177   fUnion.fFields.fArray = 0;
1178   fUnion.fFields.fCapacity = 0;
1179 }
1180 
1181 // turn a bogus string into an empty one
1182 void
unBogus()1183 UnicodeString::unBogus() {
1184   if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
1185     setToEmpty();
1186   }
1187 }
1188 
1189 const UChar *
getTerminatedBuffer()1190 UnicodeString::getTerminatedBuffer() {
1191   if(!isWritable()) {
1192     return 0;
1193   }
1194   UChar *array = getArrayStart();
1195   int32_t len = length();
1196   if(len < getCapacity()) {
1197     if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
1198       // If len<capacity on a read-only alias, then array[len] is
1199       // either the original NUL (if constructed with (TRUE, s, length))
1200       // or one of the original string contents characters (if later truncated),
1201       // therefore we can assume that array[len] is initialized memory.
1202       if(array[len] == 0) {
1203         return array;
1204       }
1205     } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
1206       // kRefCounted: Do not write the NUL if the buffer is shared.
1207       // That is mostly safe, except when the length of one copy was modified
1208       // without copy-on-write, e.g., via truncate(newLength) or remove(void).
1209       // Then the NUL would be written into the middle of another copy's string.
1210 
1211       // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
1212       // Do not test if there is a NUL already because it might be uninitialized memory.
1213       // (That would be safe, but tools like valgrind & Purify would complain.)
1214       array[len] = 0;
1215       return array;
1216     }
1217   }
1218   if(cloneArrayIfNeeded(len+1)) {
1219     array = getArrayStart();
1220     array[len] = 0;
1221     return array;
1222   } else {
1223     return NULL;
1224   }
1225 }
1226 
1227 // setTo() analogous to the readonly-aliasing constructor with the same signature
1228 UnicodeString &
setTo(UBool isTerminated,const UChar * text,int32_t textLength)1229 UnicodeString::setTo(UBool isTerminated,
1230                      const UChar *text,
1231                      int32_t textLength)
1232 {
1233   if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1234     // do not modify a string that has an "open" getBuffer(minCapacity)
1235     return *this;
1236   }
1237 
1238   if(text == NULL) {
1239     // treat as an empty string, do not alias
1240     releaseArray();
1241     setToEmpty();
1242     return *this;
1243   }
1244 
1245   if( textLength < -1 ||
1246       (textLength == -1 && !isTerminated) ||
1247       (textLength >= 0 && isTerminated && text[textLength] != 0)
1248   ) {
1249     setToBogus();
1250     return *this;
1251   }
1252 
1253   releaseArray();
1254 
1255   if(textLength == -1) {
1256     // text is terminated, or else it would have failed the above test
1257     textLength = u_strlen(text);
1258   }
1259   fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
1260   setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1261   return *this;
1262 }
1263 
1264 // setTo() analogous to the writable-aliasing constructor with the same signature
1265 UnicodeString &
setTo(UChar * buffer,int32_t buffLength,int32_t buffCapacity)1266 UnicodeString::setTo(UChar *buffer,
1267                      int32_t buffLength,
1268                      int32_t buffCapacity) {
1269   if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1270     // do not modify a string that has an "open" getBuffer(minCapacity)
1271     return *this;
1272   }
1273 
1274   if(buffer == NULL) {
1275     // treat as an empty string, do not alias
1276     releaseArray();
1277     setToEmpty();
1278     return *this;
1279   }
1280 
1281   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1282     setToBogus();
1283     return *this;
1284   } else if(buffLength == -1) {
1285     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1286     const UChar *p = buffer, *limit = buffer + buffCapacity;
1287     while(p != limit && *p != 0) {
1288       ++p;
1289     }
1290     buffLength = (int32_t)(p - buffer);
1291   }
1292 
1293   releaseArray();
1294 
1295   fUnion.fFields.fLengthAndFlags = kWritableAlias;
1296   setArray(buffer, buffLength, buffCapacity);
1297   return *this;
1298 }
1299 
setToUTF8(const StringPiece & utf8)1300 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) {
1301   unBogus();
1302   int32_t length = utf8.length();
1303   int32_t capacity;
1304   // The UTF-16 string will be at most as long as the UTF-8 string.
1305   if(length <= US_STACKBUF_SIZE) {
1306     capacity = US_STACKBUF_SIZE;
1307   } else {
1308     capacity = length + 1;  // +1 for the terminating NUL.
1309   }
1310   UChar *utf16 = getBuffer(capacity);
1311   int32_t length16;
1312   UErrorCode errorCode = U_ZERO_ERROR;
1313   u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1314       utf8.data(), length,
1315       0xfffd,  // Substitution character.
1316       NULL,    // Don't care about number of substitutions.
1317       &errorCode);
1318   releaseBuffer(length16);
1319   if(U_FAILURE(errorCode)) {
1320     setToBogus();
1321   }
1322   return *this;
1323 }
1324 
1325 UnicodeString&
setCharAt(int32_t offset,UChar c)1326 UnicodeString::setCharAt(int32_t offset,
1327              UChar c)
1328 {
1329   int32_t len = length();
1330   if(cloneArrayIfNeeded() && len > 0) {
1331     if(offset < 0) {
1332       offset = 0;
1333     } else if(offset >= len) {
1334       offset = len - 1;
1335     }
1336 
1337     getArrayStart()[offset] = c;
1338   }
1339   return *this;
1340 }
1341 
1342 UnicodeString&
replace(int32_t start,int32_t _length,UChar32 srcChar)1343 UnicodeString::replace(int32_t start,
1344                int32_t _length,
1345                UChar32 srcChar) {
1346   UChar buffer[U16_MAX_LENGTH];
1347   int32_t count = 0;
1348   UBool isError = FALSE;
1349   U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1350   // We test isError so that the compiler does not complain that we don't.
1351   // If isError (srcChar is not a valid code point) then count==0 which means
1352   // we remove the source segment rather than replacing it with srcChar.
1353   return doReplace(start, _length, buffer, 0, isError ? 0 : count);
1354 }
1355 
1356 UnicodeString&
append(UChar32 srcChar)1357 UnicodeString::append(UChar32 srcChar) {
1358   UChar buffer[U16_MAX_LENGTH];
1359   int32_t _length = 0;
1360   UBool isError = FALSE;
1361   U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1362   // We test isError so that the compiler does not complain that we don't.
1363   // If isError then _length==0 which turns the doAppend() into a no-op anyway.
1364   return isError ? *this : doAppend(buffer, 0, _length);
1365 }
1366 
1367 UnicodeString&
doReplace(int32_t start,int32_t length,const UnicodeString & src,int32_t srcStart,int32_t srcLength)1368 UnicodeString::doReplace( int32_t start,
1369               int32_t length,
1370               const UnicodeString& src,
1371               int32_t srcStart,
1372               int32_t srcLength)
1373 {
1374   // pin the indices to legal values
1375   src.pinIndices(srcStart, srcLength);
1376 
1377   // get the characters from src
1378   // and replace the range in ourselves with them
1379   return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1380 }
1381 
1382 UnicodeString&
doReplace(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength)1383 UnicodeString::doReplace(int32_t start,
1384              int32_t length,
1385              const UChar *srcChars,
1386              int32_t srcStart,
1387              int32_t srcLength)
1388 {
1389   if(!isWritable()) {
1390     return *this;
1391   }
1392 
1393   int32_t oldLength = this->length();
1394 
1395   // optimize (read-only alias).remove(0, start) and .remove(start, end)
1396   if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
1397     if(start == 0) {
1398       // remove prefix by adjusting the array pointer
1399       pinIndex(length);
1400       fUnion.fFields.fArray += length;
1401       fUnion.fFields.fCapacity -= length;
1402       setLength(oldLength - length);
1403       return *this;
1404     } else {
1405       pinIndex(start);
1406       if(length >= (oldLength - start)) {
1407         // remove suffix by reducing the length (like truncate())
1408         setLength(start);
1409         fUnion.fFields.fCapacity = start;  // not NUL-terminated any more
1410         return *this;
1411       }
1412     }
1413   }
1414 
1415   if(start == oldLength) {
1416     return doAppend(srcChars, srcStart, srcLength);
1417   }
1418 
1419   if(srcChars == 0) {
1420     srcStart = srcLength = 0;
1421   } else if(srcLength < 0) {
1422     // get the srcLength if necessary
1423     srcLength = u_strlen(srcChars + srcStart);
1424   }
1425 
1426   // pin the indices to legal values
1427   pinIndices(start, length);
1428 
1429   // calculate the size of the string after the replace
1430   int32_t newLength = oldLength - length + srcLength;
1431 
1432   // cloneArrayIfNeeded(doCopyArray=FALSE) may change fArray but will not copy the current contents;
1433   // therefore we need to keep the current fArray
1434   UChar oldStackBuffer[US_STACKBUF_SIZE];
1435   UChar *oldArray;
1436   if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
1437     // copy the stack buffer contents because it will be overwritten with
1438     // fUnion.fFields values
1439     u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength);
1440     oldArray = oldStackBuffer;
1441   } else {
1442     oldArray = getArrayStart();
1443   }
1444 
1445   // clone our array and allocate a bigger array if needed
1446   int32_t *bufferToDelete = 0;
1447   if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize,
1448                          FALSE, &bufferToDelete)
1449   ) {
1450     return *this;
1451   }
1452 
1453   // now do the replace
1454 
1455   UChar *newArray = getArrayStart();
1456   if(newArray != oldArray) {
1457     // if fArray changed, then we need to copy everything except what will change
1458     us_arrayCopy(oldArray, 0, newArray, 0, start);
1459     us_arrayCopy(oldArray, start + length,
1460                  newArray, start + srcLength,
1461                  oldLength - (start + length));
1462   } else if(length != srcLength) {
1463     // fArray did not change; copy only the portion that isn't changing, leaving a hole
1464     us_arrayCopy(oldArray, start + length,
1465                  newArray, start + srcLength,
1466                  oldLength - (start + length));
1467   }
1468 
1469   // now fill in the hole with the new string
1470   us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
1471 
1472   setLength(newLength);
1473 
1474   // delayed delete in case srcChars == fArray when we started, and
1475   // to keep oldArray alive for the above operations
1476   if (bufferToDelete) {
1477     uprv_free(bufferToDelete);
1478   }
1479 
1480   return *this;
1481 }
1482 
1483 // Versions of doReplace() only for append() variants.
1484 // doReplace() and doAppend() optimize for different cases.
1485 
1486 UnicodeString&
doAppend(const UnicodeString & src,int32_t srcStart,int32_t srcLength)1487 UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) {
1488   if(srcLength == 0) {
1489     return *this;
1490   }
1491 
1492   // pin the indices to legal values
1493   src.pinIndices(srcStart, srcLength);
1494   return doAppend(src.getArrayStart(), srcStart, srcLength);
1495 }
1496 
1497 UnicodeString&
doAppend(const UChar * srcChars,int32_t srcStart,int32_t srcLength)1498 UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLength) {
1499   if(!isWritable() || srcLength == 0 || srcChars == NULL) {
1500     return *this;
1501   }
1502 
1503   if(srcLength < 0) {
1504     // get the srcLength if necessary
1505     if((srcLength = u_strlen(srcChars + srcStart)) == 0) {
1506       return *this;
1507     }
1508   }
1509 
1510   int32_t oldLength = length();
1511   int32_t newLength = oldLength + srcLength;
1512   // optimize append() onto a large-enough, owned string
1513   if((newLength <= getCapacity() && isBufferWritable()) ||
1514       cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize)) {
1515     UChar *newArray = getArrayStart();
1516     // Do not copy characters when
1517     //   UChar *buffer=str.getAppendBuffer(...);
1518     // is followed by
1519     //   str.append(buffer, length);
1520     // or
1521     //   str.appendString(buffer, length)
1522     // or similar.
1523     if(srcChars + srcStart != newArray + oldLength) {
1524       us_arrayCopy(srcChars, srcStart, newArray, oldLength, srcLength);
1525     }
1526     setLength(newLength);
1527   }
1528   return *this;
1529 }
1530 
1531 /**
1532  * Replaceable API
1533  */
1534 void
handleReplaceBetween(int32_t start,int32_t limit,const UnicodeString & text)1535 UnicodeString::handleReplaceBetween(int32_t start,
1536                                     int32_t limit,
1537                                     const UnicodeString& text) {
1538     replaceBetween(start, limit, text);
1539 }
1540 
1541 /**
1542  * Replaceable API
1543  */
1544 void
copy(int32_t start,int32_t limit,int32_t dest)1545 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1546     if (limit <= start) {
1547         return; // Nothing to do; avoid bogus malloc call
1548     }
1549     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1550     // Check to make sure text is not null.
1551     if (text != NULL) {
1552 	    extractBetween(start, limit, text, 0);
1553 	    insert(dest, text, 0, limit - start);
1554 	    uprv_free(text);
1555     }
1556 }
1557 
1558 /**
1559  * Replaceable API
1560  *
1561  * NOTE: This is for the Replaceable class.  There is no rep.cpp,
1562  * so we implement this function here.
1563  */
hasMetaData() const1564 UBool Replaceable::hasMetaData() const {
1565     return TRUE;
1566 }
1567 
1568 /**
1569  * Replaceable API
1570  */
hasMetaData() const1571 UBool UnicodeString::hasMetaData() const {
1572     return FALSE;
1573 }
1574 
1575 UnicodeString&
doReverse(int32_t start,int32_t length)1576 UnicodeString::doReverse(int32_t start, int32_t length) {
1577   if(length <= 1 || !cloneArrayIfNeeded()) {
1578     return *this;
1579   }
1580 
1581   // pin the indices to legal values
1582   pinIndices(start, length);
1583   if(length <= 1) {  // pinIndices() might have shrunk the length
1584     return *this;
1585   }
1586 
1587   UChar *left = getArrayStart() + start;
1588   UChar *right = left + length - 1;  // -1 for inclusive boundary (length>=2)
1589   UChar swap;
1590   UBool hasSupplementary = FALSE;
1591 
1592   // Before the loop we know left<right because length>=2.
1593   do {
1594     hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
1595     hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
1596     *right-- = swap;
1597   } while(left < right);
1598   // Make sure to test the middle code unit of an odd-length string.
1599   // Redundant if the length is even.
1600   hasSupplementary |= (UBool)U16_IS_LEAD(*left);
1601 
1602   /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1603   if(hasSupplementary) {
1604     UChar swap2;
1605 
1606     left = getArrayStart() + start;
1607     right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
1608     while(left < right) {
1609       if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
1610         *left++ = swap2;
1611         *left++ = swap;
1612       } else {
1613         ++left;
1614       }
1615     }
1616   }
1617 
1618   return *this;
1619 }
1620 
1621 UBool
padLeading(int32_t targetLength,UChar padChar)1622 UnicodeString::padLeading(int32_t targetLength,
1623                           UChar padChar)
1624 {
1625   int32_t oldLength = length();
1626   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1627     return FALSE;
1628   } else {
1629     // move contents up by padding width
1630     UChar *array = getArrayStart();
1631     int32_t start = targetLength - oldLength;
1632     us_arrayCopy(array, 0, array, start, oldLength);
1633 
1634     // fill in padding character
1635     while(--start >= 0) {
1636       array[start] = padChar;
1637     }
1638     setLength(targetLength);
1639     return TRUE;
1640   }
1641 }
1642 
1643 UBool
padTrailing(int32_t targetLength,UChar padChar)1644 UnicodeString::padTrailing(int32_t targetLength,
1645                            UChar padChar)
1646 {
1647   int32_t oldLength = length();
1648   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1649     return FALSE;
1650   } else {
1651     // fill in padding character
1652     UChar *array = getArrayStart();
1653     int32_t length = targetLength;
1654     while(--length >= oldLength) {
1655       array[length] = padChar;
1656     }
1657     setLength(targetLength);
1658     return TRUE;
1659   }
1660 }
1661 
1662 //========================================
1663 // Hashing
1664 //========================================
1665 int32_t
doHashCode() const1666 UnicodeString::doHashCode() const
1667 {
1668     /* Delegate hash computation to uhash.  This makes UnicodeString
1669      * hashing consistent with UChar* hashing.  */
1670     int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1671     if (hashCode == kInvalidHashCode) {
1672         hashCode = kEmptyHashCode;
1673     }
1674     return hashCode;
1675 }
1676 
1677 //========================================
1678 // External Buffer
1679 //========================================
1680 
1681 UChar *
getBuffer(int32_t minCapacity)1682 UnicodeString::getBuffer(int32_t minCapacity) {
1683   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1684     fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
1685     setZeroLength();
1686     return getArrayStart();
1687   } else {
1688     return 0;
1689   }
1690 }
1691 
1692 void
releaseBuffer(int32_t newLength)1693 UnicodeString::releaseBuffer(int32_t newLength) {
1694   if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
1695     // set the new fLength
1696     int32_t capacity=getCapacity();
1697     if(newLength==-1) {
1698       // the new length is the string length, capped by fCapacity
1699       const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1700       while(p<limit && *p!=0) {
1701         ++p;
1702       }
1703       newLength=(int32_t)(p-array);
1704     } else if(newLength>capacity) {
1705       newLength=capacity;
1706     }
1707     setLength(newLength);
1708     fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
1709   }
1710 }
1711 
1712 //========================================
1713 // Miscellaneous
1714 //========================================
1715 UBool
cloneArrayIfNeeded(int32_t newCapacity,int32_t growCapacity,UBool doCopyArray,int32_t ** pBufferToDelete,UBool forceClone)1716 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
1717                                   int32_t growCapacity,
1718                                   UBool doCopyArray,
1719                                   int32_t **pBufferToDelete,
1720                                   UBool forceClone) {
1721   // default parameters need to be static, therefore
1722   // the defaults are -1 to have convenience defaults
1723   if(newCapacity == -1) {
1724     newCapacity = getCapacity();
1725   }
1726 
1727   // while a getBuffer(minCapacity) is "open",
1728   // prevent any modifications of the string by returning FALSE here
1729   // if the string is bogus, then only an assignment or similar can revive it
1730   if(!isWritable()) {
1731     return FALSE;
1732   }
1733 
1734   /*
1735    * We need to make a copy of the array if
1736    * the buffer is read-only, or
1737    * the buffer is refCounted (shared), and refCount>1, or
1738    * the buffer is too small.
1739    * Return FALSE if memory could not be allocated.
1740    */
1741   if(forceClone ||
1742      fUnion.fFields.fLengthAndFlags & kBufferIsReadonly ||
1743      (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) ||
1744      newCapacity > getCapacity()
1745   ) {
1746     // check growCapacity for default value and use of the stack buffer
1747     if(growCapacity < 0) {
1748       growCapacity = newCapacity;
1749     } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
1750       growCapacity = US_STACKBUF_SIZE;
1751     }
1752 
1753     // save old values
1754     UChar oldStackBuffer[US_STACKBUF_SIZE];
1755     UChar *oldArray;
1756     int32_t oldLength = length();
1757     int16_t flags = fUnion.fFields.fLengthAndFlags;
1758 
1759     if(flags&kUsingStackBuffer) {
1760       U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */
1761       if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
1762         // copy the stack buffer contents because it will be overwritten with
1763         // fUnion.fFields values
1764         us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength);
1765         oldArray = oldStackBuffer;
1766       } else {
1767         oldArray = NULL; // no need to copy from the stack buffer to itself
1768       }
1769     } else {
1770       oldArray = fUnion.fFields.fArray;
1771       U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
1772     }
1773 
1774     // allocate a new array
1775     if(allocate(growCapacity) ||
1776        (newCapacity < growCapacity && allocate(newCapacity))
1777     ) {
1778       if(doCopyArray) {
1779         // copy the contents
1780         // do not copy more than what fits - it may be smaller than before
1781         int32_t minLength = oldLength;
1782         newCapacity = getCapacity();
1783         if(newCapacity < minLength) {
1784           minLength = newCapacity;
1785         }
1786         if(oldArray != NULL) {
1787           us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
1788         }
1789         setLength(minLength);
1790       } else {
1791         setZeroLength();
1792       }
1793 
1794       // release the old array
1795       if(flags & kRefCounted) {
1796         // the array is refCounted; decrement and release if 0
1797         u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
1798         if(umtx_atomic_dec(pRefCount) == 0) {
1799           if(pBufferToDelete == 0) {
1800               // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
1801               // is defined as volatile. (Volatile has useful non-standard behavior
1802               //   with this compiler.)
1803             uprv_free((void *)pRefCount);
1804           } else {
1805             // the caller requested to delete it himself
1806             *pBufferToDelete = (int32_t *)pRefCount;
1807           }
1808         }
1809       }
1810     } else {
1811       // not enough memory for growCapacity and not even for the smaller newCapacity
1812       // reset the old values for setToBogus() to release the array
1813       if(!(flags&kUsingStackBuffer)) {
1814         fUnion.fFields.fArray = oldArray;
1815       }
1816       fUnion.fFields.fLengthAndFlags = flags;
1817       setToBogus();
1818       return FALSE;
1819     }
1820   }
1821   return TRUE;
1822 }
1823 
1824 // UnicodeStringAppendable ------------------------------------------------- ***
1825 
~UnicodeStringAppendable()1826 UnicodeStringAppendable::~UnicodeStringAppendable() {}
1827 
1828 UBool
appendCodeUnit(UChar c)1829 UnicodeStringAppendable::appendCodeUnit(UChar c) {
1830   return str.doAppend(&c, 0, 1).isWritable();
1831 }
1832 
1833 UBool
appendCodePoint(UChar32 c)1834 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1835   UChar buffer[U16_MAX_LENGTH];
1836   int32_t cLength = 0;
1837   UBool isError = FALSE;
1838   U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1839   return !isError && str.doAppend(buffer, 0, cLength).isWritable();
1840 }
1841 
1842 UBool
appendString(const UChar * s,int32_t length)1843 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1844   return str.doAppend(s, 0, length).isWritable();
1845 }
1846 
1847 UBool
reserveAppendCapacity(int32_t appendCapacity)1848 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1849   return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1850 }
1851 
1852 UChar *
getAppendBuffer(int32_t minCapacity,int32_t desiredCapacityHint,UChar * scratch,int32_t scratchCapacity,int32_t * resultCapacity)1853 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1854                                          int32_t desiredCapacityHint,
1855                                          UChar *scratch, int32_t scratchCapacity,
1856                                          int32_t *resultCapacity) {
1857   if(minCapacity < 1 || scratchCapacity < minCapacity) {
1858     *resultCapacity = 0;
1859     return NULL;
1860   }
1861   int32_t oldLength = str.length();
1862   if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1863     *resultCapacity = str.getCapacity() - oldLength;
1864     return str.getArrayStart() + oldLength;
1865   }
1866   *resultCapacity = scratchCapacity;
1867   return scratch;
1868 }
1869 
1870 U_NAMESPACE_END
1871 
1872 U_NAMESPACE_USE
1873 
1874 U_CAPI int32_t U_EXPORT2
uhash_hashUnicodeString(const UElement key)1875 uhash_hashUnicodeString(const UElement key) {
1876     const UnicodeString *str = (const UnicodeString*) key.pointer;
1877     return (str == NULL) ? 0 : str->hashCode();
1878 }
1879 
1880 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1881 // does not depend on hashtable code.
1882 U_CAPI UBool U_EXPORT2
uhash_compareUnicodeString(const UElement key1,const UElement key2)1883 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1884     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1885     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1886     if (str1 == str2) {
1887         return TRUE;
1888     }
1889     if (str1 == NULL || str2 == NULL) {
1890         return FALSE;
1891     }
1892     return *str1 == *str2;
1893 }
1894 
#ifdef U_STATIC_IMPLEMENTATION
/*
 * Not intended to be called, ever.
 *
 * The function exists only so that the virtual vector deleting destructor
 * gets defined inside unistr.cpp. UObject already provides that destructor,
 * but defining it here ensures it is included with this object file, which
 * keeps static library dependencies to a minimum.
 */
static void uprv_UnicodeStringDummy(void) {
    UnicodeString *pair = new UnicodeString[2];
    delete [] pair;
}
#endif
1907