1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 1999-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  unistr_cnv.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:2
14 *
15 *   created on: 2004aug19
16 *   created by: Markus W. Scherer
17 *
18 *   Character conversion functions moved here from unistr.cpp
19 */
20 
21 #include "unicode/utypes.h"
22 
23 #if !UCONFIG_NO_CONVERSION
24 
25 #include "unicode/putil.h"
26 #include "cstring.h"
27 #include "cmemory.h"
28 #include "unicode/ustring.h"
29 #include "unicode/unistr.h"
30 #include "unicode/ucnv.h"
31 #include "ucnv_imp.h"
32 #include "putilimp.h"
33 #include "ustr_cnv.h"
34 #include "ustr_imp.h"
35 
36 U_NAMESPACE_BEGIN
37 
38 //========================================
39 // Constructors
40 //========================================
41 
42 #if !U_CHARSET_IS_UTF8
43 
UnicodeString(const char * codepageData)44 UnicodeString::UnicodeString(const char *codepageData) {
45     fUnion.fFields.fLengthAndFlags = kShortString;
46     if(codepageData != 0) {
47         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
48     }
49 }
50 
UnicodeString(const char * codepageData,int32_t dataLength)51 UnicodeString::UnicodeString(const char *codepageData,
52                              int32_t dataLength) {
53     fUnion.fFields.fLengthAndFlags = kShortString;
54     if(codepageData != 0) {
55         doCodepageCreate(codepageData, dataLength, 0);
56     }
57 }
58 
59 // else see unistr.cpp
60 #endif
61 
UnicodeString(const char * codepageData,const char * codepage)62 UnicodeString::UnicodeString(const char *codepageData,
63                              const char *codepage) {
64     fUnion.fFields.fLengthAndFlags = kShortString;
65     if(codepageData != 0) {
66         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
67     }
68 }
69 
UnicodeString(const char * codepageData,int32_t dataLength,const char * codepage)70 UnicodeString::UnicodeString(const char *codepageData,
71                              int32_t dataLength,
72                              const char *codepage) {
73     fUnion.fFields.fLengthAndFlags = kShortString;
74     if(codepageData != 0) {
75         doCodepageCreate(codepageData, dataLength, codepage);
76     }
77 }
78 
UnicodeString(const char * src,int32_t srcLength,UConverter * cnv,UErrorCode & errorCode)79 UnicodeString::UnicodeString(const char *src, int32_t srcLength,
80                              UConverter *cnv,
81                              UErrorCode &errorCode) {
82     fUnion.fFields.fLengthAndFlags = kShortString;
83     if(U_SUCCESS(errorCode)) {
84         // check arguments
85         if(src==NULL) {
86             // treat as an empty string, do nothing more
87         } else if(srcLength<-1) {
88             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
89         } else {
90             // get input length
91             if(srcLength==-1) {
92                 srcLength=(int32_t)uprv_strlen(src);
93             }
94             if(srcLength>0) {
95                 if(cnv!=0) {
96                     // use the provided converter
97                     ucnv_resetToUnicode(cnv);
98                     doCodepageCreate(src, srcLength, cnv, errorCode);
99                 } else {
100                     // use the default converter
101                     cnv=u_getDefaultConverter(&errorCode);
102                     doCodepageCreate(src, srcLength, cnv, errorCode);
103                     u_releaseDefaultConverter(cnv);
104                 }
105             }
106         }
107 
108         if(U_FAILURE(errorCode)) {
109             setToBogus();
110         }
111     }
112 }
113 
114 //========================================
115 // Codeset conversion
116 //========================================
117 
118 #if !U_CHARSET_IS_UTF8
119 
120 int32_t
extract(int32_t start,int32_t length,char * target,uint32_t dstSize) const121 UnicodeString::extract(int32_t start,
122                        int32_t length,
123                        char *target,
124                        uint32_t dstSize) const {
125     return extract(start, length, target, dstSize, 0);
126 }
127 
128 // else see unistr.cpp
129 #endif
130 
131 int32_t
extract(int32_t start,int32_t length,char * target,uint32_t dstSize,const char * codepage) const132 UnicodeString::extract(int32_t start,
133                        int32_t length,
134                        char *target,
135                        uint32_t dstSize,
136                        const char *codepage) const
137 {
138     // if the arguments are illegal, then do nothing
139     if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
140         return 0;
141     }
142 
143     // pin the indices to legal values
144     pinIndices(start, length);
145 
146     // We need to cast dstSize to int32_t for all subsequent code.
147     // I don't know why the API was defined with uint32_t but we are stuck with it.
148     // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
149     // as a limit in some functions, it may wrap around and yield a pointer
150     // that compares less-than target.
151     int32_t capacity;
152     if(dstSize < 0x7fffffff) {
153         // Assume that the capacity is real and a limit pointer won't wrap around.
154         capacity = (int32_t)dstSize;
155     } else {
156         // Pin the capacity so that a limit pointer does not wrap around.
157         char *targetLimit = (char *)U_MAX_PTR(target);
158         // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
159         // greater than target and does not wrap around the top of the address space.
160         capacity = (int32_t)(targetLimit - target);
161     }
162 
163     // create the converter
164     UConverter *converter;
165     UErrorCode status = U_ZERO_ERROR;
166 
167     // just write the NUL if the string length is 0
168     if(length == 0) {
169         return u_terminateChars(target, capacity, 0, &status);
170     }
171 
172     // if the codepage is the default, use our cache
173     // if it is an empty string, then use the "invariant character" conversion
174     if (codepage == 0) {
175         const char *defaultName = ucnv_getDefaultName();
176         if(UCNV_FAST_IS_UTF8(defaultName)) {
177             return toUTF8(start, length, target, capacity);
178         }
179         converter = u_getDefaultConverter(&status);
180     } else if (*codepage == 0) {
181         // use the "invariant characters" conversion
182         int32_t destLength;
183         if(length <= capacity) {
184             destLength = length;
185         } else {
186             destLength = capacity;
187         }
188         u_UCharsToChars(getArrayStart() + start, target, destLength);
189         return u_terminateChars(target, capacity, length, &status);
190     } else {
191         converter = ucnv_open(codepage, &status);
192     }
193 
194     length = doExtract(start, length, target, capacity, converter, status);
195 
196     // close the converter
197     if (codepage == 0) {
198         u_releaseDefaultConverter(converter);
199     } else {
200         ucnv_close(converter);
201     }
202 
203     return length;
204 }
205 
206 int32_t
extract(char * dest,int32_t destCapacity,UConverter * cnv,UErrorCode & errorCode) const207 UnicodeString::extract(char *dest, int32_t destCapacity,
208                        UConverter *cnv,
209                        UErrorCode &errorCode) const
210 {
211     if(U_FAILURE(errorCode)) {
212         return 0;
213     }
214 
215     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
216         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
217         return 0;
218     }
219 
220     // nothing to do?
221     if(isEmpty()) {
222         return u_terminateChars(dest, destCapacity, 0, &errorCode);
223     }
224 
225     // get the converter
226     UBool isDefaultConverter;
227     if(cnv==0) {
228         isDefaultConverter=TRUE;
229         cnv=u_getDefaultConverter(&errorCode);
230         if(U_FAILURE(errorCode)) {
231             return 0;
232         }
233     } else {
234         isDefaultConverter=FALSE;
235         ucnv_resetFromUnicode(cnv);
236     }
237 
238     // convert
239     int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
240 
241     // release the converter
242     if(isDefaultConverter) {
243         u_releaseDefaultConverter(cnv);
244     }
245 
246     return len;
247 }
248 
249 int32_t
doExtract(int32_t start,int32_t length,char * dest,int32_t destCapacity,UConverter * cnv,UErrorCode & errorCode) const250 UnicodeString::doExtract(int32_t start, int32_t length,
251                          char *dest, int32_t destCapacity,
252                          UConverter *cnv,
253                          UErrorCode &errorCode) const
254 {
255     if(U_FAILURE(errorCode)) {
256         if(destCapacity!=0) {
257             *dest=0;
258         }
259         return 0;
260     }
261 
262     const UChar *src=getArrayStart()+start, *srcLimit=src+length;
263     char *originalDest=dest;
264     const char *destLimit;
265 
266     if(destCapacity==0) {
267         destLimit=dest=0;
268     } else if(destCapacity==-1) {
269         // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
270         destLimit=(char*)U_MAX_PTR(dest);
271         // for NUL-termination, translate into highest int32_t
272         destCapacity=0x7fffffff;
273     } else {
274         destLimit=dest+destCapacity;
275     }
276 
277     // perform the conversion
278     ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
279     length=(int32_t)(dest-originalDest);
280 
281     // if an overflow occurs, then get the preflighting length
282     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
283         char buffer[1024];
284 
285         destLimit=buffer+sizeof(buffer);
286         do {
287             dest=buffer;
288             errorCode=U_ZERO_ERROR;
289             ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
290             length+=(int32_t)(dest-buffer);
291         } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
292     }
293 
294     return u_terminateChars(originalDest, destCapacity, length, &errorCode);
295 }
296 
297 void
doCodepageCreate(const char * codepageData,int32_t dataLength,const char * codepage)298 UnicodeString::doCodepageCreate(const char *codepageData,
299                                 int32_t dataLength,
300                                 const char *codepage)
301 {
302     // if there's nothing to convert, do nothing
303     if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
304         return;
305     }
306     if(dataLength == -1) {
307         dataLength = (int32_t)uprv_strlen(codepageData);
308     }
309 
310     UErrorCode status = U_ZERO_ERROR;
311 
312     // create the converter
313     // if the codepage is the default, use our cache
314     // if it is an empty string, then use the "invariant character" conversion
315     UConverter *converter;
316     if (codepage == 0) {
317         const char *defaultName = ucnv_getDefaultName();
318         if(UCNV_FAST_IS_UTF8(defaultName)) {
319             setToUTF8(StringPiece(codepageData, dataLength));
320             return;
321         }
322         converter = u_getDefaultConverter(&status);
323     } else if(*codepage == 0) {
324         // use the "invariant characters" conversion
325         if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
326             u_charsToUChars(codepageData, getArrayStart(), dataLength);
327             setLength(dataLength);
328         } else {
329             setToBogus();
330         }
331         return;
332     } else {
333         converter = ucnv_open(codepage, &status);
334     }
335 
336     // if we failed, set the appropriate flags and return
337     if(U_FAILURE(status)) {
338         setToBogus();
339         return;
340     }
341 
342     // perform the conversion
343     doCodepageCreate(codepageData, dataLength, converter, status);
344     if(U_FAILURE(status)) {
345         setToBogus();
346     }
347 
348     // close the converter
349     if(codepage == 0) {
350         u_releaseDefaultConverter(converter);
351     } else {
352         ucnv_close(converter);
353     }
354 }
355 
356 void
doCodepageCreate(const char * codepageData,int32_t dataLength,UConverter * converter,UErrorCode & status)357 UnicodeString::doCodepageCreate(const char *codepageData,
358                                 int32_t dataLength,
359                                 UConverter *converter,
360                                 UErrorCode &status)
361 {
362     if(U_FAILURE(status)) {
363         return;
364     }
365 
366     // set up the conversion parameters
367     const char *mySource     = codepageData;
368     const char *mySourceEnd  = mySource + dataLength;
369     UChar *array, *myTarget;
370 
371     // estimate the size needed:
372     int32_t arraySize;
373     if(dataLength <= US_STACKBUF_SIZE) {
374         // try to use the stack buffer
375         arraySize = US_STACKBUF_SIZE;
376     } else {
377         // 1.25 UChar's per source byte should cover most cases
378         arraySize = dataLength + (dataLength >> 2);
379     }
380 
381     // we do not care about the current contents
382     UBool doCopyArray = FALSE;
383     for(;;) {
384         if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
385             setToBogus();
386             break;
387         }
388 
389         // perform the conversion
390         array = getArrayStart();
391         myTarget = array + length();
392         ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
393             &mySource, mySourceEnd, 0, TRUE, &status);
394 
395         // update the conversion parameters
396         setLength((int32_t)(myTarget - array));
397 
398         // allocate more space and copy data, if needed
399         if(status == U_BUFFER_OVERFLOW_ERROR) {
400             // reset the error code
401             status = U_ZERO_ERROR;
402 
403             // keep the previous conversion results
404             doCopyArray = TRUE;
405 
406             // estimate the new size needed, larger than before
407             // try 2 UChar's per remaining source byte
408             arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
409         } else {
410             break;
411         }
412     }
413 }
414 
415 U_NAMESPACE_END
416 
417 #endif
418