1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/strings/old_utf_string_conversions.h"
6 
7 #include <stdint.h>
8 
9 #include "base/strings/string_piece.h"
10 #include "base/strings/string_util.h"
11 #include "base/strings/utf_string_conversion_utils.h"
12 #include "build/build_config.h"
13 
14 namespace base_old {
15 
16 using base::IsStringASCII;
17 using base::ReadUnicodeCharacter;
18 using base::WriteUnicodeCharacter;
19 
// Empties |output| and reserves space for a UTF-8 result converted from the
// |src_len| code units at |src|. The reservation is only a guess based on the
// first code unit: one byte per unit if it is ASCII, three bytes per unit
// otherwise. Nothing is reserved for empty input.
template <typename CHAR>
void PrepareForUTF8Output(const CHAR* src,
                          size_t src_len,
                          std::string* output) {
  output->clear();
  if (src_len == 0)
    return;

  // An ASCII first unit suggests the whole input is ASCII (1 byte each);
  // anything else is assumed to need 3 bytes per character throughout.
  const bool starts_with_ascii = src[0] < 0x80;
  const size_t bytes_per_unit = starts_with_ascii ? 1 : 3;
  output->reserve(src_len * bytes_per_unit);
}
35 
// Empties |output| and reserves space for a UTF-16 or UTF-32 result converted
// from the |src_len| UTF-8 bytes at |src|. The reservation is a guess based on
// the first byte: one output unit per byte if it is ASCII, otherwise one
// output unit per two input bytes. Nothing is reserved for empty input.
template <typename STRING>
void PrepareForUTF16Or32Output(const char* src,
                               size_t src_len,
                               STRING* output) {
  output->clear();
  if (src_len == 0)
    return;

  // Inspect the byte as unsigned so that lead bytes >= 0x80 are not seen as
  // negative values when char is signed.
  const unsigned char first_byte = static_cast<unsigned char>(src[0]);
  // ASCII means a 1:1 correspondence; otherwise assume 2-byte sequences.
  const size_t estimated_units = (first_byte < 0x80) ? src_len : src_len / 2;
  output->reserve(estimated_units);
}
52 
53 namespace {
54 
55 // Generalized Unicode converter -----------------------------------------------
56 
// Transcodes the |src_len| code units at |src| into |output|, one character
// at a time, using ReadUnicodeCharacter() / WriteUnicodeCharacter(). This
// function does not clear |output| itself; the callers in this file reset it
// beforehand. Whenever a character cannot be read, U+FFFD is written in its
// place and conversion continues with the rest of the input.
//
// Returns true only if every character was read successfully.
//
// NOTE: |src_len| is narrowed to int32_t below, so inputs longer than
// INT32_MAX code units are not handled correctly.
template <typename SRC_CHAR, typename DEST_STRING>
bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) {
  // ICU requires 32-bit numbers.
  const int32_t length = static_cast<int32_t>(src_len);

  bool all_valid = true;
  int32_t index = 0;
  while (index < length) {
    uint32_t code_point;
    if (!ReadUnicodeCharacter(src, length, &index, &code_point)) {
      // Unreadable input: substitute the replacement character and remember
      // that the conversion was lossy.
      WriteUnicodeCharacter(0xFFFD, output);
      all_valid = false;
    } else {
      WriteUnicodeCharacter(code_point, output);
    }
    // The reader receives |index| by pointer and may move it within a
    // multi-unit sequence; step on to the next character from wherever it
    // left off.
    ++index;
  }

  return all_valid;
}
78 
79 }  // namespace
80 
81 // UTF-8 <-> Wide --------------------------------------------------------------
82 
WideToUTF8(const wchar_t * src,size_t src_len,std::string * output)83 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
84   if (IsStringASCII(std::wstring(src, src_len))) {
85     output->assign(src, src + src_len);
86     return true;
87   } else {
88     PrepareForUTF8Output(src, src_len, output);
89     return ConvertUnicode(src, src_len, output);
90   }
91 }
92 
WideToUTF8(const std::wstring & wide)93 std::string WideToUTF8(const std::wstring& wide) {
94   if (IsStringASCII(wide)) {
95     return std::string(wide.data(), wide.data() + wide.length());
96   }
97 
98   std::string ret;
99   PrepareForUTF8Output(wide.data(), wide.length(), &ret);
100   ConvertUnicode(wide.data(), wide.length(), &ret);
101   return ret;
102 }
103 
UTF8ToWide(const char * src,size_t src_len,std::wstring * output)104 bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
105   if (IsStringASCII(StringPiece(src, src_len))) {
106     output->assign(src, src + src_len);
107     return true;
108   } else {
109     PrepareForUTF16Or32Output(src, src_len, output);
110     return ConvertUnicode(src, src_len, output);
111   }
112 }
113 
UTF8ToWide(StringPiece utf8)114 std::wstring UTF8ToWide(StringPiece utf8) {
115   if (IsStringASCII(utf8)) {
116     return std::wstring(utf8.begin(), utf8.end());
117   }
118 
119   std::wstring ret;
120   PrepareForUTF16Or32Output(utf8.data(), utf8.length(), &ret);
121   ConvertUnicode(utf8.data(), utf8.length(), &ret);
122   return ret;
123 }
124 
125 // UTF-16 <-> Wide -------------------------------------------------------------
126 
127 #if defined(WCHAR_T_IS_UTF16)
128 
129 // When wide == UTF-16, then conversions are a NOP.
WideToUTF16(const wchar_t * src,size_t src_len,string16 * output)130 bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) {
131   output->assign(src, src_len);
132   return true;
133 }
134 
WideToUTF16(const std::wstring & wide)135 string16 WideToUTF16(const std::wstring& wide) {
136   return wide;
137 }
138 
UTF16ToWide(const char16 * src,size_t src_len,std::wstring * output)139 bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
140   output->assign(src, src_len);
141   return true;
142 }
143 
UTF16ToWide(const string16 & utf16)144 std::wstring UTF16ToWide(const string16& utf16) {
145   return utf16;
146 }
147 
148 #elif defined(WCHAR_T_IS_UTF32)
149 
WideToUTF16(const wchar_t * src,size_t src_len,string16 * output)150 bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) {
151   output->clear();
152   // Assume that normally we won't have any non-BMP characters so the counts
153   // will be the same.
154   output->reserve(src_len);
155   return ConvertUnicode(src, src_len, output);
156 }
157 
WideToUTF16(const std::wstring & wide)158 string16 WideToUTF16(const std::wstring& wide) {
159   string16 ret;
160   WideToUTF16(wide.data(), wide.length(), &ret);
161   return ret;
162 }
163 
UTF16ToWide(const char16 * src,size_t src_len,std::wstring * output)164 bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
165   output->clear();
166   // Assume that normally we won't have any non-BMP characters so the counts
167   // will be the same.
168   output->reserve(src_len);
169   return ConvertUnicode(src, src_len, output);
170 }
171 
UTF16ToWide(const string16 & utf16)172 std::wstring UTF16ToWide(const string16& utf16) {
173   std::wstring ret;
174   UTF16ToWide(utf16.data(), utf16.length(), &ret);
175   return ret;
176 }
177 
178 #endif  // defined(WCHAR_T_IS_UTF32)
179 
180 // UTF16 <-> UTF8 --------------------------------------------------------------
181 
182 #if defined(WCHAR_T_IS_UTF32)
183 
UTF8ToUTF16(const char * src,size_t src_len,string16 * output)184 bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) {
185   if (IsStringASCII(StringPiece(src, src_len))) {
186     output->assign(src, src + src_len);
187     return true;
188   } else {
189     PrepareForUTF16Or32Output(src, src_len, output);
190     return ConvertUnicode(src, src_len, output);
191   }
192 }
193 
UTF8ToUTF16(StringPiece utf8)194 string16 UTF8ToUTF16(StringPiece utf8) {
195   if (IsStringASCII(utf8)) {
196     return string16(utf8.begin(), utf8.end());
197   }
198 
199   string16 ret;
200   PrepareForUTF16Or32Output(utf8.data(), utf8.length(), &ret);
201   // Ignore the success flag of this call, it will do the best it can for
202   // invalid input, which is what we want here.
203   ConvertUnicode(utf8.data(), utf8.length(), &ret);
204   return ret;
205 }
206 
UTF16ToUTF8(const char16 * src,size_t src_len,std::string * output)207 bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) {
208   if (IsStringASCII(StringPiece16(src, src_len))) {
209     output->assign(src, src + src_len);
210     return true;
211   } else {
212     PrepareForUTF8Output(src, src_len, output);
213     return ConvertUnicode(src, src_len, output);
214   }
215 }
216 
UTF16ToUTF8(StringPiece16 utf16)217 std::string UTF16ToUTF8(StringPiece16 utf16) {
218   std::string ret;
219   // Ignore the success flag of this call, it will do the best it can for
220   // invalid input, which is what we want here.
221   UTF16ToUTF8(utf16.data(), utf16.length(), &ret);
222   return ret;
223 }
224 
225 #elif defined(WCHAR_T_IS_UTF16)
226 // Easy case since we can use the "wide" versions we already wrote above.
227 
UTF8ToUTF16(const char * src,size_t src_len,string16 * output)228 bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) {
229   return UTF8ToWide(src, src_len, output);
230 }
231 
UTF8ToUTF16(StringPiece utf8)232 string16 UTF8ToUTF16(StringPiece utf8) {
233   return UTF8ToWide(utf8);
234 }
235 
UTF16ToUTF8(const char16 * src,size_t src_len,std::string * output)236 bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) {
237   return WideToUTF8(src, src_len, output);
238 }
239 
UTF16ToUTF8(StringPiece16 utf16)240 std::string UTF16ToUTF8(StringPiece16 utf16) {
241   if (IsStringASCII(utf16))
242     return std::string(utf16.data(), utf16.data() + utf16.length());
243 
244   std::string ret;
245   PrepareForUTF8Output(utf16.data(), utf16.length(), &ret);
246   ConvertUnicode(utf16.data(), utf16.length(), &ret);
247   return ret;
248 }
249 
250 #endif
251 
ASCIIToUTF16(StringPiece ascii)252 string16 ASCIIToUTF16(StringPiece ascii) {
253   DCHECK(IsStringASCII(ascii)) << ascii;
254   return string16(ascii.begin(), ascii.end());
255 }
256 
UTF16ToASCII(StringPiece16 utf16)257 std::string UTF16ToASCII(StringPiece16 utf16) {
258   DCHECK(IsStringASCII(utf16)) << UTF16ToUTF8(utf16);
259   return std::string(utf16.begin(), utf16.end());
260 }
261 
262 }  // namespace base_old
263