1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "compile/Pseudolocalizer.h"
18
19 #include "util/Util.h"
20
21 using android::StringPiece;
22
23 using namespace std::literals;
24
25 namespace aapt {
26
// Filler sentence that expansion text is taken from. The wording is emitted
// verbatim into generated strings, so its exact spelling (including
// "fiveteen" and the absence of "eighteen") is deliberately left untouched.
static constexpr std::string_view kExpansionString =
    "one two three "
    "four five six seven eight nine ten eleven twelve thirteen "
    "fourteen fiveteen sixteen seventeen nineteen twenty";

// Unicode directional controls used to force right-to-left rendering of words.
static constexpr std::string_view kRlm = "\u200f";  // U+200F RIGHT-TO-LEFT MARK
static constexpr std::string_view kRlo = "\u202e";  // U+202E RIGHT-TO-LEFT OVERRIDE
static constexpr std::string_view kPdf = "\u202c";  // U+202C POP DIRECTIONAL FORMATTING

// Marks written before and after a placeholder by the accent method.
static constexpr std::string_view kPlaceholderOpen = "\u00bb";   // U+00BB
static constexpr std::string_view kPlaceholderClose = "\u00ab";  // U+00AB

// Characters that raise / lower the nesting depth tracked by
// Pseudolocalizer::Text.
static constexpr char kArgStart = '{';
static constexpr char kArgEnd = '}';
44
45 class PseudoMethodNone : public PseudoMethodImpl {
46 public:
Text(StringPiece text)47 std::string Text(StringPiece text) override {
48 return std::string(text);
49 }
Placeholder(StringPiece text)50 std::string Placeholder(StringPiece text) override {
51 return std::string(text);
52 }
53 };
54
55 class PseudoMethodBidi : public PseudoMethodImpl {
56 public:
57 std::string Text(StringPiece text) override;
58 std::string Placeholder(StringPiece text) override;
59 };
60
61 class PseudoMethodAccent : public PseudoMethodImpl {
62 public:
PseudoMethodAccent()63 PseudoMethodAccent() : depth_(0), word_count_(0), length_(0) {}
64 std::string Start() override;
65 std::string End() override;
66 std::string Text(StringPiece text) override;
67 std::string Placeholder(StringPiece text) override;
68
69 private:
70 size_t depth_;
71 size_t word_count_;
72 size_t length_;
73 };
74
Pseudolocalizer(Method method)75 Pseudolocalizer::Pseudolocalizer(Method method) : last_depth_(0) {
76 SetMethod(method);
77 }
78
SetMethod(Method method)79 void Pseudolocalizer::SetMethod(Method method) {
80 switch (method) {
81 case Method::kNone:
82 impl_ = util::make_unique<PseudoMethodNone>();
83 break;
84 case Method::kAccent:
85 impl_ = util::make_unique<PseudoMethodAccent>();
86 break;
87 case Method::kBidi:
88 impl_ = util::make_unique<PseudoMethodBidi>();
89 break;
90 }
91 }
92
Text(StringPiece text)93 std::string Pseudolocalizer::Text(StringPiece text) {
94 std::string out;
95 size_t depth = last_depth_;
96 size_t lastpos, pos;
97 const size_t length = text.size();
98 const char* str = text.data();
99 bool escaped = false;
100 for (lastpos = pos = 0; pos < length; pos++) {
101 char16_t c = str[pos];
102 if (escaped) {
103 escaped = false;
104 continue;
105 }
106 if (c == '\'') {
107 escaped = true;
108 continue;
109 }
110
111 if (c == kArgStart) {
112 depth++;
113 } else if (c == kArgEnd && depth) {
114 depth--;
115 }
116
117 if (last_depth_ != depth || pos == length - 1) {
118 bool pseudo = ((last_depth_ % 2) == 0);
119 size_t nextpos = pos;
120 if (!pseudo || depth == last_depth_) {
121 nextpos++;
122 }
123 size_t size = nextpos - lastpos;
124 if (size) {
125 std::string chunk(text.substr(lastpos, size));
126 if (pseudo) {
127 chunk = impl_->Text(chunk);
128 } else if (str[lastpos] == kArgStart && str[nextpos - 1] == kArgEnd) {
129 chunk = impl_->Placeholder(chunk);
130 }
131 out.append(chunk);
132 }
133 if (pseudo && depth < last_depth_) { // End of message
134 out.append(impl_->End());
135 } else if (!pseudo && depth > last_depth_) { // Start of message
136 out.append(impl_->Start());
137 }
138 lastpos = nextpos;
139 last_depth_ = depth;
140 }
141 }
142 return out;
143 }
144
// Returns the replacement string for |c|, or nullptr when |c| has no
// replacement. Replacements exist for 'a'-'z', for 'A'-'Z' except 'F', and
// for '!', '?' and '$'.
static const char* PseudolocalizeChar(const char c) {
  // Replacements for 'a' .. 'z', indexed by distance from 'a'.
  static constexpr const char* kLowerCase[] = {
      "\u00e5",  // a
      "\u0253",  // b
      "\u00e7",  // c
      "\u00f0",  // d
      "\u00e9",  // e
      "\u0192",  // f
      "\u011d",  // g
      "\u0125",  // h
      "\u00ee",  // i
      "\u0135",  // j
      "\u0137",  // k
      "\u013c",  // l
      "\u1e3f",  // m
      "\u00f1",  // n
      "\u00f6",  // o
      "\u00fe",  // p
      "\u0051",  // q
      "\u0155",  // r
      "\u0161",  // s
      "\u0163",  // t
      "\u00fb",  // u
      "\u0056",  // v
      "\u0175",  // w
      "\u0445",  // x
      "\u00fd",  // y
      "\u017e",  // z
  };
  // Replacements for 'A' .. 'Z', indexed by distance from 'A'.
  static constexpr const char* kUpperCase[] = {
      "\u00c5",  // A
      "\u03b2",  // B
      "\u00c7",  // C
      "\u00d0",  // D
      "\u00c9",  // E
      nullptr,   // F has no replacement
      "\u011c",  // G
      "\u0124",  // H
      "\u00ce",  // I
      "\u0134",  // J
      "\u0136",  // K
      "\u013b",  // L
      "\u1e3e",  // M
      "\u00d1",  // N
      "\u00d6",  // O
      "\u00de",  // P
      "\u0071",  // Q
      "\u0154",  // R
      "\u0160",  // S
      "\u0162",  // T
      "\u00db",  // U
      "\u03bd",  // V
      "\u0174",  // W
      "\u00d7",  // X
      "\u00dd",  // Y
      "\u017d",  // Z
  };
  static_assert(sizeof(kLowerCase) / sizeof(kLowerCase[0]) == 26, "one entry per lower-case letter");
  static_assert(sizeof(kUpperCase) / sizeof(kUpperCase[0]) == 26, "one entry per upper-case letter");

  if (c >= 'a' && c <= 'z') {
    return kLowerCase[c - 'a'];
  }
  if (c >= 'A' && c <= 'Z') {
    return kUpperCase[c - 'A'];
  }
  if (c == '!') {
    return "\u00a1";
  }
  if (c == '?') {
    return "\u00bf";
  }
  if (c == '$') {
    return "\u20ac";
  }
  return nullptr;
}
259
// Returns true when |c| is one of the characters that terminates a '%'
// placeholder: s S c C d o x X f e E g G a A b B h H % n.
static bool IsPossibleNormalPlaceholderEnd(const char c) {
  switch (c) {
    case 's': case 'S':
    case 'c': case 'C':
    case 'd':
    case 'o':
    case 'x': case 'X':
    case 'f':
    case 'e': case 'E':
    case 'g': case 'G':
    case 'a': case 'A':
    case 'b': case 'B':
    case 'h': case 'H':
    case '%':
    case 'n':
      return true;
  }
  return false;
}
308
PseudoGenerateExpansion(const unsigned int length)309 static std::string PseudoGenerateExpansion(const unsigned int length) {
310 std::string result(kExpansionString);
311 if (result.size() < length) {
312 result += " ";
313 result += PseudoGenerateExpansion(length - result.size());
314 } else {
315 int ext = 0;
316 // Should contain only whole words, so looking for a space
317 {
318 const char* const s = result.data();
319 for (unsigned int i = length + 1; i < result.size(); ++i) {
320 ++ext;
321 if (s[i] == ' ') {
322 break;
323 }
324 }
325 }
326 result.resize(length + ext);
327 }
328 return result;
329 }
330
Start()331 std::string PseudoMethodAccent::Start() {
332 std::string result;
333 if (depth_ == 0) {
334 result = "[";
335 }
336 word_count_ = length_ = 0;
337 depth_++;
338 return result;
339 }
340
End()341 std::string PseudoMethodAccent::End() {
342 std::string result;
343 if (length_) {
344 result += " ";
345 result += PseudoGenerateExpansion(word_count_ > 3 ? length_ : length_ / 2);
346 }
347 word_count_ = length_ = 0;
348 depth_--;
349 if (depth_ == 0) {
350 result += "]";
351 }
352 return result;
353 }
354
355 /**
356 * Converts characters so they look like they've been localized.
357 *
358 * Note: This leaves placeholder syntax untouched.
359 */
Text(StringPiece source)360 std::string PseudoMethodAccent::Text(StringPiece source) {
361 const char* s = source.data();
362 std::string result;
363 const size_t I = source.size();
364 bool lastspace = true;
365 for (size_t i = 0; i < I; i++) {
366 char c = s[i];
367 if (c == '%') {
368 // Placeholder syntax, no need to pseudolocalize
369 std::string chunk;
370 bool end = false;
371 chunk.append(&c, 1);
372 while (!end && i + 1 < I) {
373 ++i;
374 c = s[i];
375 chunk.append(&c, 1);
376 if (IsPossibleNormalPlaceholderEnd(c)) {
377 end = true;
378 } else if (i + 1 < I && c == 't') {
379 ++i;
380 c = s[i];
381 chunk.append(&c, 1);
382 end = true;
383 }
384 }
385 // Treat chunk as a placeholder unless it ends with %.
386 result += ((c == '%') ? chunk : Placeholder(chunk));
387 } else if (c == '<' || c == '&') {
388 // html syntax, no need to pseudolocalize
389 bool tag_closed = false;
390 while (!tag_closed && i < I) {
391 if (c == '&') {
392 std::string escape_text;
393 escape_text.append(&c, 1);
394 bool end = false;
395 size_t html_code_pos = i;
396 while (!end && html_code_pos < I) {
397 ++html_code_pos;
398 c = s[html_code_pos];
399 escape_text.append(&c, 1);
400 // Valid html code
401 if (c == ';') {
402 end = true;
403 i = html_code_pos;
404 }
405 // Wrong html code
406 else if (!((c == '#' || (c >= 'a' && c <= 'z') ||
407 (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')))) {
408 end = true;
409 }
410 }
411 result += escape_text;
412 if (escape_text != "<") {
413 tag_closed = true;
414 }
415 continue;
416 }
417 if (c == '>') {
418 tag_closed = true;
419 result.append(&c, 1);
420 continue;
421 }
422 result.append(&c, 1);
423 i++;
424 c = s[i];
425 }
426 } else {
427 // This is a pure text that should be pseudolocalized
428 const char* p = PseudolocalizeChar(c);
429 if (p != nullptr) {
430 result += p;
431 } else {
432 bool space = isspace(c);
433 if (lastspace && !space) {
434 word_count_++;
435 }
436 lastspace = space;
437 result.append(&c, 1);
438 }
439 // Count only pseudolocalizable chars and delimiters
440 length_++;
441 }
442 }
443 return result;
444 }
445
Placeholder(StringPiece source)446 std::string PseudoMethodAccent::Placeholder(StringPiece source) {
447 // Surround a placeholder with brackets
448 return (std::string(kPlaceholderOpen) += source) += kPlaceholderClose;
449 }
450
Text(StringPiece source)451 std::string PseudoMethodBidi::Text(StringPiece source) {
452 const char* s = source.data();
453 std::string result;
454 bool lastspace = true;
455 bool space = true;
456 bool escape = false;
457 const char ESCAPE_CHAR = '\\';
458 for (size_t i = 0; i < source.size(); i++) {
459 char c = s[i];
460 if (!escape && c == ESCAPE_CHAR) {
461 escape = true;
462 continue;
463 }
464 space = (!escape && isspace(c)) || (escape && (c == 'n' || c == 't'));
465 if (lastspace && !space) {
466 // Word start
467 (result += kRlm) += kRlo;
468 } else if (!lastspace && space) {
469 // Word end
470 (result += kPdf) += kRlm;
471 }
472 lastspace = space;
473 if (escape) {
474 result.append(&ESCAPE_CHAR, 1);
475 escape=false;
476 }
477 result.append(&c, 1);
478 }
479 if (!lastspace) {
480 // End of last word
481 (result += kPdf) += kRlm;
482 }
483 return result;
484 }
485
Placeholder(StringPiece source)486 std::string PseudoMethodBidi::Placeholder(StringPiece source) {
487 // Surround a placeholder with directionality change sequence
488 return (((std::string(kRlm) += kRlo) += source) += kPdf) += kRlm;
489 }
490
491 } // namespace aapt
492