1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "compile/Pseudolocalizer.h"
18 
19 #include "util/Util.h"
20 
21 using android::StringPiece;
22 
23 using namespace std::literals;
24 
25 namespace aapt {
26 
// Word list used as filler when a pseudolocalized string is expanded.
// PseudoGenerateExpansion() copies this text (repeating it when more is
// needed) and trims it at a word boundary. The contents are emitted verbatim
// at runtime, so the spelling "fiveteen" and the absence of "eighteen" are
// part of the generated output.
static constexpr auto kExpansionString =
    "one two three "
    "four five six seven eight nine ten eleven twelve thirteen "
    "fourteen fiveteen sixteen seventeen nineteen twenty"sv;

// Special unicode characters to override directionality of the words.
// Used by the bidi method around every word and placeholder.
static constexpr auto kRlm = "\u200f"sv;  // U+200F RIGHT-TO-LEFT MARK
static constexpr auto kRlo = "\u202e"sv;  // U+202E RIGHT-TO-LEFT OVERRIDE
static constexpr auto kPdf = "\u202c"sv;  // U+202C POP DIRECTIONAL FORMATTING

// Placeholder marks: the accent method puts these around a placeholder.
static constexpr auto kPlaceholderOpen = "\u00bb"sv;   // U+00BB
static constexpr auto kPlaceholderClose = "\u00ab"sv;  // U+00AB

// Delimiters of a brace-enclosed argument; Pseudolocalizer::Text() tracks
// their nesting depth to decide which parts of a string get transformed.
static const char kArgStart = '{';
static const char kArgEnd = '}';
44 
45 class PseudoMethodNone : public PseudoMethodImpl {
46  public:
Text(StringPiece text)47   std::string Text(StringPiece text) override {
48     return std::string(text);
49   }
Placeholder(StringPiece text)50   std::string Placeholder(StringPiece text) override {
51     return std::string(text);
52   }
53 };
54 
// Bidi method: wraps words and placeholders in the directionality characters
// kRlm / kRlo / kPdf. Both overrides are defined further down in this file.
class PseudoMethodBidi : public PseudoMethodImpl {
 public:
  // Wraps every whitespace-delimited word of `text`.
  std::string Text(StringPiece text) override;
  // Wraps the whole placeholder `text` as a single unit.
  std::string Placeholder(StringPiece text) override;
};
60 
61 class PseudoMethodAccent : public PseudoMethodImpl {
62  public:
PseudoMethodAccent()63   PseudoMethodAccent() : depth_(0), word_count_(0), length_(0) {}
64   std::string Start() override;
65   std::string End() override;
66   std::string Text(StringPiece text) override;
67   std::string Placeholder(StringPiece text) override;
68 
69  private:
70   size_t depth_;
71   size_t word_count_;
72   size_t length_;
73 };
74 
Pseudolocalizer(Method method)75 Pseudolocalizer::Pseudolocalizer(Method method) : last_depth_(0) {
76   SetMethod(method);
77 }
78 
SetMethod(Method method)79 void Pseudolocalizer::SetMethod(Method method) {
80   switch (method) {
81     case Method::kNone:
82       impl_ = util::make_unique<PseudoMethodNone>();
83       break;
84     case Method::kAccent:
85       impl_ = util::make_unique<PseudoMethodAccent>();
86       break;
87     case Method::kBidi:
88       impl_ = util::make_unique<PseudoMethodBidi>();
89       break;
90   }
91 }
92 
Text(StringPiece text)93 std::string Pseudolocalizer::Text(StringPiece text) {
94   std::string out;
95   size_t depth = last_depth_;
96   size_t lastpos, pos;
97   const size_t length = text.size();
98   const char* str = text.data();
99   bool escaped = false;
100   for (lastpos = pos = 0; pos < length; pos++) {
101     char16_t c = str[pos];
102     if (escaped) {
103       escaped = false;
104       continue;
105     }
106     if (c == '\'') {
107       escaped = true;
108       continue;
109     }
110 
111     if (c == kArgStart) {
112       depth++;
113     } else if (c == kArgEnd && depth) {
114       depth--;
115     }
116 
117     if (last_depth_ != depth || pos == length - 1) {
118       bool pseudo = ((last_depth_ % 2) == 0);
119       size_t nextpos = pos;
120       if (!pseudo || depth == last_depth_) {
121         nextpos++;
122       }
123       size_t size = nextpos - lastpos;
124       if (size) {
125         std::string chunk(text.substr(lastpos, size));
126         if (pseudo) {
127           chunk = impl_->Text(chunk);
128         } else if (str[lastpos] == kArgStart && str[nextpos - 1] == kArgEnd) {
129           chunk = impl_->Placeholder(chunk);
130         }
131         out.append(chunk);
132       }
133       if (pseudo && depth < last_depth_) {  // End of message
134         out.append(impl_->End());
135       } else if (!pseudo && depth > last_depth_) {  // Start of message
136         out.append(impl_->Start());
137       }
138       lastpos = nextpos;
139       last_depth_ = depth;
140     }
141   }
142   return out;
143 }
144 
/**
 * Maps an ASCII character to its pseudolocalized replacement.
 *
 * Every ASCII letter plus '!', '?' and '$' has an entry. The result is a
 * NUL-terminated string literal holding the replacement character.
 * 'q', 'v' and 'Q' are replaced by plain ASCII letters of the opposite case.
 *
 * @param c the character to look up.
 * @return the replacement string, or nullptr when `c` has no replacement and
 *         should be copied unchanged by the caller.
 */
static const char* PseudolocalizeChar(const char c) {
  switch (c) {
    case 'a':
      return "\u00e5";
    case 'b':
      return "\u0253";
    case 'c':
      return "\u00e7";
    case 'd':
      return "\u00f0";
    case 'e':
      return "\u00e9";
    case 'f':
      return "\u0192";
    case 'g':
      return "\u011d";
    case 'h':
      return "\u0125";
    case 'i':
      return "\u00ee";
    case 'j':
      return "\u0135";
    case 'k':
      return "\u0137";
    case 'l':
      return "\u013c";
    case 'm':
      return "\u1e3f";
    case 'n':
      return "\u00f1";
    case 'o':
      return "\u00f6";
    case 'p':
      return "\u00fe";
    case 'q':
      return "\u0051";
    case 'r':
      return "\u0155";
    case 's':
      return "\u0161";
    case 't':
      return "\u0163";
    case 'u':
      return "\u00fb";
    case 'v':
      return "\u0056";
    case 'w':
      return "\u0175";
    case 'x':
      return "\u0445";
    case 'y':
      return "\u00fd";
    case 'z':
      return "\u017e";
    case 'A':
      return "\u00c5";
    case 'B':
      return "\u03b2";
    case 'C':
      return "\u00c7";
    case 'D':
      return "\u00d0";
    case 'E':
      return "\u00c9";
    // 'F' used to be the only letter without an entry; U+0191 is the capital
    // counterpart of the U+0192 used for 'f'.
    case 'F':
      return "\u0191";
    case 'G':
      return "\u011c";
    case 'H':
      return "\u0124";
    case 'I':
      return "\u00ce";
    case 'J':
      return "\u0134";
    case 'K':
      return "\u0136";
    case 'L':
      return "\u013b";
    case 'M':
      return "\u1e3e";
    case 'N':
      return "\u00d1";
    case 'O':
      return "\u00d6";
    case 'P':
      return "\u00de";
    case 'Q':
      return "\u0071";
    case 'R':
      return "\u0154";
    case 'S':
      return "\u0160";
    case 'T':
      return "\u0162";
    case 'U':
      return "\u00db";
    case 'V':
      return "\u03bd";
    case 'W':
      return "\u0174";
    case 'X':
      return "\u00d7";
    case 'Y':
      return "\u00dd";
    case 'Z':
      return "\u017d";
    case '!':
      return "\u00a1";
    case '?':
      return "\u00bf";
    case '$':
      return "\u20ac";
    default:
      return nullptr;
  }
}
259 
// Reports whether `c` is one of the characters that PseudoMethodAccent::Text()
// accepts as the final character of a '%' placeholder.
static bool IsPossibleNormalPlaceholderEnd(const char c) {
  switch (c) {
    case 's': case 'S':
    case 'c': case 'C':
    case 'd':
    case 'o':
    case 'x': case 'X':
    case 'f':
    case 'e': case 'E':
    case 'g': case 'G':
    case 'a': case 'A':
    case 'b': case 'B':
    case 'h': case 'H':
    case '%':
    case 'n':
      return true;
    default:
      return false;
  }
}
308 
PseudoGenerateExpansion(const unsigned int length)309 static std::string PseudoGenerateExpansion(const unsigned int length) {
310   std::string result(kExpansionString);
311   if (result.size() < length) {
312     result += " ";
313     result += PseudoGenerateExpansion(length - result.size());
314   } else {
315     int ext = 0;
316     // Should contain only whole words, so looking for a space
317     {
318       const char* const s = result.data();
319       for (unsigned int i = length + 1; i < result.size(); ++i) {
320         ++ext;
321         if (s[i] == ' ') {
322           break;
323         }
324       }
325     }
326     result.resize(length + ext);
327   }
328   return result;
329 }
330 
Start()331 std::string PseudoMethodAccent::Start() {
332   std::string result;
333   if (depth_ == 0) {
334     result = "[";
335   }
336   word_count_ = length_ = 0;
337   depth_++;
338   return result;
339 }
340 
End()341 std::string PseudoMethodAccent::End() {
342   std::string result;
343   if (length_) {
344     result += " ";
345     result += PseudoGenerateExpansion(word_count_ > 3 ? length_ : length_ / 2);
346   }
347   word_count_ = length_ = 0;
348   depth_--;
349   if (depth_ == 0) {
350     result += "]";
351   }
352   return result;
353 }
354 
355 /**
356  * Converts characters so they look like they've been localized.
357  *
358  * Note: This leaves placeholder syntax untouched.
359  */
Text(StringPiece source)360 std::string PseudoMethodAccent::Text(StringPiece source) {
361   const char* s = source.data();
362   std::string result;
363   const size_t I = source.size();
364   bool lastspace = true;
365   for (size_t i = 0; i < I; i++) {
366     char c = s[i];
367     if (c == '%') {
368       // Placeholder syntax, no need to pseudolocalize
369       std::string chunk;
370       bool end = false;
371       chunk.append(&c, 1);
372       while (!end && i + 1 < I) {
373         ++i;
374         c = s[i];
375         chunk.append(&c, 1);
376         if (IsPossibleNormalPlaceholderEnd(c)) {
377           end = true;
378         } else if (i + 1 < I && c == 't') {
379           ++i;
380           c = s[i];
381           chunk.append(&c, 1);
382           end = true;
383         }
384       }
385       // Treat chunk as a placeholder unless it ends with %.
386       result += ((c == '%') ? chunk : Placeholder(chunk));
387     } else if (c == '<' || c == '&') {
388       // html syntax, no need to pseudolocalize
389       bool tag_closed = false;
390       while (!tag_closed && i < I) {
391         if (c == '&') {
392           std::string escape_text;
393           escape_text.append(&c, 1);
394           bool end = false;
395           size_t html_code_pos = i;
396           while (!end && html_code_pos < I) {
397             ++html_code_pos;
398             c = s[html_code_pos];
399             escape_text.append(&c, 1);
400             // Valid html code
401             if (c == ';') {
402               end = true;
403               i = html_code_pos;
404             }
405             // Wrong html code
406             else if (!((c == '#' || (c >= 'a' && c <= 'z') ||
407                         (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')))) {
408               end = true;
409             }
410           }
411           result += escape_text;
412           if (escape_text != "&lt;") {
413             tag_closed = true;
414           }
415           continue;
416         }
417         if (c == '>') {
418           tag_closed = true;
419           result.append(&c, 1);
420           continue;
421         }
422         result.append(&c, 1);
423         i++;
424         c = s[i];
425       }
426     } else {
427       // This is a pure text that should be pseudolocalized
428       const char* p = PseudolocalizeChar(c);
429       if (p != nullptr) {
430         result += p;
431       } else {
432         bool space = isspace(c);
433         if (lastspace && !space) {
434           word_count_++;
435         }
436         lastspace = space;
437         result.append(&c, 1);
438       }
439       // Count only pseudolocalizable chars and delimiters
440       length_++;
441     }
442   }
443   return result;
444 }
445 
Placeholder(StringPiece source)446 std::string PseudoMethodAccent::Placeholder(StringPiece source) {
447   // Surround a placeholder with brackets
448   return (std::string(kPlaceholderOpen) += source) += kPlaceholderClose;
449 }
450 
Text(StringPiece source)451 std::string PseudoMethodBidi::Text(StringPiece source) {
452   const char* s = source.data();
453   std::string result;
454   bool lastspace = true;
455   bool space = true;
456   bool escape = false;
457   const char ESCAPE_CHAR = '\\';
458   for (size_t i = 0; i < source.size(); i++) {
459     char c = s[i];
460     if (!escape && c == ESCAPE_CHAR) {
461       escape = true;
462       continue;
463     }
464     space = (!escape && isspace(c)) || (escape && (c == 'n' || c == 't'));
465     if (lastspace && !space) {
466       // Word start
467       (result += kRlm) += kRlo;
468     } else if (!lastspace && space) {
469       // Word end
470       (result += kPdf) += kRlm;
471     }
472     lastspace = space;
473     if (escape) {
474       result.append(&ESCAPE_CHAR, 1);
475       escape=false;
476     }
477     result.append(&c, 1);
478   }
479   if (!lastspace) {
480     // End of last word
481     (result += kPdf) += kRlm;
482   }
483   return result;
484 }
485 
Placeholder(StringPiece source)486 std::string PseudoMethodBidi::Placeholder(StringPiece source) {
487   // Surround a placeholder with directionality change sequence
488   return (((std::string(kRlm) += kRlo) += source) += kPdf) += kRlm;
489 }
490 
491 }  // namespace aapt
492