1 /* 2 * Copyright (C) 2016 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <array> 18 #include <cstdint> 19 #include <cstdlib> 20 #include <cstring> 21 #include <string> 22 #include <unordered_map> 23 #include <unordered_set> 24 25 #include <androidfw/LocaleData.h> 26 27 namespace android { 28 29 #include "LocaleDataTables.cpp" 30 31 inline uint32_t packLocale(const char* language, const char* region) { 32 return (((uint8_t) language[0]) << 24u) | (((uint8_t) language[1]) << 16u) | 33 (((uint8_t) region[0]) << 8u) | ((uint8_t) region[1]); 34 } 35 36 inline uint32_t dropRegion(uint32_t packed_locale) { 37 return packed_locale & 0xFFFF0000LU; 38 } 39 40 inline bool hasRegion(uint32_t packed_locale) { 41 return (packed_locale & 0x0000FFFFLU) != 0; 42 } 43 44 const size_t SCRIPT_LENGTH = 4; 45 const size_t SCRIPT_PARENTS_COUNT = sizeof(SCRIPT_PARENTS)/sizeof(SCRIPT_PARENTS[0]); 46 const uint32_t PACKED_ROOT = 0; // to represent the root locale 47 48 uint32_t findParent(uint32_t packed_locale, const char* script) { 49 if (hasRegion(packed_locale)) { 50 for (size_t i = 0; i < SCRIPT_PARENTS_COUNT; i++) { 51 if (memcmp(script, SCRIPT_PARENTS[i].script, SCRIPT_LENGTH) == 0) { 52 auto map = SCRIPT_PARENTS[i].map; 53 auto lookup_result = map->find(packed_locale); 54 if (lookup_result != map->end()) { 55 return lookup_result->second; 56 } 57 break; 58 } 59 } 60 return dropRegion(packed_locale); 61 } 62 return PACKED_ROOT; 63 } 64 65 // Find the ancestors of a locale, and fill 'out' with it (assumes out has enough 66 // space). If any of the members of stop_list was seen, write it in the 67 // output but stop afterwards. 68 // 69 // This also outputs the index of the last written ancestor in the stop_list 70 // to stop_list_index, which will be -1 if it is not found in the stop_list. 71 // 72 // Returns the number of ancestors written in the output, which is always 73 // at least one. 74 // 75 // (If 'out' is nullptr, we do everything the same way but we simply don't write 76 // any results in 'out'.) 77 size_t findAncestors(uint32_t* out, ssize_t* stop_list_index, 78 uint32_t packed_locale, const char* script, 79 const uint32_t* stop_list, size_t stop_set_length) { 80 uint32_t ancestor = packed_locale; 81 size_t count = 0; 82 do { 83 if (out != nullptr) out[count] = ancestor; 84 count++; 85 for (size_t i = 0; i < stop_set_length; i++) { 86 if (stop_list[i] == ancestor) { 87 *stop_list_index = (ssize_t) i; 88 return count; 89 } 90 } 91 ancestor = findParent(ancestor, script); 92 } while (ancestor != PACKED_ROOT); 93 *stop_list_index = (ssize_t) -1; 94 return count; 95 } 96 97 size_t findDistance(uint32_t supported, 98 const char* script, 99 const uint32_t* request_ancestors, 100 size_t request_ancestors_count) { 101 ssize_t request_ancestors_index; 102 const size_t supported_ancestor_count = findAncestors( 103 nullptr, &request_ancestors_index, 104 supported, script, 105 request_ancestors, request_ancestors_count); 106 // Since both locales share the same root, there will always be a shared 107 // ancestor, so the distance in the parent tree is the sum of the distance 108 // of 'supported' to the lowest common ancestor (number of ancestors 109 // written for 'supported' minus 1) plus the distance of 'request' to the 110 // lowest common ancestor (the index of the ancestor in request_ancestors). 111 return supported_ancestor_count + request_ancestors_index - 1; 112 } 113 114 inline bool isRepresentative(uint32_t language_and_region, const char* script) { 115 const uint64_t packed_locale = ( 116 (((uint64_t) language_and_region) << 32u) | 117 (((uint64_t) script[0]) << 24u) | 118 (((uint64_t) script[1]) << 16u) | 119 (((uint64_t) script[2]) << 8u) | 120 ((uint64_t) script[3])); 121 122 return (REPRESENTATIVE_LOCALES.count(packed_locale) != 0); 123 } 124 125 const uint32_t US_SPANISH = 0x65735553LU; // es-US 126 const uint32_t MEXICAN_SPANISH = 0x65734D58LU; // es-MX 127 const uint32_t LATIN_AMERICAN_SPANISH = 0x6573A424LU; // es-419 128 129 // The two locales es-US and es-MX are treated as special fallbacks for es-419. 130 // If there is no es-419, they are considered its equivalent. 131 inline bool isSpecialSpanish(uint32_t language_and_region) { 132 return (language_and_region == US_SPANISH || language_and_region == MEXICAN_SPANISH); 133 } 134 135 int localeDataCompareRegions( 136 const char* left_region, const char* right_region, 137 const char* requested_language, const char* requested_script, 138 const char* requested_region) { 139 140 if (left_region[0] == right_region[0] && left_region[1] == right_region[1]) { 141 return 0; 142 } 143 uint32_t left = packLocale(requested_language, left_region); 144 uint32_t right = packLocale(requested_language, right_region); 145 const uint32_t request = packLocale(requested_language, requested_region); 146 147 // If one and only one of the two locales is a special Spanish locale, we 148 // replace it with es-419. We don't do the replacement if the other locale 149 // is already es-419, or both locales are special Spanish locales (when 150 // es-US is being compared to es-MX). 151 const bool leftIsSpecialSpanish = isSpecialSpanish(left); 152 const bool rightIsSpecialSpanish = isSpecialSpanish(right); 153 if (leftIsSpecialSpanish && !rightIsSpecialSpanish && right != LATIN_AMERICAN_SPANISH) { 154 left = LATIN_AMERICAN_SPANISH; 155 } else if (rightIsSpecialSpanish && !leftIsSpecialSpanish && left != LATIN_AMERICAN_SPANISH) { 156 right = LATIN_AMERICAN_SPANISH; 157 } 158 159 uint32_t request_ancestors[MAX_PARENT_DEPTH+1]; 160 ssize_t left_right_index; 161 // Find the parents of the request, but stop as soon as we saw left or right 162 const std::array<uint32_t, 2> left_and_right = {{left, right}}; 163 const size_t ancestor_count = findAncestors( 164 request_ancestors, &left_right_index, 165 request, requested_script, 166 left_and_right.data(), left_and_right.size()); 167 if (left_right_index == 0) { // We saw left earlier 168 return 1; 169 } 170 if (left_right_index == 1) { // We saw right earlier 171 return -1; 172 } 173 174 // If we are here, neither left nor right are an ancestor of the 175 // request. This means that all the ancestors have been computed and 176 // the last ancestor is just the language by itself. We will use the 177 // distance in the parent tree for determining the better match. 178 const size_t left_distance = findDistance( 179 left, requested_script, request_ancestors, ancestor_count); 180 const size_t right_distance = findDistance( 181 right, requested_script, request_ancestors, ancestor_count); 182 if (left_distance != right_distance) { 183 return (int) right_distance - (int) left_distance; // smaller distance is better 184 } 185 186 // If we are here, left and right are equidistant from the request. We will 187 // try and see if any of them is a representative locale. 188 const bool left_is_representative = isRepresentative(left, requested_script); 189 const bool right_is_representative = isRepresentative(right, requested_script); 190 if (left_is_representative != right_is_representative) { 191 return (int) left_is_representative - (int) right_is_representative; 192 } 193 194 // We have no way of figuring out which locale is a better match. For 195 // the sake of stability, we consider the locale with the lower region 196 // code (in dictionary order) better, with two-letter codes before 197 // three-digit codes (since two-letter codes are more specific). 198 return (int64_t) right - (int64_t) left; 199 } 200 201 void localeDataComputeScript(char out[4], const char* language, const char* region) { 202 if (language[0] == '\0') { 203 memset(out, '\0', SCRIPT_LENGTH); 204 return; 205 } 206 uint32_t lookup_key = packLocale(language, region); 207 auto lookup_result = LIKELY_SCRIPTS.find(lookup_key); 208 if (lookup_result == LIKELY_SCRIPTS.end()) { 209 // We couldn't find the locale. Let's try without the region 210 if (region[0] != '\0') { 211 lookup_key = dropRegion(lookup_key); 212 lookup_result = LIKELY_SCRIPTS.find(lookup_key); 213 if (lookup_result != LIKELY_SCRIPTS.end()) { 214 memcpy(out, SCRIPT_CODES[lookup_result->second], SCRIPT_LENGTH); 215 return; 216 } 217 } 218 // We don't know anything about the locale 219 memset(out, '\0', SCRIPT_LENGTH); 220 return; 221 } else { 222 // We found the locale. 223 memcpy(out, SCRIPT_CODES[lookup_result->second], SCRIPT_LENGTH); 224 } 225 } 226 227 const uint32_t ENGLISH_STOP_LIST[2] = { 228 0x656E0000LU, // en 229 0x656E8400LU, // en-001 230 }; 231 const char ENGLISH_CHARS[2] = {'e', 'n'}; 232 const char LATIN_CHARS[4] = {'L', 'a', 't', 'n'}; 233 234 bool localeDataIsCloseToUsEnglish(const char* region) { 235 const uint32_t locale = packLocale(ENGLISH_CHARS, region); 236 ssize_t stop_list_index; 237 findAncestors(nullptr, &stop_list_index, locale, LATIN_CHARS, ENGLISH_STOP_LIST, 2); 238 // A locale is like US English if we see "en" before "en-001" in its ancestor list. 239 return stop_list_index == 0; // 'en' is first in ENGLISH_STOP_LIST 240 } 241 242 } // namespace android 243