1 package org.unicode.cldr.json;
2 
3 import java.util.HashSet;
4 import java.util.Set;
5 import java.util.regex.Pattern;
6 
7 import org.unicode.cldr.util.Builder;
8 import org.unicode.cldr.util.CLDRFile;
9 import org.unicode.cldr.util.PatternCache;
10 
11 import com.google.common.collect.ImmutableSet;
12 
13 class LdmlConvertRules {
14 
15     /** File sets that will not be processed in JSON transformation. */
16     public static final ImmutableSet<String> IGNORE_FILE_SET = ImmutableSet.of("attributeValueValidity", "coverageLevels", "postalCodeData", "pluralRanges",
17         "subdivisions");
18 
19     /**
20      * The attribute list that should become part of the name in form of
21      * name-(attribute)-(value).
22      * [parent_element]:[element]:[attribute]
23      */
24     // common/main
25     static final ImmutableSet<String> NAME_PART_DISTINGUISHING_ATTR_SET = ImmutableSet.of(
26         "monthWidth:month:yeartype",
27         "characters:parseLenients:scope",
28         "dateFormat:pattern:numbers",
29         "currencyFormats:unitPattern:count",
30         "currency:displayName:count",
31         "numbers:symbols:numberSystem",
32         "numbers:decimalFormats:numberSystem",
33         "numbers:currencyFormats:numberSystem",
34         "numbers:percentFormats:numberSystem",
35         "numbers:scientificFormats:numberSystem",
36         "numbers:miscPatterns:numberSystem",
37         "minimalPairs:pluralMinimalPairs:count",
38         "territoryContainment:group:status",
39         "decimalFormat:pattern:count",
40         "currencyFormat:pattern:count",
41         "unit:unitPattern:count",
42         "field:relative:type",
43         "field:relativeTime:type",
44         "relativeTime:relativeTimePattern:count",
45         "availableFormats:dateFormatItem:count",
46         "listPatterns:listPattern:type",
47         "timeZoneNames:regionFormat:type",
48         "units:durationUnit:type",
49         "weekData:minDays:territories",
50         "weekData:firstDay:territories",
51         "weekData:weekendStart:territories",
52         "weekData:weekendEnd:territories",
53         "unitPreferenceDataData:unitPreferences:category",
54         "measurementData:measurementSystem:category",
55         "supplemental:plurals:type",
56         "pluralRules:pluralRule:count",
57         "languageMatches:languageMatch:desired");
58 
59     /**
60      * The set of attributes that should become part of the name in form of
61      * name-(attribute)-(value).
62      */
63 
64     /**
65      * Following is a list of element:attribute pair. These attributes should be
66      * treated as values. For example,
67      * <type type="arab" key="numbers">Arabic-Indic Digits</type>
68      * should be really converted as,
69      * "arab": {
70      * "_value": "Arabic-Indic Digits",
71      * "_key": "numbers"
72      * }
73      */
74     static final ImmutableSet<String> ATTR_AS_VALUE_SET = ImmutableSet.of(
75 
76         // in common/supplemental/dayPeriods.xml
77         "dayPeriodRules:dayPeriodRule:from",
78 
79         // in common/supplemental/likelySubtags.xml
80         "likelySubtags:likelySubtag:to",
81 
82         // in common/supplemental/metaZones.xml
83         "timezone:usesMetazone:mzone",
84         // Only the current usesMetazone will be kept, it is not necessary to keep
85         // "to" and "from" attributes to make key unique. This is needed as their
86         // value is not good if used as key.
87         "timezone:usesMetazone:to",
88         "timezone:usesMetazone:from",
89 
90         "mapTimezones:mapZone:other",
91         "mapTimezones:mapZone:type",
92         "mapTimezones:mapZone:territory",
93 
94         // in common/supplemental/numberingSystems.xml
95         "numberingSystems:numberingSystem:type",
96 
97         // in common/supplemental/supplementalData.xml
98         "region:currency:from",
99         "region:currency:to",
100         "region:currency:tender",
101         "calendar:calendarSystem:type",
102         "codeMappings:territoryCodes:numeric",
103         "codeMappings:territoryCodes:alpha3",
104         "codeMappings:currencyCodes:numeric",
105         "timeData:hours:allowed",
106         "timeData:hours:preferred",
107         // common/supplemental/supplementalMetaData.xml
108         "validity:variable:type",
109         "deprecated:deprecatedItems:elements",
110         "deprecated:deprecatedItems:attributes",
111         "deprecated:deprecatedItems:type",
112 
113         // in common/supplemental/telephoneCodeData.xml
114         "codesByTerritory:telephoneCountryCode:code",
115 
116         // in common/supplemental/windowsZones.xml
117         "mapTimezones:mapZone:other",
118 
119         // in common/bcp47/*.xml
120         "keyword:key:alias",
121         "key:type:alias",
122         "key:type:name",
123 
124         // identity elements
125         "identity:language:type",
126         "identity:script:type",
127         "identity:territory:type",
128         "identity:variant:type");
129 
130     /**
131      * The set of element:attribute pair in which the attribute should be
132      * treated as value. All the attribute here are non-distinguishing attributes.
133      */
134 
135     /**
136      * For those attributes that are treated as values, they taken the form of
137      * element_name: { ..., attribute: value, ...}
138      * This is desirable as an element may have several attributes that are
139      * treated as values. But in some cases, there is one such attribute only,
140      * and it is more desirable to convert
141      * element_name: { attribute: value}
142      * to
143      * element_name: value
144      * With a solid example,
145      * <likelySubtag from="zh" to="zh_Hans_CN" />
146      * distinguishing attr "from" will become the key, its better to
147      * omit "to" and have this simple mapping:
148      * "zh" : "zh_Hans_CN",
149      */
150     static final ImmutableSet<String> COMPACTABLE_ATTR_AS_VALUE_SET = ImmutableSet.of(
151         // common/main
152         "calendars:default:choice",
153         "dateFormats:default:choice",
154         "months:default:choice",
155         "monthContext:default:choice",
156         "days:default:choice",
157         "dayContext:default:choice",
158         "timeFormats:default:choice",
159         "dateTimeFormats:default:choice",
160         "timeZoneNames:singleCountries:list",
161 
162         //rbnf
163         "ruleset:rbnfrule:value",
164         // common/supplemental
165         "likelySubtags:likelySubtag:to",
166         //"territoryContainment:group:type",
167         "calendar:calendarSystem:type",
168         "calendarPreferenceData:calendarPreference:ordering",
169         "codesByTerritory:telephoneCountryCode:code",
170 
171         // common/collation
172         "collations:default:choice",
173 
174         // identity elements
175         "identity:language:type",
176         "identity:script:type",
177         "identity:territory:type",
178         "identity:variant:type");
179 
180     /**
181      * The set of attributes that should be treated as value, and reduce to
182      * simple value only form.
183      */
184 
185     /**
186      * Anonymous key name.
187      */
188     public static final String ANONYMOUS_KEY = "_";
189 
190     /**
191      * Check if the attribute should be suppressed.
192      *
193      * Right now only "_q" is suppressed. In most cases array is used and there
194      * is no need for this information. In other cases, order is irrelevant.
195      *
196      * @return True if the attribute should be suppressed.
197      */
IsSuppresedAttr(String attr)198     public static boolean IsSuppresedAttr(String attr) {
199         return attr.endsWith("_q") || attr.endsWith("-q");
200     }
201 
202     /**
203      * The set of attributes that should be ignored in the conversion process.
204      */
205     public static final ImmutableSet<String> IGNORABLE_NONDISTINGUISHING_ATTR_SET = ImmutableSet.of("draft", "references");
206 
207     /**
208      * List of attributes that should be suppressed.
209      * This list comes form cldr/common/supplemental/supplementalMetadata. Each
210      * three of them is a group, they are for element, value and attribute.
211      * If the specified attribute appears in specified element with specified =
212      * value, it should be suppressed.
213      */
214     public static final String[] ATTR_SUPPRESS_LIST = {
215         // common/main
216         "dateFormat", "standard", "type",
217         "dateTimeFormat", "standard", "type",
218         "timeFormat", "standard", "type",
219         "decimalFormat", "standard", "type",
220         "percentFormat", "standard", "type",
221         "scientificFormat", "standard", "type",
222         "pattern", "standard", "type",
223     };
224 
225     /**
226      * This is a simple class to hold the splittable attribute specification.
227      */
228     public static class SplittableAttributeSpec {
229         public String element;
230         public String attribute;
231         public String attrAsValueAfterSplit;
232 
SplittableAttributeSpec(String el, String attr, String av)233         SplittableAttributeSpec(String el, String attr, String av) {
234             element = el;
235             attribute = attr;
236             attrAsValueAfterSplit = av;
237         }
238     }
239 
240     /**
241      * List of attributes that has value that can be split. Each two of them is a
242      * group, and represent element and value. Occurrences of such match should
243      * lead to creation of multiple node.
244      * Example:
245      * <weekendStart day="thu" territories="DZ KW OM SA SD YE AF IR"/>
246      * should be treated as if following node is encountered.
247      * <weekendStart day="thu" territories="DZ"/>
248      * <weekendStart day="thu" territories="KW"/>
249      * <weekendStart day="thu" territories="OM"/>
250      * <weekendStart day="thu" territories="SA"/>
251      * <weekendStart day="thu" territories="SD"/>
252      * <weekendStart day="thu" territories="YE"/>
253      * <weekendStart day="thu" territories="AF"/>
254      * <weekendStart day="thu" territories="IR"/>
255      */
256     public static final SplittableAttributeSpec[] SPLITTABLE_ATTRS = {
257         new SplittableAttributeSpec("calendarPreference", "territories", null),
258         new SplittableAttributeSpec("pluralRules", "locales", null),
259         new SplittableAttributeSpec("minDays", "territories", "count"),
260         new SplittableAttributeSpec("firstDay", "territories", "day"),
261         new SplittableAttributeSpec("weekendStart", "territories", "day"),
262         new SplittableAttributeSpec("weekendEnd", "territories", "day"),
263         new SplittableAttributeSpec("measurementSystem", "territories", "type"),
264         new SplittableAttributeSpec("measurementSystem-category-temperature", "territories", "type"),
265         new SplittableAttributeSpec("paperSize", "territories", "type"),
266         new SplittableAttributeSpec("parentLocale", "locales", "parent"),
267         new SplittableAttributeSpec("hours", "regions", null),
268         new SplittableAttributeSpec("dayPeriodRules", "locales", null),
269         // new SplittableAttributeSpec("group", "contains", "group"),
270         new SplittableAttributeSpec("personList", "locales", "type"),
271         new SplittableAttributeSpec("unitPreference", "regions", null)
272     };
273 
274     /**
275      * The set that contains all timezone type of elements.
276      */
277     public static final Set<String> TIMEZONE_ELEMENT_NAME_SET = Builder.with(new HashSet<String>())
278         .add("zone").add("timezone")
279         .add("zoneItem").add("typeMap").freeze();
280 
281     /**
282      * There are a handful of attribute values that are more properly represented as an array of strings rather than
283      * as a single string.
284      */
285     public static final Set<String> ATTRVALUE_AS_ARRAY_SET = Builder.with(new HashSet<String>())
286         .add("territories").add("scripts").add("contains").freeze();
287 
288     /**
289      * Following is the list of elements that need to be sorted before output.
290      *
291      * Time zone item is split to multiple level, and each level should be
292      * grouped together. The locale list in "dayPeriodRule" could be split to
293      * multiple items, and items for each locale should be grouped together.
294      */
295     public static final String[] ELEMENT_NEED_SORT = {
296         "zone", "timezone", "zoneItem", "typeMap", "dayPeriodRule",
297         "pluralRules", "personList", "calendarPreferenceData", "character-fallback", "types", "timeData", "minDays",
298         "firstDay", "weekendStart", "weekendEnd", "measurementData", "measurementSystem"
299     };
300 
301     /**
302      * Some elements in CLDR has multiple children of the same type of element.
303      * We would like to treat them as array.
304      */
305     public static final Pattern ARRAY_ITEM_PATTERN = PatternCache.get(
306         "(.*/collation[^/]*/rules[^/]*/" +
307             "|.*/character-fallback[^/]*/character[^/]*/" +
308             "|.*/rbnfrule[^/]*/" +
309             "|.*/ruleset[^/]*/" +
310             "|.*/languageMatching[^/]*/languageMatches[^/]*/" +
311             "|.*/windowsZones[^/]*/mapTimezones[^/]*/" +
312             "|.*/metaZones[^/]*/mapTimezones[^/]*/" +
313             "|.*/segmentation[^/]*/variables[^/]*/" +
314             "|.*/segmentation[^/]*/suppressions[^/]*/" +
315             "|.*/transform[^/]*/tRules[^/]*/" +
316             "|.*/region/region[^/]*/" +
317             "|.*/keyword[^/]*/key[^/]*/" +
318             "|.*/telephoneCodeData[^/]*/codesByTerritory[^/]*/" +
319             "|.*/metazoneInfo[^/]*/timezone\\[[^\\]]*\\]/" +
320             "|.*/metadata[^/]*/validity[^/]*/" +
321             "|.*/metadata[^/]*/suppress[^/]*/" +
322             "|.*/metadata[^/]*/deprecated[^/]*/" +
323             ")(.*)");
324 
325     /**
326      * Number elements without a numbering system are there only for compatibility purposes.
327      * We automatically suppress generation of JSON objects for them.
328      */
329     public static final Pattern NO_NUMBERING_SYSTEM_PATTERN = Pattern
330         .compile("//ldml/numbers/(symbols|(decimal|percent|scientific|currency)Formats)/.*");
331     public static final Pattern NUMBERING_SYSTEM_PATTERN = Pattern
332         .compile("//ldml/numbers/(symbols|miscPatterns|(decimal|percent|scientific|currency)Formats)\\[@numberSystem=\"([^\"]++)\"\\]/.*");
333     public static final String[] ACTIVE_NUMBERING_SYSTEM_XPATHS = {
334         "//ldml/numbers/defaultNumberingSystem",
335         "//ldml/numbers/otherNumberingSystems/native",
336         "//ldml/numbers/otherNumberingSystems/traditional",
337         "//ldml/numbers/otherNumberingSystems/finance"
338     };
339 
340     /**
341      * Root language id pattern should be discarded in all locales except root,
342      * even though the path will exist in a resolved CLDRFile.
343      */
344     public static final Pattern ROOT_IDENTITY_PATTERN = Pattern
345         .compile("//ldml/identity/language\\[@type=\"root\"\\]");
346 
347     /**
348      * A simple class to hold the specification of a path transformation.
349      */
350     public static class PathTransformSpec {
351         public Pattern pattern;
352         public String replacement;
353 
PathTransformSpec(String patternStr, String replacement)354         PathTransformSpec(String patternStr, String replacement) {
355             pattern = PatternCache.get(patternStr);
356             this.replacement = replacement;
357         }
358     }
359 
360     /**
361      * Some special transformation, like add an additional layer, can be easily
362      * done by transforming the path. Following rules covers these kind of
363      * transformation.
364      * Note: It is important to keep the order for these rules. Whenever a
365      * rule matches, further rule won't be applied.
366      */
367     public static final PathTransformSpec PATH_TRANSFORMATIONS[] = {
368         // Add "standard" as type attribute to exemplarCharacter element if there
369         // is none, and separate them to two layers.
370         new PathTransformSpec(
371             "(.*ldml/exemplarCharacters)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"),
372         new PathTransformSpec("(.*ldml/exemplarCharacters)(.*)$", "$1/standard$2"),
373 
374         // Add cldrVersion attribute
375         new PathTransformSpec("(.*/identity/version\\[@number=\"([^\"]*)\")(\\])", "$1" + "\\]\\[@cldrVersion=\""
376             + CLDRFile.GEN_VERSION + "\"\\]"),
377         // Add cldrVersion attribute to supplemental data
378         new PathTransformSpec("(.*/version\\[@number=\"([^\"]*)\")(\\])\\[@unicodeVersion=\"([^\"]*\")(\\])", "$1" + "\\]\\[@cldrVersion=\""
379             + CLDRFile.GEN_VERSION + "\"\\]" + "\\[@unicodeVersion=\"" + "$4" + "\\]"),
380 
381         // Transform underscore to hyphen-minus in language keys
382         new PathTransformSpec("(.*/language\\[@type=\"[a-z]{2,3})_([^\"]*\"\\](\\[@alt=\"short\"])?)", "$1-$2"),
383 
384         // Separate "ellipsis" from its type as another layer.
385         new PathTransformSpec("(.*/ellipsis)\\[@type=\"([^\"]*)\"\\](.*)$",
386             "$1/$2$3"),
387 
388         // Remove unnecessary dateFormat/pattern
389         new PathTransformSpec(
390             "(.*/calendars)/calendar\\[@type=\"([^\"]*)\"\\](.*)Length\\[@type=\"([^\"]*)\"\\]/(date|time|dateTime)Format\\[@type=\"([^\"]*)\"\\]/pattern\\[@type=\"([^\"]*)\"\\](.*)",
391             "$1/$2/$5Formats/$4$8"),
392 
393         // Separate calendar type
394         new PathTransformSpec("(.*/calendars)/calendar\\[@type=\"([^\"]*)\"\\](.*)$",
395             "$1/$2$3"),
396 
397         // Separate "metazone" from its type as another layer.
398         new PathTransformSpec("(.*/metazone)\\[@type=\"([^\"]*)\"\\]/(.*)$", "$1/$2/$3"),
399 
400         // Split out types into its various fields
401         new PathTransformSpec("(.*)/types/type\\[@key=\"([^\"]*)\"\\]\\[@type=\"([^\"]*)\"\\](.*)$",
402             "$1/types/$2/$3$4"),
403 
404         new PathTransformSpec(
405             "(.*/numbers/(decimal|scientific|percent|currency)Formats\\[@numberSystem=\"([^\"]*)\"\\])/(decimal|scientific|percent|currency)FormatLength/(decimal|scientific|percent|currency)Format\\[@type=\"standard\"]/pattern.*$",
406             "$1/standard"),
407 
408         new PathTransformSpec(
409             "(.*/numbers/currencyFormats\\[@numberSystem=\"([^\"]*)\"\\])/currencyFormatLength/currencyFormat\\[@type=\"accounting\"]/pattern.*$",
410             "$1/accounting"),
411         // Add "type" attribute with value "standard" if there is no "type" in
412         // "decimalFormatLength".
413         new PathTransformSpec(
414             "(.*/numbers/(decimal|scientific|percent)Formats\\[@numberSystem=\"([^\"]*)\"\\]/(decimal|scientific|percent)FormatLength)/(.*)$",
415             "$1[@type=\"standard\"]/$5"),
416 
417         new PathTransformSpec(
418             "(.*/listPattern)/(.*)$", "$1[@type=\"standard\"]/$2"),
419 
420         new PathTransformSpec("(.*/languagePopulation)\\[@type=\"([^\"]*)\"\\](.*)",
421             "$1/$2$3"),
422 
423         new PathTransformSpec("(.*/languageAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"),
424         new PathTransformSpec("(.*/scriptAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"),
425         new PathTransformSpec("(.*/territoryAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"),
426         new PathTransformSpec("(.*/variantAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"),
427         new PathTransformSpec("(.*/zoneAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"),
428         new PathTransformSpec("(.*/alias)(.*)", "$1/alias$2"),
429 
430         new PathTransformSpec("(.*currencyData/region)(.*)", "$1/region$2"),
431 
432         // Skip exemplar city in /etc/GMT or UTC timezones, since they don't have them.
433         new PathTransformSpec("(.*(GMT|UTC).*/exemplarCity)(.*)", ""),
434 
435         new PathTransformSpec("(.*/transforms/transform[^/]*)/(.*)", "$1/tRules/$2"),
436         new PathTransformSpec("(.*)\\[@territories=\"([^\"]*)\"\\](.*)\\[@alt=\"variant\"\\](.*)", "$1\\[@territories=\"$2-alt-variant\"\\]"),
437         new PathTransformSpec("(.*)/weekData/(.*)\\[@alt=\"variant\"\\](.*)", "$1/weekData/$2$3"),
438         new PathTransformSpec("(.*)/unitPreferenceData/unitPreferences\\[@category=\"([^\"]*)\"\\]\\[@usage=\"([^\"]*)\"\\](.*)",
439             "$1/unitPreferenceData/unitPreferences/$2/$3$4"),
440 
441     };
442 }
443