1 package org.unicode.cldr.api;
2 
3 import static com.google.common.base.CharMatcher.anyOf;
4 import static com.google.common.base.CharMatcher.inRange;
5 import static com.google.common.base.Preconditions.checkArgument;
6 import static com.google.common.base.Preconditions.checkElementIndex;
7 import static com.google.common.base.Preconditions.checkNotNull;
8 import static com.google.common.base.Preconditions.checkState;
9 import static java.util.stream.Collectors.toCollection;
10 
11 import java.util.ArrayList;
12 import java.util.Comparator;
13 import java.util.List;
14 import java.util.Objects;
15 import java.util.Optional;
16 import java.util.stream.Collectors;
17 import java.util.stream.IntStream;
18 
19 import org.unicode.cldr.api.AttributeKey.AttributeSupplier;
20 
21 import com.google.common.base.CharMatcher;
22 import com.google.common.collect.ImmutableList;
23 import com.google.common.collect.ImmutableMap;
24 
25 /**
26  * A sequence of CLDR path elements and "distinguishing" attributes.
27  *
28  * <p>In CLDR, a path is composed of a series of elements and associated attributes, where the
29  * attributes can be one of three types; "distinguishing", "value" and "metadata".
30  *
31  * <p>CldrPath instances hold only "distinguishing" attributes, with "value" attributes being held
32  * by the associated {@link CldrValue} instance, and "metadata" attributes being ignored completely
33  * since they are internal to the core CLDR classes. This approach ensures that {@code CldrPath}
34  * instances uniquely identify values and can be used as keys in maps.
35  *
36  * <p>When viewing {@code CldrPath} as strings, it is sometimes necessary to introduce an suffix to
37  * the path element name to indicate the "sort order". This is necessary to represent "ordered"
38  * elements which can appear in the LDML data (though these are rare). This suffix takes the form of
39  * {@code "{element-name}#{sort-index}"} where {@code {sort-index}} is a non-negative index which
40  * corresponds to the value returned from {@link #getSortIndex()} for that path element. The natural
41  * ordering of {@code CldrPath} handles the correctly, but you cannot sort paths properly by just
42  * comparing their string representation.
43  *
44  * <p>CldrPath is an immutable value type with efficient equality semantics.
45  *
46  * <p>See <a href="https://www.unicode.org/reports/tr35/#Definitions">the LDML specification</a>
47  * for more details.
48  */
49 public final class CldrPath implements AttributeSupplier, Comparable<CldrPath> {
50     // This is approximate and DOES NOT promise correctness, it's mainly there to catch unexpected
51     // changes in the data and it can be updated or removed if needed. At the very least element
52     // names must not contain '/', '[', ']', '@', '=', '#' or '"' to permit re-parsing of paths.
53     private static final CharMatcher NAME_CHARACTERS =
54         inRange('a', 'z').or(inRange('A', 'Z')).or(inRange('0', '9')).or(anyOf(":-_"));
55 
56     /**
57      * Parses a distinguishing CLDR path string, including "distinguishing" and private "metadata"
58      * attributes into a normalized {@link CldrPath} instance. Attributes will be parsed and handled
59      * according to their type:
60      * <ul>
61      * <li>Distinguishing attributes will be added to the returned CldrPath instance.
62      * <li>Non-public metadata attributes will be ignored.
63      * <li>Value attributes are not permitted in distinguishing paths and will cause an error.
64      * </ul>
65      *
66      * <p>The path string must be structured correctly (e.g. "//ldml/foo[@bar="baz]") and must
67      * represent a known DTD type, based on the first path element (e.g. "ldml" or
68      * "supplementalData").
69      *
70      * @param path the distinguishing path string, containing only distinguishing attributes.
71      * @return the parsed distinguishing path instance.
72      * @throws IllegalArgumentException if the path is not well formed (e.g. contains unexpected
73      *      value or metadata attributes).
74      */
parseDistinguishingPath(String path)75     public static CldrPath parseDistinguishingPath(String path) {
76         return CldrPaths.processXPath(path, ImmutableList.of(), (k, v) -> {
77             throw new IllegalArgumentException(String.format(
78                 "unexpected value attribute '%s' in distinguishing path: %s", k, path));
79         });
80     }
81 
82     /**
83      * Returns the number of common path elements from the root. This is useful when determining
84      * the last common ancestor during visitation. A {@code null} path is treated as having zero
85      * length (and will always result in zero being returned).
86      *
87      * <p>Note: This is only currently use by PrefixVisitorHost, but could be made public if needed.
88      * It's only here (rather than in PrefixVisitorHost) because it depends on private methods of
89      * this class.
90      */
getCommonPrefixLength( CldrPath a, CldrPath b)91     static int getCommonPrefixLength(/* @Nullable */ CldrPath a, /* @Nullable */ CldrPath b) {
92         // Null paths are sentinels with zero length.
93         if (a == null || b == null) {
94             return 0;
95         }
96 
97         // Trim whichever path is longer until both are same length.
98         int minLength = Math.min(a.getLength(), b.getLength());
99         while (a.getLength() > minLength)
100             a = a.getParent();
101         while (b.getLength() > minLength)
102             b = b.getParent();
103 
104         // Work up the paths, resetting the common length every time the elements differ.
105         int commonLength = minLength;
106         for (int length = minLength; length > 0; length--) {
107             if (!a.localEquals(b)) {
108                 // Elements differ, so shortest possible common length is that of our parent path.
109                 commonLength = length - 1;
110             }
111             // Parents will both be null on the last iteration, but never used.
112             a = a.getParent();
113             b = b.getParent();
114         }
115         return commonLength;
116     }
117 
118     // If the parent is null then this path element is the top level LDML descriptor.
119     /* @Nullable */ private final CldrPath parent;
120     // The number of elements (including this one) in this path.
121     private final int length;
122     private final String elementName;
123     // The attribute keys and values in an alternating list.
124     private final ImmutableList<String> attributeKeyValuePairs;
125     // Inherits the top-most draft status in a path (which matches what CLDRFile appears to do.
126     private final Optional<CldrDraftStatus> draftStatus;
127     // The sort index for "ORDERED" path elements or (more commonly) -1 for non-ORDERED elements.
128     private final int sortIndex;
129     // The DTD type of the path (which is the same for all path elements).
130     private final CldrDataType dtdType;
131     // The proper DTD ordering for this path (based on the DTD type).
132     private final Comparator<CldrPath> ordering;
133     // Cached since ImmutableList recalculates its hash codes every time.
134     private final int hashCode;
135     // Cached on demand (it's useful during equality checking as well as toString() itself).
136     // However for elements with attributes, it's at least as large as the name and all attribute
137     // keys/values combined, so will typically double the in-memory size of the instance. We don't
138     // care about making assignment atomic however, since all values would be equal anyway.
139     private String localToString = null;
140 
CldrPath(CldrPath parent, String name, List<String> attributeKeyValuePairs, CldrDataType dtdType, CldrDraftStatus localDraftStatus, int sortIndex)141     CldrPath(CldrPath parent,
142         String name,
143         List<String> attributeKeyValuePairs,
144         CldrDataType dtdType,
145         /* @Nullable */ CldrDraftStatus localDraftStatus, int sortIndex) {
146         checkState(parent != null || dtdType.getLdmlName().equals(name),
147             "unexpected root element: expected %s, but got %s", dtdType.getLdmlName(), name);
148         this.parent = parent;
149         this.length = (parent != null ? parent.getLength() : 0) + 1;
150         this.elementName = checkValidName(name, "element");
151         this.attributeKeyValuePairs = checkKeyValuePairs(attributeKeyValuePairs);
152         this.draftStatus = resolveDraftStatus(parent, localDraftStatus);
153         // Ordered elements have a sort index of 0 or more, and un-ordered have an index of -1.
154         if (CldrPaths.isOrdered(dtdType, elementName)) {
155             checkArgument(sortIndex >= 0,
156                 "missing or invalid sort index '%s' for element: %s", sortIndex, elementName);
157         } else {
158             checkArgument(sortIndex == -1,
159                 "unexpected sort index '%s' for element: %s", sortIndex, elementName);
160         }
161         this.sortIndex = sortIndex;
162         this.dtdType = checkNotNull(dtdType);
163         this.ordering = CldrPaths.getPathComparator(dtdType);
164         this.hashCode = Objects.hash(length, name, this.attributeKeyValuePairs, parent);
165     }
166 
checkValidName(String value, String description)167     private static String checkValidName(String value, String description) {
168         checkArgument(!value.isEmpty(), "%s name cannot be empty", description);
169         checkArgument(NAME_CHARACTERS.matchesAllOf(value),
170             "invalid character in %s name: %s", description, value);
171         return value;
172     }
173 
checkKeyValuePairs(List<String> keyValuePairs)174     private static ImmutableList<String> checkKeyValuePairs(List<String> keyValuePairs) {
175         // Ensure attribute values never have double-quote in them (since current we don't escape
176         // value when putting into toString().
177         checkArgument((keyValuePairs.size() & 1) == 0,
178             "key/value pairs must have an even number of elements: %s", keyValuePairs);
179         for (int n = 0; n < keyValuePairs.size(); n += 2) {
180             checkValidName(keyValuePairs.get(n), "attribute");
181             String v = keyValuePairs.get(n + 1);
182             checkArgument(!v.contains("\""), "unsupported '\"' in attribute value: %s", v);
183         }
184         return ImmutableList.copyOf(keyValuePairs);
185     }
186 
187     /**
188      * Helper method used to test for equivalent path element content. Used to help avoid
189      * unnecessary allocations during processing (see {@link CldrFileDataSource}).
190      */
matchesContent( String name, int sortIndex, List<String> keyValuePairs, CldrDraftStatus draftStatus)191     boolean matchesContent(
192         String name,
193         int sortIndex,
194         List<String> keyValuePairs, /* Nullable */ CldrDraftStatus draftStatus) {
195         return this.elementName.equals(name)
196             && this.sortIndex == sortIndex
197             && this.attributeKeyValuePairs.equals(keyValuePairs)
198             && this.draftStatus.equals(resolveDraftStatus(this.parent, draftStatus));
199     }
200 
201     // Helper to resolve the current draft status of a path based on any local draft status
202     // attributes and any existing parent status.
203     //
204     // Note that this behaviour currently matches CLDRFile behaviour as of May 2019. CLDRFile
205     // does a regex match over the full path and finds the first draft status attribute that's
206     // present, ignoring any later declarations (even if they are more restrictive). It seems
207     // likely that the implicit expectation in CLDRFile is that there's only ever one draft
208     // status attributes in any given path (see the pop() method in MyDeclHandler).
resolveDraftStatus( CldrPath parent, CldrDraftStatus localStatus)209     private static Optional<CldrDraftStatus> resolveDraftStatus(
210         /* @Nullable */ CldrPath parent, /* @Nullable */ CldrDraftStatus localStatus) {
211         if (parent != null && parent.draftStatus.isPresent()) {
212             return parent.draftStatus;
213         }
214         return localStatus != null ? localStatus.asOptional() : Optional.empty();
215     }
216 
217     /**
218      * Returns the parent path element.
219      *
220      * @return the parent path element (or {@code null} if this was the root element).
221      */
222     /* @Nullable */
getParent()223     public CldrPath getParent() {
224         return parent;
225     }
226 
227     /**
228      * Returns the non-zero length of this path (i.e. the number of elements it is made up of).
229      *
230      * @return the number of elements in this path.
231      */
getLength()232     public int getLength() {
233         return length;
234     }
235 
236     /**
237      * Returns the name of this path element. This is the qualified XML name as it appears in the
238      * CLDR XML files, including namespace (e.g. "icu:transforms") though most attributes are not
239      * part of an explicit namespace.
240      *
241      * @return the ASCII-only name of the "lowest" element in this path.
242      */
243     // TODO: This method name is weak - perhaps getElementName() ?
getName()244     public String getName() {
245         return elementName;
246     }
247 
248     /**
249      * Returns the sort index of an "ordered" path element, or {@code -1} for non-ordered elements.
250      *
251      * <p>The sort index is used to disambiguate and sort otherwise identical distinguishing paths.
252      * The feature allows a series of values to be processed reliably in an ordered sequence. It is
253      * recommended that if your data includes "ordered" elements, they are always processed in
254      * {@link org.unicode.cldr.api.CldrData.PathOrder#DTD DTD} order.
255      *
256      * @return the element's sort index (which takes priority for element ordering over any
257      *     attribute values).
258      */
getSortIndex()259     public int getSortIndex() {
260         return sortIndex;
261     }
262 
263     /**
264      * Returns the raw value of an attribute associated with this CLDR path, or null if not present.
265      * For almost all use cases it is preferable to use the accessor methods on the {@link
266      * AttributeKey} class, which provide additional useful semantic checking and common type
267      * conversion. You should only use this method directly if there's a strong performance
268      * requirement.
269      *
270      * @param key the key identifying an attribute.
271      * @return the attribute value or {@code null} if not present.
272      * @see AttributeKey
273      */
274     @Override
get(AttributeKey key)275     /* @Nullable */ public String get(AttributeKey key) {
276         checkArgument(!getDataType().isValueAttribute(key),
277             "cannot get 'value attribute' values from a distinguishing path: %s", key);
278         String v = null;
279         for (CldrPath p = this; v == null && p != null; p = p.getParent()) {
280             if (p.getName().equals(key.getElementName())) {
281                 v = p.getLocalAttributeValue(key.getAttributeName());
282             }
283         }
284         return v;
285     }
286 
287     /**
288      * Returns the data type for this path, as defined by the top most element.
289      *
290      * @return the path's data type.
291      */
292     @Override
getDataType()293     public CldrDataType getDataType() {
294         return dtdType;
295     }
296 
297     /**
298      * Returns whether the given element name is in this path.
299      *
300      * @param elementName the element name to check.
301      * @return true if the name of this path element or any of its parents is equal to the given
302      *     element name.
303      */
containsElement(String elementName)304     boolean containsElement(String elementName) {
305         return getName().equals(elementName)
306             || (parent != null && parent.containsElement(elementName));
307     }
308 
309     /**
310      * Returns the number of distinguishing attributes for this path element.
311      *
312      * @return the number of distinguishing attributes for the "lowest" element in this path.
313      */
getAttributeCount()314     int getAttributeCount() {
315         return attributeKeyValuePairs.size() / 2;
316     }
317 
318     /**
319      * Returns the (non empty) name of the Nth distinguishing attribute in the leaf path element.
320      *
321      * @param n the index of a distinguishing attribute for the "lowest" element in this path.
322      * @return the name of the Nth attribute on this path element.
323      */
getLocalAttributeName(int n)324     String getLocalAttributeName(int n) {
325         checkElementIndex(n, getAttributeCount());
326         return attributeKeyValuePairs.get(2 * n);
327     }
328 
329     /**
330      * Returns the value of the Nth distinguishing attribute in the leaf path element.
331      *
332      * @param n the index of a distinguishing attribute for the "lowest" element in this path.
333      * @return the value of the Nth attribute on this path element.
334      */
getLocalAttributeValue(int n)335     String getLocalAttributeValue(int n) {
336         checkElementIndex(n, getAttributeCount());
337         return attributeKeyValuePairs.get((2 * n) + 1);
338     }
339 
340     // Helper to get an attribute by name from this path element.
getLocalAttributeValue(String attributeName)341     private String getLocalAttributeValue(String attributeName) {
342         checkNotNull(attributeName);
343         for (int i = 0; i < attributeKeyValuePairs.size(); i += 2) {
344             if (attributeName.equals(attributeKeyValuePairs.get(i))) {
345                 return attributeKeyValuePairs.get(i + 1);
346             }
347         }
348         return null;
349     }
350 
351     /**
352      * Returns the draft status for this path, which is inherited from parent path elements. Note
353      * that where multiple (possibly conflicting) draft statuses are defined in a path, the "top
354      * most" (i.e. closest to the root element) value is used.
355      *
356      * @return the potentially inherited draft status.
357      */
getDraftStatus()358     Optional<CldrDraftStatus> getDraftStatus() {
359         return draftStatus;
360     }
361 
362     /**
363      * Returns a combined full path string in the XPath style {@code //foo/bar[@x="y"]/baz},
364      * with value attributes inserted in correct DTD order for each path element.
365      *
366      * <p>Note that while in most cases the values attributes simply follow the path attributes on
367      * each element, this is not necessarily always true, and DTD ordering can place value
368      * attributes before path attributes in an element.
369      *
370      * @param value a value to be associated with this path (from which value attributes will be
371      *              obtained).
372      * @return the full XPath representation containing both distinguishing and value attributes.
373      */
getFullPath(CldrValue value)374     String getFullPath(CldrValue value) {
375         checkNotNull(value);
376         return appendToString(new StringBuilder(), value.getValueAttributes()).toString();
377     }
378 
379     /** Compares two paths in DTD order. */
380     @Override
compareTo(CldrPath other)381     public int compareTo(CldrPath other) {
382         if (dtdType == other.dtdType) {
383             return ordering.compare(this, other);
384         }
385         return dtdType.compareTo(other.dtdType);
386     }
387 
388     /** {@inheritDoc} */
389     @Override
equals(Object obj)390     public boolean equals(Object obj) {
391         if (obj == this) {
392             return true;
393         }
394         if (!(obj instanceof CldrPath)) {
395             return false;
396         }
397         CldrPath other = (CldrPath) obj;
398         // Check type and length first (checking length catches most non-equal paths).
399         if (!getDataType().equals(other.getDataType()) || length != other.length) {
400             return false;
401         }
402         // Do (n - 1) comparisons since we already know the root element is the same (the root
403         // element never has value attributes on it and is uniquely defined by the DTD type).
404         // Working up from the "leaf" of the path works well because different paths will almost
405         // always be different in the leaf element (i.e. we will fail almost immediately).
406         CldrPath pthis = this;
407         for (int n = length - 1; n > 0; n--) {
408             if (!pthis.localEquals(other)) {
409                 return false;
410             }
411             pthis = pthis.getParent();
412             other = other.getParent();
413         }
414         return true;
415     }
416 
417     /** {@inheritDoc} */
418     @Override
hashCode()419     public int hashCode() {
420         return hashCode;
421     }
422 
423     /** @return the distinguishing path string in the XPath style {@code //foo/bar[@x="y"]/baz}. */
424     @Override
toString()425     public String toString() {
426         return appendToString(new StringBuilder(), ImmutableMap.of()).toString();
427     }
428 
localEquals(CldrPath other)429     private boolean localEquals(CldrPath other) {
430         // In _theory_ only length and localToString need to be checked (since the localToString is
431         // an unambiguous representation of the data, but it seems a bit hacky to rely on that.
432         return this.elementName.equals(other.elementName)
433             && this.sortIndex == other.sortIndex
434             && this.attributeKeyValuePairs.equals(other.attributeKeyValuePairs);
435     }
436 
437     // XPath like toString() representation of a path element (e.g. foo[@bar="x"][@baz="y"]).
438     // When a sort index is present, it's appended to the element name (e.g. "foo#42[@bar="x"]").
getLocalToString()439     private String getLocalToString() {
440         if (localToString == null) {
441             String str = getName();
442             if (sortIndex != -1) {
443                 // This is very rare so it's almost certainly better to not make a StringBuilder
444                 // above just for this possibility.
445                 str += "#" + sortIndex;
446             }
447             if (getAttributeCount() > 0) {
448                 str = IntStream.range(0, getAttributeCount())
449                     .mapToObj(n ->
450                         String.format("[@%s=\"%s\"]", getLocalAttributeName(n), getLocalAttributeValue(n)))
451                     .collect(Collectors.joining("", str, ""));
452             }
453             // Overwrite only once the local string is completed (this is idempotent so we don't
454             // have to care about locking etc.).
455             localToString = str;
456         }
457         return localToString;
458     }
459 
460     // Recursive helper for toString().
appendToString( StringBuilder out, ImmutableMap<AttributeKey, String> valueAttributes)461     private StringBuilder appendToString(
462         StringBuilder out, ImmutableMap<AttributeKey, String> valueAttributes) {
463         CldrPath parent = getParent();
464         if (parent != null) {
465             parent.appendToString(out, valueAttributes).append('/');
466         } else {
467             out.append("//");
468         }
469         if (valueAttributes.isEmpty()) {
470             return out.append(getLocalToString());
471         }
472         List<String> attributeNames =
473             valueAttributes.keySet().stream()
474                 .filter(k -> k.getElementName().equals(getName()))
475                 .map(AttributeKey::getAttributeName)
476                 .collect(toCollection(ArrayList::new));
477         if (attributeNames.isEmpty()) {
478             // No value attributes for _this_ element so can use just the local toString().
479             return out.append(getLocalToString());
480         }
481         if (getAttributeCount() > 0) {
482             String lastPathAttributeName = getLocalAttributeName(getAttributeCount() - 1);
483             if (dtdType.getAttributeComparator()
484                 .compare(lastPathAttributeName, attributeNames.get(0)) > 0) {
485                 // Oops, order is not as expected, so must reorder all attributes.
486                 appendResortedValueAttributesTo(out, attributeNames, valueAttributes);
487                 return out;
488             }
489         }
490         // Value attributes all come after path attributes.
491         return appendValueAttributesTo(out.append(getLocalToString()), attributeNames, valueAttributes);
492     }
493 
appendResortedValueAttributesTo( StringBuilder out, List<String> attributeNames, ImmutableMap<AttributeKey, String> valueAttributes)494     private void appendResortedValueAttributesTo(
495         StringBuilder out,
496         List<String> attributeNames,
497         ImmutableMap<AttributeKey, String> valueAttributes) {
498         out.append(elementName);
499         for (int n = 0; n < attributeKeyValuePairs.size(); n += 2) {
500             attributeNames.add(attributeKeyValuePairs.get(n));
501         }
502         attributeNames.sort(dtdType.getAttributeComparator());
503         for (String attrName : attributeNames) {
504             String value = getLocalAttributeValue(attrName);
505             if (value == null) {
506                 value = valueAttributes.get(AttributeKey.keyOf(elementName, attrName));
507                 checkState(value != null, "missing value %s:%s", elementName, attrName);
508             }
509             out.append(String.format("[@%s=\"%s\"]", attrName, value));
510         }
511     }
512 
appendValueAttributesTo( StringBuilder out, List<String> attributeNames, ImmutableMap<AttributeKey, String> valueAttributes)513     private StringBuilder appendValueAttributesTo(
514         StringBuilder out,
515         List<String> attributeNames,
516         ImmutableMap<AttributeKey, String> valueAttributes) {
517         for (String attrName : attributeNames) {
518             String value = valueAttributes.get(AttributeKey.keyOf(elementName, attrName));
519             checkState(value != null, "missing value %s:%s", elementName, attrName);
520             out.append(String.format("[@%s=\"%s\"]", attrName, value));
521         }
522         return out;
523     }
524 }
525