1 package org.unicode.cldr.api;
2 
3 import static com.google.common.base.Preconditions.checkArgument;
4 import static com.google.common.base.Preconditions.checkState;
5 import static java.lang.Integer.signum;
6 import static org.unicode.cldr.api.CldrDataType.LDML;
7 
8 import java.util.ArrayList;
9 import java.util.Comparator;
10 import java.util.List;
11 import java.util.Map;
12 import java.util.Map.Entry;
13 import java.util.function.BiConsumer;
14 import java.util.function.Consumer;
15 import java.util.function.Predicate;
16 import java.util.stream.Stream;
17 
18 import org.unicode.cldr.util.DtdData;
19 import org.unicode.cldr.util.XPathParts;
20 
21 import com.google.common.collect.ImmutableMap;
22 import com.google.common.collect.ImmutableSetMultimap;
23 
24 /**
25  * Utilities related to CLDR paths. It's possible that one day we might wish to expose the path
26  * comparator from this class, but if so we should key it off something other than {@code DtdType}.
27  */
28 final class CldrPaths {
29     // Constants to make comparator logic a bit more readable (i.e. LHS < RHS ==> return -1).
30     private static final int LHS_FIRST = -1;
31     private static final int RHS_FIRST = 1;
32 
33     // A synthetic attribute used by CLDR code to apply an explicit ordering to otherwise identical
34     // paths (this only happens for "ORDERED" elements). We handle this differently and have the
35     // sort index as an explicit field in CldrPath rather than treating it as an attribute.
36     private static final String HIDDEN_SORT_INDEX_ATTRIBUTE = "_q";
37 
38     // When calculating things about elements we need to ignore deprecated ones, especially when
39     // looking for "leaf" elements, since CLDR data used to allow mixed content and the DTD still
40     // has elements with both data and (deprecated) child elements present.
41     private static final Predicate<DtdData.Element> IS_NOT_DEPRECATED = e -> !e.isDeprecated();
42 
43     // A map of the leaf element names for each supported DTD.
44     private static final ImmutableSetMultimap<CldrDataType, String> LEAF_ELEMENTS_MAP;
45     // A map of the ordered element names for each supported DTD.
46     private static final ImmutableSetMultimap<CldrDataType, String> ORDERED_ELEMENTS_MAP;
47     static {
48         ImmutableSetMultimap.Builder<CldrDataType, String> leafElementsMap =
49             ImmutableSetMultimap.builder();
50         ImmutableSetMultimap.Builder<CldrDataType, String> orderedElementsMap =
51             ImmutableSetMultimap.builder();
52         for (CldrDataType type : CldrDataType.values()) {
53             // While at happened to be true (at the time of writing) that the getElements() method
54             // returns a new, mutable set, this is completely undocumented so we cannot rely on it
55             // (otherwise we could just do "removeIf(...)").
56             //
57             // Note that while the "specials" element has no children in the DTD, it is not
58             // considered a "leaf" element as it is expected to contain any additional elements
59             // from unknown namespaces (e.g. "icu:").
60             //
61             // We know that Guava's collection classes have a policy of never iterating over a
62             // collection more than once, so it's safe to use the ::iterator trick to convert a
63             // stream into a one-shot iterable (saves have to make temporary collections).
leafElementsMap.putAll( type, type.getElements() .filter(IS_NOT_DEPRECATED) .filter(e -> e.getChildren().keySet().stream().noneMatch(IS_NOT_DEPRECATED)) .map(DtdData.Element::getName) .filter(e -> !e.equals("special")) ::iterator)64             leafElementsMap.putAll(
65                 type,
66                 type.getElements()
67                     .filter(IS_NOT_DEPRECATED)
68                     // NOTE: Some leaf elements still have deprecated children (from when mixed
69                     // data was permitted).
70                     .filter(e -> e.getChildren().keySet().stream().noneMatch(IS_NOT_DEPRECATED))
71                     .map(DtdData.Element::getName)
72                     .filter(e -> !e.equals("special"))
73                     ::iterator);
orderedElementsMap.putAll( type, type.getElements() .filter(IS_NOT_DEPRECATED) .filter(DtdData.Element::isOrdered) .map(DtdData.Element::getName) ::iterator)74             orderedElementsMap.putAll(
75                 type,
76                 type.getElements()
77                     .filter(IS_NOT_DEPRECATED)
78                     .filter(DtdData.Element::isOrdered)
79                     .map(DtdData.Element::getName)
80                     ::iterator);
81         }
82         // Special case "alias" is an alternate leaf element for a lot of LDML elements.
leafElementsMap.put(LDML, "alias")83         leafElementsMap.put(LDML, "alias");
84         LEAF_ELEMENTS_MAP = leafElementsMap.build();
85         ORDERED_ELEMENTS_MAP = orderedElementsMap.build();
86     }
87 
88     // A map of path comparators for each supported DTD.
89     private static final ImmutableMap<CldrDataType, DtdPathComparator> PATH_COMPARATOR_MAP;
90 
91     // This static block must come after the attribute map is created since it uses it.
92     static {
93         ImmutableMap.Builder<CldrDataType, DtdPathComparator> pathMap = ImmutableMap.builder();
94         for (CldrDataType type : CldrDataType.values()) {
pathMap.put(type, new DtdPathComparator(type))95             pathMap.put(type, new DtdPathComparator(type));
96         }
97         PATH_COMPARATOR_MAP = pathMap.build();
98     }
99 
100     /**
101      * Returns a comparator for {@link CldrPath}s which is compatible with the canonical path
102      * ordering for the given DTD instance (e.g. {@link DtdData#getDtdComparator}()).
103      */
104     // TODO: Add tests to ensure it continues to agree to DTD ordering.
getPathComparator(CldrDataType type)105     static Comparator<CldrPath> getPathComparator(CldrDataType type) {
106         return PATH_COMPARATOR_MAP.get(type);
107     }
108 
109     private static final class DtdPathComparator implements Comparator<CldrPath> {
110         private final Comparator<String> elementNameComparator;
111         private final Comparator<String> attributeNameComparator;
112 
DtdPathComparator(CldrDataType dataType)113         private DtdPathComparator(CldrDataType dataType) {
114             this.elementNameComparator = dataType.getElementComparator();
115             this.attributeNameComparator = dataType.getAttributeComparator();
116         }
117 
118         // This code should only return "signum" values for ordering (i.e. {-1, 0, 1}).
119         @Override
compare(CldrPath lhs, CldrPath rhs)120         public int compare(CldrPath lhs, CldrPath rhs) {
121             int length = lhs.getLength();
122             if (length == rhs.getLength()) {
123                 if (length == 1) {
124                     // Root nodes are special as they define the DTD type and must always be equal.
125                     // Paths with different types can be compared, but not via this comparator.
126                     checkState(lhs.getName().equals(rhs.getName()),
127                         "cannot compare paths with different DTD type: %s / %s", lhs, rhs);
128                     return 0;
129                 }
130                 // Compare parent paths first and return if they give an ordering.
131                 int signum = compare(lhs.getParent(), rhs.getParent());
132                 return (signum != 0) ? signum : compareCurrentElement(lhs, rhs);
133             } else if (length < rhs.getLength()) {
134                 // Recursively shorten the RHS path until it's the same length.
135                 int signum = compare(lhs, rhs.getParent());
136                 // If the LHS is equal to the RHS parent, then the (shorter) LHS path comes first.
137                 // Note this this can only happen if we are comparing non-leaf node paths.
138                 return signum != 0 ? signum : LHS_FIRST;
139             } else {
140                 // Flip the comparison if LHS was longer (we do this at most once per comparison).
141                 return -compare(rhs,  lhs);
142             }
143         }
144 
compareCurrentElement(CldrPath lhs, CldrPath rhs)145         private int compareCurrentElement(CldrPath lhs, CldrPath rhs) {
146             String elementName = lhs.getName();
147             int signum = signum(elementNameComparator.compare(elementName, rhs.getName()));
148             if (signum != 0) {
149                 return signum;
150             }
151             // Primary order within a path element is defined by the element index (this is a
152             // bit of a special value used only for "ORDERED" elements in the DTD. This always
153             // trumps any attribute ordering but is almost always -1.
154             signum = Integer.compare(lhs.getSortIndex(), rhs.getSortIndex());
155             if (signum != 0) {
156                 return signum;
157             }
158             // Element name is the same, so test attributes. Attributes are already known to be
159             // ordered by the element's DTD order, so we only need to find and compare the first
160             // difference.
161             int minAttributeCount =
162                 Math.min(lhs.getAttributeCount(), rhs.getAttributeCount());
163             for (int n = 0; n < minAttributeCount && signum == 0; n++) {
164                 String attributeName = lhs.getLocalAttributeName(n);
165                 // Important: We negate the comparison result here because we want elements with
166                 // "missing" attributes to sort earlier.
167                 //
168                 // E.g. for two elements LHS="foo[a=x][b=y]" and RHS="foo[b=y]" we want to say
169                 // that "RHS < LHS" because RHS is "missing" the attribute "[a=x]". But when we
170                 // compare the first attributes we find "a" (LHS) < "b" (RHS), which is the
171                 // opposite of what we want. This is because while LHS has a lower ordered
172                 // attribute, that indicates that RHS is missing that attribute in the same
173                 // position, which should make RHS sort first.
174                 signum = -signum(
175                     attributeNameComparator.compare(attributeName, rhs.getLocalAttributeName(n)));
176                 if (signum == 0) {
177                     // Attribute names equal, so test attribute value.
178                     signum = signum(
179                         DtdData.getAttributeValueComparator(elementName, attributeName)
180                             .compare(lhs.getLocalAttributeValue(n), rhs.getLocalAttributeValue(n)));
181                 }
182             }
183             if (signum == 0) {
184                 // Attributes match up to the minimum length, but one element might have more.
185                 // Elements with more attributes sort _after_ those without.
186                 if (lhs.getAttributeCount() > minAttributeCount) {
187                     signum = RHS_FIRST;
188                 } else if (rhs.getAttributeCount() > minAttributeCount) {
189                     signum = LHS_FIRST;
190                 }
191             }
192             return signum;
193         }
194     }
195 
196     /** Returns whether this path is a full path ending in a "leaf" element. */
isLeafPath(CldrPath path)197     static boolean isLeafPath(CldrPath path) {
198         String lastElementName = path.getName();
199         return lastElementName.indexOf(':') != -1
200             || LEAF_ELEMENTS_MAP.get(path.getDataType()).contains(lastElementName);
201     }
202 
203     /**
204      * Returns whether an element is "ORDERED" in the specified DTD (and should have an explicit
205      * sort index).
206      */
isOrdered(CldrDataType dtdType, String elementName)207     static boolean isOrdered(CldrDataType dtdType, String elementName) {
208         // Elements with namespaces unknown the the DTD are never ordered.
209         if (elementName.indexOf(':') != -1) {
210             return false;
211         }
212         // Returns empty set if DTD unknown, but it might also be the case that a valid DTD has no
213         // ordered elements, so we can't reasonable check for anything here.
214         return ORDERED_ELEMENTS_MAP.get(dtdType).contains(elementName);
215     }
216 
217     // This can't go further up due to static initialization ordering issues.
218     // TODO: Move all reading of DTDs and setup for paths into a lazy holder.
219     private static final CldrPath LDML_VERSION =
220         CldrPath.parseDistinguishingPath("//ldml/identity/version");
221 
222     /**
223      * Returns whether this path should be emitted by a data supplier. New cases can be added
224      * as needed.
225      */
shouldEmit(CldrPath path)226     static boolean shouldEmit(CldrPath path) {
227         // CLDRFile seems to do some interesting things with the version based on the DTD in
228         // which we see:
229         //
230         // <!ATTLIST version number CDATA #REQUIRED >
231         //    <!--@MATCH:regex/\$Revision.*\$-->
232         //    <!--@METADATA-->
233         //
234         // <!ATTLIST version cldrVersion CDATA #FIXED "36" >
235         //    <!--@MATCH:any-->
236         //    <!--@VALUE-->
237         //
238         // This results in conflict between values obtained via CLDRFile and those obtained
239         // directly by parsing an LDML XML file (which happens for "special" XML files in ICU).
240         //
241         // So for now I've decided to just ignore the version (since it's hardly used in the
242         // ICU converter code and always available via CldrDataSupplier#getCldrVersionString().
243 
244         // Hacky way to detect the version declaration in LDML files (which must always be
245         // skipped since they are duplicate paths and reveal XML file boundaries). The path is
246         // always "//ldmlXxxx/version" for some DTD type.
247         if (path.getLength() == 2 && path.getName().equals("version")) {
248             return false;
249         }
250 
251         // Note that there is a need for some kind of versioning for some bits of data (since
252         // changes to things like collation can invalidate database search indexes) but this should
253         // be handled directly in the logic which builds the ICU data and isn't strictly the same
254         // as the LDML version anyway.
255         //
256         // TODO: Remove this code if and when LDML version strings are removed.
257         return !path.equals(LDML_VERSION);
258     }
259 
260     /**
261      * Processes a full path to extract a distinguishing CldrPath and handle any value attributes
262      * present. This is designed for iterating over successive paths from CLDRFile, but can be used
263      * to create paths in isolation if necessary.
264      *
265      * @param fullPath a parsed full path.
266      * @param previousElements an optional list of previous CldrPath elements which will be used
267      *     as the prefix to the new path wherever possible (e.g. if previousElements="(a,b,c,d)"
268      *     and "fullPath=a/b/x/y/z", then the new path will share the path prefix up to and
269      *     including 'b'). When processing sorted paths, this will greatly reduce allocations.
270      * @param valueAttributeCollector a collector into which value attributes will be added (in
271      *     DTD order).
272      * @return a new CldrPath corresponding to the distinguishing path of {@code fullPath}.
273      * @throws IllegalArgumentException if the path string is invalid.
274      */
processXPath( String fullPath, List<CldrPath> previousElements, BiConsumer<AttributeKey, String> valueAttributeCollector)275     static CldrPath processXPath(
276         String fullPath,
277         List<CldrPath> previousElements,
278         BiConsumer<AttributeKey, String> valueAttributeCollector) {
279         checkArgument(!fullPath.isEmpty(), "path must not be empty");
280         // This fails if attribute names are invalid, but not if element names are invalid, and we
281         // want to control/stabalize the error messages in this API.
282         XPathParts pathParts;
283         try {
284             pathParts = XPathParts.getFrozenInstance(fullPath);
285         } catch (IllegalArgumentException e) {
286             throw new IllegalArgumentException("invalid path: " + fullPath);
287         }
288         int length = pathParts.size();
289         checkArgument(length > 0, "cldr path must not be empty: %s", pathParts);
290         DtdData dtd = pathParts.getDtdData();
291         checkArgument(dtd != null, "unknown DTD type: %s", pathParts);
292         CldrDataType dtdType = CldrDataType.forRawType(dtd.dtdType);
293 
294         // The path we're returning, constructed from top-to-bottom.
295         CldrPath path = null;
296         // Reusable key/value attributes list.
297         List<String> keyValuePairs = new ArrayList<>();
298         Consumer<Entry<String, String>> collectElementAttribute = e -> {
299             keyValuePairs.add(e.getKey());
300             keyValuePairs.add(e.getValue());
301         };
302         // True once we've started to create a new path rather than reusing previous elements.
303         boolean diverged = false;
304         for (int n = 0; n < length; n++) {
305             String elementName = pathParts.getElement(n);
306 
307             // If this path is from CldrPath.toString() then the sort index is encoded in the
308             // element name suffix (e.g. foo#42[@bar="x"]), otherwise it's in the synthetic "_q"
309             // attribute. Most often there is no sort index however, so this code should optimize
310             // to the null case.
311             int sortIndex = -1;
312             Map<String, String> attributes = pathParts.getAttributes(n);
313             int nameEnd = elementName.indexOf('#');
314             if (nameEnd != -1) {
315                 sortIndex = Integer.parseUnsignedInt(elementName.substring(nameEnd + 1));
316                 elementName = elementName.substring(0, nameEnd);
317             } else {
318                 String sortIndexStr = attributes.get(HIDDEN_SORT_INDEX_ATTRIBUTE);
319                 if (sortIndexStr != null) {
320                     sortIndex = Integer.parseUnsignedInt(sortIndexStr);
321                 }
322             }
323 
324             // Note that element names need not be known to the DTD. If an element's name is
325             // prefixed with an unknown namespace (e.g. "icu:") then it is always permitted.
326             // Similarly, we never filter out attributes in unknown namespaces. For now we assume
327             // that any explicit namespace is unknown.
328             boolean hasNamespace = elementName.indexOf(':') != -1;
329             checkArgument(hasNamespace || dtd.getElementFromName().containsKey(elementName),
330                 "invalid path: %s", fullPath);
331 
332             // The keyValuePairs list is used by the collectElementAttribute callback but we don't
333             // want/need to make a new callback each time, so just clear the underlying list.
334             keyValuePairs.clear();
335             processPathAttributes(
336                 elementName, attributes, dtdType, collectElementAttribute, valueAttributeCollector);
337 
338             // WARNING: We cannot just get the draft attribute value from the attributes map (as
339             // would be expected) because it throws an exception. This is because you can only
340             // "get()" values from the attributes map if they are potentially valid attributes for
341             // that element. Unfortunately this is due to a deliberate choice in the implementation
342             // of MapComparator, which explicitly throws an exception if an unknown attribute is
343             // encountered. Thus we have to check that "draft" is a possible attribute before
344             // asking for its value.
345             //
346             // TODO(dbeaumont): Fix this properly, ideally by fixing the comparator to not throw.
347             CldrDraftStatus draftStatus = dtd.getAttribute(elementName, "draft") != null
348                 ? CldrDraftStatus.forString(attributes.get("draft")) : null;
349 
350             if (!diverged && n < previousElements.size()) {
351                 CldrPath p = previousElements.get(n);
352                 if (p.matchesContent(elementName, sortIndex, keyValuePairs, draftStatus)) {
353                     // The previous path we processed was the same at least down to this depth, so
354                     // we can just reuse that element instead of creating a new one.
355                     //
356                     // In tests over the resolved paths for "en_GB" in DTD order, there are ~128k
357                     // path elements processed, with ~103k being reused here and ~25k being newly
358                     // allocated (a > 80% reuse rate). However this works best when the incoming
359                     // paths are sorted, since otherwise the "previous" path is random.
360                     //
361                     // For unsorted paths, the reuse rate is reduced to approximately 25%. This is
362                     // still saving ~32k allocations for the "en_GB" example, and it still saved
363                     // about 35% in terms of measured time to process the paths.
364                     path = p;
365                     continue;
366                 }
367             }
368             // This path has diverged from the previous path, so we must start making new elements.
369             path = new CldrPath(path, elementName, keyValuePairs, dtdType, draftStatus, sortIndex);
370             diverged = true;
371         }
372         return path;
373     }
374 
375     // Returns the element's sort index (or -1 if not present).
processPathAttributes( String elementName, Map<String, String> attributeMap, CldrDataType dtdType, Consumer<Entry<String, String>> collectElementAttribute, BiConsumer<AttributeKey, String> collectValueAttribute)376     static void processPathAttributes(
377         String elementName,
378         /* @Nullable */ Map<String, String> attributeMap,
379         CldrDataType dtdType,
380         Consumer<Entry<String, String>> collectElementAttribute,
381         BiConsumer<AttributeKey, String> collectValueAttribute) {
382 
383         // Why not just a Map? Maps don't efficiently handle "get the Nth element" which is
384         // something that's used a lot in the ICU conversion code. Distinguishing attributes in
385         // paths are used more as a "list of pairs" than a map with key/value lookup. Even
386         // extensions like "NavigableMap" don't give you what you want.
387         //
388         // This does mean that lookup-by-name is a linear search (unless you want to pay the cost
389         // of also having a map here) but the average number of attributes is very small (<3).
390         if (attributeMap != null && !attributeMap.isEmpty()) {
391             processAttributes(
392                 attributeMap.entrySet().stream(), elementName, collectValueAttribute, dtdType)
393                 .forEach(collectElementAttribute);
394         }
395     }
396 
processAttributes( Stream<Entry<String, String>> in, String elementName, BiConsumer<AttributeKey, String> collectValueAttribute, CldrDataType dtdType)397     static Stream<Entry<String, String>> processAttributes(
398         Stream<Entry<String, String>> in,
399         String elementName,
400         BiConsumer<AttributeKey, String> collectValueAttribute,
401         CldrDataType dtdType) {
402         Consumer<Entry<String, String>> collectValueAttributes = e -> {
403             if (dtdType.isValueAttribute(elementName, e.getKey())) {
404                 collectValueAttribute.accept(AttributeKey.keyOf(elementName, e.getKey()), e.getValue());
405             }
406         };
407         return in
408             // Special case of a synthetic distinguishing attribute that's _not_ in the DTD, and
409             // should be ignored.
410             .filter(e -> !e.getKey().equals(HIDDEN_SORT_INDEX_ATTRIBUTE))
411             .peek(collectValueAttributes)
412             .filter(e -> dtdType.isDistinguishingAttribute(elementName, e.getKey()));
413     }
414 
CldrPaths()415     private CldrPaths() {}
416 }
417