1 // © 2019 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 // locdistance.h
5 // created: 2019may08 Markus W. Scherer
6 
7 #ifndef __LOCDISTANCE_H__
8 #define __LOCDISTANCE_H__
9 
10 #include "unicode/utypes.h"
11 #include "unicode/bytestrie.h"
12 #include "unicode/localematcher.h"
13 #include "unicode/locid.h"
14 #include "unicode/uobject.h"
15 #include "lsr.h"
16 
17 U_NAMESPACE_BEGIN
18 
19 struct LocaleDistanceData;
20 
21 /**
22  * Offline-built data for LocaleMatcher.
23  * Mostly but not only the data for mapping locales to their maximized forms.
24  */
25 class LocaleDistance final : public UMemory {
26 public:
27     static const LocaleDistance *getSingleton(UErrorCode &errorCode);
28 
shiftDistance(int32_t distance)29     static int32_t shiftDistance(int32_t distance) {
30         return distance << DISTANCE_SHIFT;
31     }
32 
getShiftedDistance(int32_t indexAndDistance)33     static int32_t getShiftedDistance(int32_t indexAndDistance) {
34         return indexAndDistance & DISTANCE_MASK;
35     }
36 
getDistanceDouble(int32_t indexAndDistance)37     static double getDistanceDouble(int32_t indexAndDistance) {
38         double shiftedDistance = getShiftedDistance(indexAndDistance);
39         return shiftedDistance / (1 << DISTANCE_SHIFT);
40     }
41 
getDistanceFloor(int32_t indexAndDistance)42     static int32_t getDistanceFloor(int32_t indexAndDistance) {
43         return (indexAndDistance & DISTANCE_MASK) >> DISTANCE_SHIFT;
44     }
45 
getIndex(int32_t indexAndDistance)46     static int32_t getIndex(int32_t indexAndDistance) {
47         // assert indexAndDistance >= 0;
48         return indexAndDistance >> INDEX_SHIFT;
49     }
50 
51     /**
52      * Finds the supported LSR with the smallest distance from the desired one.
53      * Equivalent LSR subtags must be normalized into a canonical form.
54      *
55      * <p>Returns the index of the lowest-distance supported LSR in the high bits
56      * (negative if none has a distance below the threshold),
57      * and its distance (0..ABOVE_THRESHOLD) in the low bits.
58      */
59     int32_t getBestIndexAndDistance(const LSR &desired,
60                                     const LSR **supportedLSRs, int32_t supportedLSRsLength,
61                                     int32_t shiftedThreshold,
62                                     ULocMatchFavorSubtag favorSubtag,
63                                     ULocMatchDirection direction) const;
64 
65     UBool isParadigmLSR(const LSR &lsr) const;
66 
getDefaultScriptDistance()67     int32_t getDefaultScriptDistance() const {
68         return defaultScriptDistance;
69     }
70 
getDefaultDemotionPerDesiredLocale()71     int32_t getDefaultDemotionPerDesiredLocale() const {
72         return defaultDemotionPerDesiredLocale;
73     }
74 
75 private:
76     // The distance is shifted left to gain some fraction bits.
77     static constexpr int32_t DISTANCE_SHIFT = 3;
78     static constexpr int32_t DISTANCE_FRACTION_MASK = 7;
79     // 7 bits for 0..100
80     static constexpr int32_t DISTANCE_INT_SHIFT = 7;
81     static constexpr int32_t INDEX_SHIFT = DISTANCE_INT_SHIFT + DISTANCE_SHIFT;
82     static constexpr int32_t DISTANCE_MASK = 0x3ff;
83     // tic constexpr int32_t MAX_INDEX = 0x1fffff;  // avoids sign bit
84     static constexpr int32_t INDEX_NEG_1 = 0xfffffc00;
85 
86     LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely);
87     LocaleDistance(const LocaleDistance &other) = delete;
88     LocaleDistance &operator=(const LocaleDistance &other) = delete;
89 
90     static void initLocaleDistance(UErrorCode &errorCode);
91 
isMatch(const LSR & desired,const LSR & supported,int32_t shiftedThreshold,ULocMatchFavorSubtag favorSubtag)92     UBool isMatch(const LSR &desired, const LSR &supported,
93                   int32_t shiftedThreshold, ULocMatchFavorSubtag favorSubtag) const {
94         const LSR *pSupp = &supported;
95         return getBestIndexAndDistance(
96             desired, &pSupp, 1,
97             shiftedThreshold, favorSubtag, ULOCMATCH_DIRECTION_WITH_ONE_WAY) >= 0;
98     }
99 
100     static int32_t getDesSuppScriptDistance(BytesTrie &iter, uint64_t startState,
101                                             const char *desired, const char *supported);
102 
103     static int32_t getRegionPartitionsDistance(
104         BytesTrie &iter, uint64_t startState,
105         const char *desiredPartitions, const char *supportedPartitions,
106         int32_t threshold);
107 
108     static int32_t getFallbackRegionDistance(BytesTrie &iter, uint64_t startState);
109 
110     static int32_t trieNext(BytesTrie &iter, const char *s, bool wantValue);
111 
partitionsForRegion(const LSR & lsr)112     const char *partitionsForRegion(const LSR &lsr) const {
113         // ill-formed region -> one non-matching string
114         int32_t pIndex = regionToPartitionsIndex[lsr.regionIndex];
115         return partitionArrays[pIndex];
116     }
117 
getDefaultRegionDistance()118     int32_t getDefaultRegionDistance() const {
119         return defaultRegionDistance;
120     }
121 
122     const XLikelySubtags &likelySubtags;
123 
124     // The trie maps each dlang+slang+dscript+sscript+dregion+sregion
125     // (encoded in ASCII with bit 7 set on the last character of each subtag) to a distance.
126     // There is also a trie value for each subsequence of whole subtags.
127     // One '*' is used for a (desired, supported) pair of "und", "Zzzz"/"", or "ZZ"/"".
128     BytesTrie trie;
129 
130     /**
131      * Maps each region to zero or more single-character partitions.
132      */
133     const uint8_t *regionToPartitionsIndex;
134     const char **partitionArrays;
135 
136     /**
137      * Used to get the paradigm region for a cluster, if there is one.
138      */
139     const LSR *paradigmLSRs;
140     int32_t paradigmLSRsLength;
141 
142     int32_t defaultLanguageDistance;
143     int32_t defaultScriptDistance;
144     int32_t defaultRegionDistance;
145     int32_t minRegionDistance;
146     int32_t defaultDemotionPerDesiredLocale;
147 };
148 
149 U_NAMESPACE_END
150 
151 #endif  // __LOCDISTANCE_H__
152