1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  * ********************************************************************************
5  * Copyright (C) 2007-2011, International Business Machines Corporation and others.
6  * All Rights Reserved.
7  * ********************************************************************************
8  */
9 package com.ibm.icu.impl;
10 
11 import java.util.Iterator;
12 import java.util.LinkedList;
13 import java.util.List;
14 import java.util.ListIterator;
15 
16 import com.ibm.icu.lang.UCharacter;
17 import com.ibm.icu.text.UnicodeSet;
18 
19 /**
20  * TextTrieMap is a trie implementation for supporting
21  * fast prefix match for the key.
22  */
23 public class TextTrieMap<V> {
24 
25     private Node _root = new Node();
26     boolean _ignoreCase;
27 
28     public static class Output {
29         public int matchLength;
30         public boolean partialMatch;
31     }
32 
33     /**
34      * Constructs a TextTrieMap object.
35      *
36      * @param ignoreCase true to use simple case insensitive match
37      */
TextTrieMap(boolean ignoreCase)38     public TextTrieMap(boolean ignoreCase) {
39         _ignoreCase = ignoreCase;
40     }
41 
42     /**
43      * Adds the text key and its associated object in this object.
44      *
45      * @param text The text.
46      * @param val The value object associated with the text.
47      */
put(CharSequence text, V val)48     public TextTrieMap<V> put(CharSequence text, V val) {
49         CharIterator chitr = new CharIterator(text, 0, _ignoreCase);
50         _root.add(chitr, val);
51         return this;
52     }
53 
54     /**
55      * Gets an iterator of the objects associated with the
56      * longest prefix matching string key.
57      *
58      * @param text The text to be matched with prefixes.
59      * @return An iterator of the objects associated with
60      * the longest prefix matching matching key, or null
61      * if no matching entry is found.
62      */
get(String text)63     public Iterator<V> get(String text) {
64         return get(text, 0);
65     }
66 
67     /**
68      * Gets an iterator of the objects associated with the
69      * longest prefix matching string key starting at the
70      * specified position.
71      *
72      * @param text The text to be matched with prefixes.
73      * @param start The start index of of the text
74      * @return An iterator of the objects associated with the
75      * longest prefix matching matching key, or null if no
76      * matching entry is found.
77      */
get(CharSequence text, int start)78     public Iterator<V> get(CharSequence text, int start) {
79         return get(text, start, null);
80     }
81 
get(CharSequence text, int start, Output output)82     public Iterator<V> get(CharSequence text, int start, Output output) {
83         LongestMatchHandler<V> handler = new LongestMatchHandler<V>();
84         find(text, start, handler, output);
85         if (output != null) {
86             output.matchLength = handler.getMatchLength();
87         }
88         return handler.getMatches();
89     }
90 
find(CharSequence text, ResultHandler<V> handler)91     public void find(CharSequence text, ResultHandler<V> handler) {
92         find(text, 0, handler, null);
93     }
94 
find(CharSequence text, int offset, ResultHandler<V> handler)95     public void find(CharSequence text, int offset, ResultHandler<V> handler) {
96         find(text, offset, handler, null);
97     }
98 
find(CharSequence text, int offset, ResultHandler<V> handler, Output output)99     private void find(CharSequence text, int offset, ResultHandler<V> handler, Output output) {
100         CharIterator chitr = new CharIterator(text, offset, _ignoreCase);
101         find(_root, chitr, handler, output);
102     }
103 
find(Node node, CharIterator chitr, ResultHandler<V> handler, Output output)104     private synchronized void find(Node node, CharIterator chitr, ResultHandler<V> handler, Output output) {
105         Iterator<V> values = node.values();
106         if (values != null) {
107             if (!handler.handlePrefixMatch(chitr.processedLength(), values)) {
108                 return;
109             }
110         }
111 
112         Node nextMatch = node.findMatch(chitr, output);
113         if (nextMatch != null) {
114             find(nextMatch, chitr, handler, output);
115         }
116     }
117 
putLeadCodePoints(UnicodeSet output)118     public void putLeadCodePoints(UnicodeSet output) {
119         _root.putLeadCodePoints(output);
120     }
121 
122     public static class CharIterator implements Iterator<Character> {
123         private boolean _ignoreCase;
124         private CharSequence _text;
125         private int _nextIdx;
126         private int _startIdx;
127 
128         private Character _remainingChar;
129 
CharIterator(CharSequence text, int offset, boolean ignoreCase)130         CharIterator(CharSequence text, int offset, boolean ignoreCase) {
131             _text = text;
132             _nextIdx = _startIdx = offset;
133             _ignoreCase = ignoreCase;
134         }
135 
136         /* (non-Javadoc)
137          * @see java.util.Iterator#hasNext()
138          */
139         @Override
hasNext()140         public boolean hasNext() {
141             if (_nextIdx == _text.length() && _remainingChar == null) {
142                 return false;
143             }
144             return true;
145         }
146 
147         /* (non-Javadoc)
148          * @see java.util.Iterator#next()
149          */
150         @Override
next()151         public Character next() {
152             if (_nextIdx == _text.length() && _remainingChar == null) {
153                 return null;
154             }
155             Character next;
156             if (_remainingChar != null) {
157                 next = _remainingChar;
158                 _remainingChar = null;
159             } else {
160                 if (_ignoreCase) {
161                     int cp = UCharacter.foldCase(Character.codePointAt(_text, _nextIdx), true);
162                     _nextIdx = _nextIdx + Character.charCount(cp);
163 
164                     char[] chars = Character.toChars(cp);
165                     next = chars[0];
166                     if (chars.length == 2) {
167                         _remainingChar = chars[1];
168                     }
169                 } else {
170                     next = _text.charAt(_nextIdx);
171                     _nextIdx++;
172                 }
173             }
174             return next;
175         }
176 
177         /* (non-Javadoc)
178          * @see java.util.Iterator#remove()
179          */
180         @Override
remove()181         public void remove() {
182             throw new UnsupportedOperationException("remove() not supproted");
183         }
184 
nextIndex()185         public int nextIndex() {
186             return _nextIdx;
187         }
188 
processedLength()189         public int processedLength() {
190             if (_remainingChar != null) {
191                 throw new IllegalStateException("In the middle of surrogate pair");
192             }
193             return _nextIdx - _startIdx;
194         }
195     }
196 
197     /**
198      * Callback handler for processing prefix matches used by
199      * find method.
200      */
201     public interface ResultHandler<V> {
202         /**
203          * Handles a prefix key match
204          *
205          * @param matchLength Matched key's length
206          * @param values An iterator of the objects associated with the matched key
207          * @return Return true to continue the search in the trie, false to quit.
208          */
handlePrefixMatch(int matchLength, Iterator<V> values)209         public boolean handlePrefixMatch(int matchLength, Iterator<V> values);
210     }
211 
212     private static class LongestMatchHandler<V> implements ResultHandler<V> {
213         private Iterator<V> matches = null;
214         private int length = 0;
215 
216         @Override
handlePrefixMatch(int matchLength, Iterator<V> values)217         public boolean handlePrefixMatch(int matchLength, Iterator<V> values) {
218             if (matchLength > length) {
219                 length = matchLength;
220                 matches = values;
221             }
222             return true;
223         }
224 
getMatches()225         public Iterator<V> getMatches() {
226             return matches;
227         }
228 
getMatchLength()229         public int getMatchLength() {
230             return length;
231         }
232     }
233 
234     /**
235      * Inner class representing a text node in the trie.
236      */
237     private class Node {
238         private char[] _text;
239         private List<V> _values;
240         private List<Node> _children;
241 
Node()242         private Node() {
243         }
244 
Node(char[] text, List<V> values, List<Node> children)245         private Node(char[] text, List<V> values, List<Node> children) {
246             _text = text;
247             _values = values;
248             _children = children;
249         }
250 
charCount()251         public int charCount() {
252           return _text == null ? 0 : _text.length;
253         }
254 
values()255         public Iterator<V> values() {
256             if (_values == null) {
257                 return null;
258             }
259             return _values.iterator();
260         }
261 
add(CharIterator chitr, V value)262         public void add(CharIterator chitr, V value) {
263             StringBuilder buf = new StringBuilder();
264             while (chitr.hasNext()) {
265                 buf.append(chitr.next());
266             }
267             add(toCharArray(buf), 0, value);
268         }
269 
findMatch(CharIterator chitr, Output output)270         public Node findMatch(CharIterator chitr, Output output) {
271             if (_children == null) {
272                 return null;
273             }
274             if (!chitr.hasNext()) {
275                 if (output != null) {
276                     output.partialMatch = true;
277                 }
278                 return null;
279             }
280             Node match = null;
281             Character ch = chitr.next();
282             for (Node child : _children) {
283                 if (ch < child._text[0]) {
284                     break;
285                 }
286                 if (ch == child._text[0]) {
287                     if (child.matchFollowing(chitr, output)) {
288                         match = child;
289                     }
290                     break;
291                 }
292             }
293             return match;
294         }
295 
putLeadCodePoints(UnicodeSet output)296         public void putLeadCodePoints(UnicodeSet output) {
297             if (_children == null) {
298                 return;
299             }
300             for (Node child : _children) {
301                 char c0 = child._text[0];
302                 if (!UCharacter.isHighSurrogate(c0)) {
303                     output.add(c0);
304                 } else if (child.charCount() >= 2) {
305                     output.add(Character.codePointAt(child._text, 0));
306                 } else if (child._children != null) {
307                     // Construct all possible code points from grandchildren.
308                     for (Node grandchild : child._children) {
309                         char c1 = grandchild._text[0];
310                         int cp = Character.toCodePoint(c0, c1);
311                         output.add(cp);
312                     }
313                 }
314             }
315         }
316 
add(char[] text, int offset, V value)317         private void add(char[] text, int offset, V value) {
318             if (text.length == offset) {
319                 _values = addValue(_values, value);
320                 return;
321             }
322 
323             if (_children == null) {
324                 _children = new LinkedList<Node>();
325                 Node child = new Node(subArray(text, offset), addValue(null, value), null);
326                 _children.add(child);
327                 return;
328             }
329 
330             // walk through children
331             ListIterator<Node> litr = _children.listIterator();
332             while (litr.hasNext()) {
333                 Node next = litr.next();
334                 if (text[offset] < next._text[0]) {
335                     litr.previous();
336                     break;
337                 }
338                 if (text[offset] == next._text[0]) {
339                     int matchLen = next.lenMatches(text, offset);
340                     if (matchLen == next._text.length) {
341                         // full match
342                         next.add(text, offset + matchLen, value);
343                     } else {
344                         // partial match, create a branch
345                         next.split(matchLen);
346                         next.add(text, offset + matchLen, value);
347                     }
348                     return;
349                 }
350             }
351             // add a new child to this node
352             litr.add(new Node(subArray(text, offset), addValue(null, value), null));
353         }
354 
matchFollowing(CharIterator chitr, Output output)355         private boolean matchFollowing(CharIterator chitr, Output output) {
356             boolean matched = true;
357             int idx = 1;
358             while (idx < _text.length) {
359                 if(!chitr.hasNext()) {
360                     if (output != null) {
361                         output.partialMatch = true;
362                     }
363                     matched = false;
364                     break;
365                 }
366                 Character ch = chitr.next();
367                 if (ch != _text[idx]) {
368                     matched = false;
369                     break;
370                 }
371                 idx++;
372             }
373             return matched;
374         }
375 
lenMatches(char[] text, int offset)376         private int lenMatches(char[] text, int offset) {
377             int textLen = text.length - offset;
378             int limit = _text.length < textLen ? _text.length : textLen;
379             int len = 0;
380             while (len < limit) {
381                 if (_text[len] != text[offset + len]) {
382                     break;
383                 }
384                 len++;
385             }
386             return len;
387         }
388 
389         private void split(int offset) {
390             // split the current node at the offset
391             char[] childText = subArray(_text, offset);
392             _text = subArray(_text, 0, offset);
393 
394             // add the Node representing after the offset as a child
395             Node child = new Node(childText, _values, _children);
396             _values = null;
397 
398             _children = new LinkedList<Node>();
399             _children.add(child);
400         }
401 
402         private List<V> addValue(List<V> list, V value) {
403             if (list == null) {
404                 list = new LinkedList<V>();
405             }
406             list.add(value);
407             return list;
408         }
409     }
410 
411     private static char[] toCharArray(CharSequence text) {
412         char[] array = new char[text.length()];
413         for (int i = 0; i < array.length; i++) {
414             array[i] = text.charAt(i);
415         }
416         return array;
417     }
418 
419     private static char[] subArray(char[] array, int start) {
420         if (start == 0) {
421             return array;
422         }
423         char[] sub = new char[array.length - start];
424         System.arraycopy(array, start, sub, 0, sub.length);
425         return sub;
426     }
427 
428     private static char[] subArray(char[] array, int start, int limit) {
429         if (start == 0 && limit == array.length) {
430             return array;
431         }
432         char[] sub = new char[limit - start];
433         System.arraycopy(array, start, sub, 0, limit - start);
434         return sub;
435     }
436 }
437