1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  * Copyright (C) 2014, International Business Machines Corporation and         *
6  * others. All Rights Reserved.                                                *
7  *******************************************************************************
8  */
9 package com.ibm.icu.text;
10 
11 import java.text.CharacterIterator;
12 
13 import com.ibm.icu.impl.CharacterIteration;
14 
15 abstract class DictionaryBreakEngine implements LanguageBreakEngine {
16 
17     /* Helper class for improving readability of the Thai/Lao/Khmer word break
18      * algorithm.
19      */
20     static class PossibleWord {
21         // List size, limited by the maximum number of words in the dictionary
22         // that form a nested sequence.
23         private final static int POSSIBLE_WORD_LIST_MAX = 20;
24         //list of word candidate lengths, in increasing length order
25         private int lengths[];
26         private int count[];    // Count of candidates
27         private int prefix;     // The longest match with a dictionary word
28         private int offset;     // Offset in the text of these candidates
29         private int mark;       // The preferred candidate's offset
30         private int current;    // The candidate we're currently looking at
31 
32         // Default constructor
PossibleWord()33         public PossibleWord() {
34             lengths = new int[POSSIBLE_WORD_LIST_MAX];
35             count = new int[1]; // count needs to be an array of 1 so that it can be pass as reference
36             offset = -1;
37         }
38 
39         // Fill the list of candidates if needed, select the longest, and return the number found
candidates(CharacterIterator fIter, DictionaryMatcher dict, int rangeEnd)40         public int candidates(CharacterIterator fIter, DictionaryMatcher dict, int rangeEnd) {
41             int start = fIter.getIndex();
42             if (start != offset) {
43                 offset = start;
44                 prefix = dict.matches(fIter, rangeEnd - start, lengths, count, lengths.length);
45                 // Dictionary leaves text after longest prefix, not longest word. Back up.
46                 if (count[0] <= 0) {
47                     fIter.setIndex(start);
48                 }
49             }
50             if (count[0] > 0) {
51                 fIter.setIndex(start + lengths[count[0]-1]);
52             }
53             current = count[0] - 1;
54             mark = current;
55             return count[0];
56         }
57 
58         // Select the currently marked candidate, point after it in the text, and invalidate self
acceptMarked(CharacterIterator fIter)59         public int acceptMarked(CharacterIterator fIter) {
60             fIter.setIndex(offset + lengths[mark]);
61             return lengths[mark];
62         }
63 
64         // Backup from the current candidate to the next shorter one; return true if that exists
65         // and point the text after it
backUp(CharacterIterator fIter)66         public boolean backUp(CharacterIterator fIter) {
67             if (current > 0) {
68                 fIter.setIndex(offset + lengths[--current]);
69                 return true;
70             }
71             return false;
72         }
73 
74         // Return the longest prefix this candidate location shares with a dictionary word
longestPrefix()75         public int longestPrefix() {
76             return prefix;
77         }
78 
79         // Mark the current candidate as the one we like
markCurrent()80         public void markCurrent() {
81             mark = current;
82         }
83     }
84 
85     /**
86      *  A deque-like structure holding raw ints.
87      *  Partial, limited implementation, only what is needed by the dictionary implementation.
88      *  For internal use only.
89      * @internal
90      */
91     static class DequeI implements Cloneable {
92         private int[] data = new int[50];
93         private int lastIdx = 4;   // or base of stack. Index of element.
94         private int firstIdx = 4;  // or Top of Stack. Index of element + 1.
95 
96         @Override
clone()97         public Object clone() throws CloneNotSupportedException {
98             DequeI result = (DequeI)super.clone();
99             result.data = data.clone();
100             return result;
101         }
102 
size()103         int size() {
104             return firstIdx - lastIdx;
105         }
106 
isEmpty()107         boolean isEmpty() {
108             return size() == 0;
109         }
110 
grow()111         private void grow() {
112             int[] newData = new int[data.length * 2];
113             System.arraycopy(data,  0,  newData,  0, data.length);
114             data = newData;
115         }
116 
offer(int v)117         void offer(int v) {
118             // Note that the actual use cases of offer() add at most one element.
119             //   We make no attempt to handle more than a few.
120             assert lastIdx > 0;
121             data[--lastIdx] = v;
122         }
123 
push(int v)124         void push(int v) {
125             if (firstIdx >= data.length) {
126                 grow();
127             }
128             data[firstIdx++] = v;
129         }
130 
pop()131         int pop() {
132             assert size() > 0;
133             return data[--firstIdx];
134         }
135 
peek()136         int peek() {
137             assert size() > 0;
138             return data[firstIdx - 1];
139         }
140 
peekLast()141         int peekLast() {
142             assert size() > 0;
143             return data[lastIdx];
144         }
145 
pollLast()146         int pollLast() {
147             assert size() > 0;
148             return data[lastIdx++];
149         }
150 
contains(int v)151         boolean contains(int v) {
152             for (int i=lastIdx; i< firstIdx; i++) {
153                 if (data[i] == v) {
154                     return true;
155                 }
156             }
157             return false;
158         }
159 
elementAt(int i)160         int elementAt(int i) {
161             assert i < size();
162             return data[lastIdx + i];
163         }
164 
165         void removeAllElements() {
166             lastIdx = firstIdx = 4;
167         }
168     }
169 
170     UnicodeSet fSet = new UnicodeSet();
171 
172     /**
173      *  Constructor
174      */
175     public DictionaryBreakEngine() {
176     }
177 
178     @Override
179     public boolean handles(int c) {
180         return fSet.contains(c);        // we recognize the character
181     }
182 
183     @Override
184     public int findBreaks(CharacterIterator text, int startPos, int endPos,
185             DequeI foundBreaks) {
186         int result = 0;
187 
188          // Find the span of characters included in the set.
189          //   The span to break begins at the current position int the text, and
190          //   extends towards the start or end of the text, depending on 'reverse'.
191 
192         int start = text.getIndex();
193         int current;
194         int rangeStart;
195         int rangeEnd;
196         int c = CharacterIteration.current32(text);
197         while ((current = text.getIndex()) < endPos && fSet.contains(c)) {
198             CharacterIteration.next32(text);
199             c = CharacterIteration.current32(text);
200         }
201         rangeStart = start;
202         rangeEnd = current;
203 
204         result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
205         text.setIndex(current);
206 
207         return result;
208     }
209 
210     void setCharacters(UnicodeSet set) {
211         fSet = new UnicodeSet(set);
212         fSet.compact();
213     }
214 
215     /**
216      * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
217      *
218      * @param text A UText representing the text
219      * @param rangeStart The start of the range of dictionary characters
220      * @param rangeEnd The end of the range of dictionary characters
221      * @param foundBreaks Output of break positions. Positions are pushed.
222      *                    Pre-existing contents of the output stack are unaltered.
223      * @return The number of breaks found
224      */
225      abstract int divideUpDictionaryRange(CharacterIterator text,
226                                           int               rangeStart,
227                                           int               rangeEnd,
228                                           DequeI            foundBreaks );
229 }
230