1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2014, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.text; 10 11 import java.text.CharacterIterator; 12 13 import com.ibm.icu.impl.CharacterIteration; 14 15 abstract class DictionaryBreakEngine implements LanguageBreakEngine { 16 17 /* Helper class for improving readability of the Thai/Lao/Khmer word break 18 * algorithm. 19 */ 20 static class PossibleWord { 21 // List size, limited by the maximum number of words in the dictionary 22 // that form a nested sequence. 23 private final static int POSSIBLE_WORD_LIST_MAX = 20; 24 //list of word candidate lengths, in increasing length order 25 private int lengths[]; 26 private int count[]; // Count of candidates 27 private int prefix; // The longest match with a dictionary word 28 private int offset; // Offset in the text of these candidates 29 private int mark; // The preferred candidate's offset 30 private int current; // The candidate we're currently looking at 31 32 // Default constructor PossibleWord()33 public PossibleWord() { 34 lengths = new int[POSSIBLE_WORD_LIST_MAX]; 35 count = new int[1]; // count needs to be an array of 1 so that it can be pass as reference 36 offset = -1; 37 } 38 39 // Fill the list of candidates if needed, select the longest, and return the number found candidates(CharacterIterator fIter, DictionaryMatcher dict, int rangeEnd)40 public int candidates(CharacterIterator fIter, DictionaryMatcher dict, int rangeEnd) { 41 int start = fIter.getIndex(); 42 if (start != offset) { 43 offset = start; 44 prefix = dict.matches(fIter, rangeEnd - start, lengths, count, lengths.length); 45 // Dictionary leaves text after longest prefix, not longest word. Back up. 46 if (count[0] <= 0) { 47 fIter.setIndex(start); 48 } 49 } 50 if (count[0] > 0) { 51 fIter.setIndex(start + lengths[count[0]-1]); 52 } 53 current = count[0] - 1; 54 mark = current; 55 return count[0]; 56 } 57 58 // Select the currently marked candidate, point after it in the text, and invalidate self acceptMarked(CharacterIterator fIter)59 public int acceptMarked(CharacterIterator fIter) { 60 fIter.setIndex(offset + lengths[mark]); 61 return lengths[mark]; 62 } 63 64 // Backup from the current candidate to the next shorter one; return true if that exists 65 // and point the text after it backUp(CharacterIterator fIter)66 public boolean backUp(CharacterIterator fIter) { 67 if (current > 0) { 68 fIter.setIndex(offset + lengths[--current]); 69 return true; 70 } 71 return false; 72 } 73 74 // Return the longest prefix this candidate location shares with a dictionary word longestPrefix()75 public int longestPrefix() { 76 return prefix; 77 } 78 79 // Mark the current candidate as the one we like markCurrent()80 public void markCurrent() { 81 mark = current; 82 } 83 } 84 85 /** 86 * A deque-like structure holding raw ints. 87 * Partial, limited implementation, only what is needed by the dictionary implementation. 88 * For internal use only. 89 * @internal 90 */ 91 static class DequeI implements Cloneable { 92 private int[] data = new int[50]; 93 private int lastIdx = 4; // or base of stack. Index of element. 94 private int firstIdx = 4; // or Top of Stack. Index of element + 1. 95 96 @Override clone()97 public Object clone() throws CloneNotSupportedException { 98 DequeI result = (DequeI)super.clone(); 99 result.data = data.clone(); 100 return result; 101 } 102 size()103 int size() { 104 return firstIdx - lastIdx; 105 } 106 isEmpty()107 boolean isEmpty() { 108 return size() == 0; 109 } 110 grow()111 private void grow() { 112 int[] newData = new int[data.length * 2]; 113 System.arraycopy(data, 0, newData, 0, data.length); 114 data = newData; 115 } 116 offer(int v)117 void offer(int v) { 118 // Note that the actual use cases of offer() add at most one element. 119 // We make no attempt to handle more than a few. 120 assert lastIdx > 0; 121 data[--lastIdx] = v; 122 } 123 push(int v)124 void push(int v) { 125 if (firstIdx >= data.length) { 126 grow(); 127 } 128 data[firstIdx++] = v; 129 } 130 pop()131 int pop() { 132 assert size() > 0; 133 return data[--firstIdx]; 134 } 135 peek()136 int peek() { 137 assert size() > 0; 138 return data[firstIdx - 1]; 139 } 140 peekLast()141 int peekLast() { 142 assert size() > 0; 143 return data[lastIdx]; 144 } 145 pollLast()146 int pollLast() { 147 assert size() > 0; 148 return data[lastIdx++]; 149 } 150 contains(int v)151 boolean contains(int v) { 152 for (int i=lastIdx; i< firstIdx; i++) { 153 if (data[i] == v) { 154 return true; 155 } 156 } 157 return false; 158 } 159 elementAt(int i)160 int elementAt(int i) { 161 assert i < size(); 162 return data[lastIdx + i]; 163 } 164 165 void removeAllElements() { 166 lastIdx = firstIdx = 4; 167 } 168 } 169 170 UnicodeSet fSet = new UnicodeSet(); 171 172 /** 173 * Constructor 174 */ 175 public DictionaryBreakEngine() { 176 } 177 178 @Override 179 public boolean handles(int c) { 180 return fSet.contains(c); // we recognize the character 181 } 182 183 @Override 184 public int findBreaks(CharacterIterator text, int startPos, int endPos, 185 DequeI foundBreaks) { 186 int result = 0; 187 188 // Find the span of characters included in the set. 189 // The span to break begins at the current position int the text, and 190 // extends towards the start or end of the text, depending on 'reverse'. 191 192 int start = text.getIndex(); 193 int current; 194 int rangeStart; 195 int rangeEnd; 196 int c = CharacterIteration.current32(text); 197 while ((current = text.getIndex()) < endPos && fSet.contains(c)) { 198 CharacterIteration.next32(text); 199 c = CharacterIteration.current32(text); 200 } 201 rangeStart = start; 202 rangeEnd = current; 203 204 result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks); 205 text.setIndex(current); 206 207 return result; 208 } 209 210 void setCharacters(UnicodeSet set) { 211 fSet = new UnicodeSet(set); 212 fSet.compact(); 213 } 214 215 /** 216 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 217 * 218 * @param text A UText representing the text 219 * @param rangeStart The start of the range of dictionary characters 220 * @param rangeEnd The end of the range of dictionary characters 221 * @param foundBreaks Output of break positions. Positions are pushed. 222 * Pre-existing contents of the output stack are unaltered. 223 * @return The number of breaks found 224 */ 225 abstract int divideUpDictionaryRange(CharacterIterator text, 226 int rangeStart, 227 int rangeEnd, 228 DequeI foundBreaks ); 229 } 230