1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  * Copyright (C) 1996-2014, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 package com.ibm.icu.text;
10 
11 import java.text.CharacterIterator;
12 
13 import com.ibm.icu.lang.UCharacter;
14 import com.ibm.icu.util.ICUCloneNotSupportedException;
15 import com.ibm.icu.util.ULocale;
16 
17 
18 /**
19  * Inserts the specified characters at word breaks. To restrict it to particular characters, use a filter.
20  * TODO: this is an internal class, and only temporary. Remove it once we have \b notation in Transliterator.
21  */
22 final class BreakTransliterator extends Transliterator {
23     private BreakIterator bi;
24     private String insertion;
25     private int[] boundaries = new int[50];
26     private int boundaryCount = 0;
27 
BreakTransliterator(String ID, UnicodeFilter filter, BreakIterator bi, String insertion)28     public BreakTransliterator(String ID, UnicodeFilter filter, BreakIterator bi, String insertion) {
29         super(ID, filter);
30         this.bi = bi;
31         this.insertion = insertion;
32     }
33 
BreakTransliterator(String ID, UnicodeFilter filter)34     public BreakTransliterator(String ID, UnicodeFilter filter) {
35         this(ID, filter, null, " ");
36     }
37 
38     ///CLOVER:OFF
39     // The following method is not called by anything and can't be reached
getInsertion()40     public String getInsertion() {
41         return insertion;
42     }
43     ///CLOVER:ON
44 
45     ///CLOVER:OFF
46     // The following method is not called by anything and can't be reached
setInsertion(String insertion)47     public void setInsertion(String insertion) {
48         this.insertion = insertion;
49     }
50     ///CLOVER:ON
51 
getBreakIterator()52     public BreakIterator getBreakIterator() {
53         // Defer initialization of BreakIterator because it is slow,
54         // typically over 2000 ms.
55         if (bi == null) bi = BreakIterator.getWordInstance(new ULocale("th_TH"));
56         return bi;
57     }
58 
59     ///CLOVER:OFF
60     // The following method is not called by anything and can't be reached
setBreakIterator(BreakIterator bi)61     public void setBreakIterator(BreakIterator bi) {
62         this.bi = bi;
63     }
64     ///CLOVER:ON
65 
66     static final int LETTER_OR_MARK_MASK =
67           (1<<Character.UPPERCASE_LETTER)
68         | (1<<Character.LOWERCASE_LETTER)
69         | (1<<Character.TITLECASE_LETTER)
70         | (1<<Character.MODIFIER_LETTER)
71         | (1<<Character.OTHER_LETTER)
72         | (1<<Character.COMBINING_SPACING_MARK)
73         | (1<<Character.NON_SPACING_MARK)
74         | (1<<Character.ENCLOSING_MARK)
75         ;
76     @Override
handleTransliterate(Replaceable text, Position pos, boolean incremental)77     protected synchronized void handleTransliterate(Replaceable text, Position pos, boolean incremental) {
78         boundaryCount = 0;
79         int boundary = 0;
80         getBreakIterator(); // Lazy-create it if necessary
81         bi.setText(new ReplaceableCharacterIterator(text, pos.start, pos.limit, pos.start));
82         // TODO: fix clumsy workaround used below.
83         /*
84         char[] tempBuffer = new char[text.length()];
85         text.getChars(0, text.length(), tempBuffer, 0);
86         bi.setText(new StringCharacterIterator(new String(tempBuffer), pos.start, pos.limit, pos.start));
87         */
88         // end debugging
89 
90         // To make things much easier, we will stack the boundaries, and then insert at the end.
91         // generally, we won't need too many, since we will be filtered.
92 
93         for(boundary = bi.first(); boundary != BreakIterator.DONE && boundary < pos.limit; boundary = bi.next()) {
94             if (boundary == 0) continue;
95             // HACK: Check to see that preceeding item was a letter
96 
97             int cp = UTF16.charAt(text, boundary-1);
98             int type = UCharacter.getType(cp);
99             //System.out.println(Integer.toString(cp,16) + " (before): " + type);
100             if (((1<<type) & LETTER_OR_MARK_MASK) == 0) continue;
101 
102             cp = UTF16.charAt(text, boundary);
103             type = UCharacter.getType(cp);
104             //System.out.println(Integer.toString(cp,16) + " (after): " + type);
105             if (((1<<type) & LETTER_OR_MARK_MASK) == 0) continue;
106 
107             if (boundaryCount >= boundaries.length) {       // realloc if necessary
108                 int[] temp = new int[boundaries.length * 2];
109                 System.arraycopy(boundaries, 0, temp, 0, boundaries.length);
110                 boundaries = temp;
111             }
112 
113             boundaries[boundaryCount++] = boundary;
114             //System.out.println(boundary);
115         }
116 
117         int delta = 0;
118         int lastBoundary = 0;
119 
120         if (boundaryCount != 0) { // if we found something, adjust
121             delta = boundaryCount * insertion.length();
122             lastBoundary = boundaries[boundaryCount-1];
123 
124             // we do this from the end backwards, so that we don't have to keep updating.
125 
126             while (boundaryCount > 0) {
127                 boundary = boundaries[--boundaryCount];
128                 text.replace(boundary, boundary, insertion);
129             }
130         }
131 
132         // Now fix up the return values
133         pos.contextLimit += delta;
134         pos.limit += delta;
135         pos.start = incremental ? lastBoundary + delta : pos.limit;
136     }
137 
138 
139     /**
140      * Registers standard variants with the system.  Called by
141      * Transliterator during initialization.
142      */
register()143     static void register() {
144         // false means that it is invisible
145         Transliterator trans = new BreakTransliterator("Any-BreakInternal", null);
146         Transliterator.registerInstance(trans, false);
147         /*
148         Transliterator.registerFactory("Any-Break", new Transliterator.Factory() {
149             public Transliterator getInstance(String ID) {
150                 return new BreakTransliterator("Any-Break", null);
151             }
152         });
153         */
154     }
155 
156     // Hack, just to get a real character iterator.
157     static final class ReplaceableCharacterIterator implements CharacterIterator
158     {
159         private Replaceable text;
160         private int begin;
161         private int end;
162         // invariant: begin <= pos <= end
163         private int pos;
164 
165         /**
166         * Constructs an iterator with an initial index of 0.
167         */
168         /*public ReplaceableCharacterIterator(Replaceable text)
169         {
170             this(text, 0);
171         }*/
172 
173         /**
174         * Constructs an iterator with the specified initial index.
175         *
176         * @param  text   The String to be iterated over
177         * @param  pos    Initial iterator position
178         */
179         /*public ReplaceableCharacterIterator(Replaceable text, int pos)
180         {
181             this(text, 0, text.length(), pos);
182         }*/
183 
184         /**
185         * Constructs an iterator over the given range of the given string, with the
186         * index set at the specified position.
187         *
188         * @param  text   The String to be iterated over
189         * @param  begin  Index of the first character
190         * @param  end    Index of the character following the last character
191         * @param  pos    Initial iterator position
192         */
ReplaceableCharacterIterator(Replaceable text, int begin, int end, int pos)193         public ReplaceableCharacterIterator(Replaceable text, int begin, int end, int pos) {
194             if (text == null) {
195                 throw new NullPointerException();
196             }
197             this.text = text;
198 
199             if (begin < 0 || begin > end || end > text.length()) {
200                 throw new IllegalArgumentException("Invalid substring range");
201             }
202 
203             if (pos < begin || pos > end) {
204                 throw new IllegalArgumentException("Invalid position");
205             }
206 
207             this.begin = begin;
208             this.end = end;
209             this.pos = pos;
210         }
211 
212         /**
213         * Reset this iterator to point to a new string.  This package-visible
214         * method is used by other java.text classes that want to avoid allocating
215         * new ReplaceableCharacterIterator objects every time their setText method
216         * is called.
217         *
218         * @param  text   The String to be iterated over
219         */
setText(Replaceable text)220         public void setText(Replaceable text) {
221             if (text == null) {
222                 throw new NullPointerException();
223             }
224             this.text = text;
225             this.begin = 0;
226             this.end = text.length();
227             this.pos = 0;
228         }
229 
230         /**
231         * Implements CharacterIterator.first() for String.
232         * @see CharacterIterator#first
233         */
234         @Override
first()235         public char first()
236         {
237             pos = begin;
238             return current();
239         }
240 
241         /**
242         * Implements CharacterIterator.last() for String.
243         * @see CharacterIterator#last
244         */
245         @Override
last()246         public char last()
247         {
248             if (end != begin) {
249                 pos = end - 1;
250             } else {
251                 pos = end;
252             }
253             return current();
254         }
255 
256         /**
257         * Implements CharacterIterator.setIndex() for String.
258         * @see CharacterIterator#setIndex
259         */
260         @Override
setIndex(int p)261         public char setIndex(int p)
262         {
263         if (p < begin || p > end) {
264                 throw new IllegalArgumentException("Invalid index");
265         }
266             pos = p;
267             return current();
268         }
269 
270         /**
271         * Implements CharacterIterator.current() for String.
272         * @see CharacterIterator#current
273         */
274         @Override
current()275         public char current()
276         {
277             if (pos >= begin && pos < end) {
278                 return text.charAt(pos);
279             }
280             else {
281                 return DONE;
282             }
283         }
284 
285         /**
286         * Implements CharacterIterator.next() for String.
287         * @see CharacterIterator#next
288         */
289         @Override
next()290         public char next()
291         {
292             if (pos < end - 1) {
293                 pos++;
294                 return text.charAt(pos);
295             }
296             else {
297                 pos = end;
298                 return DONE;
299             }
300         }
301 
302         /**
303         * Implements CharacterIterator.previous() for String.
304         * @see CharacterIterator#previous
305         */
306         @Override
previous()307         public char previous()
308         {
309             if (pos > begin) {
310                 pos--;
311                 return text.charAt(pos);
312             }
313             else {
314                 return DONE;
315             }
316         }
317 
318         /**
319         * Implements CharacterIterator.getBeginIndex() for String.
320         * @see CharacterIterator#getBeginIndex
321         */
322         @Override
getBeginIndex()323         public int getBeginIndex()
324         {
325             return begin;
326         }
327 
328         /**
329         * Implements CharacterIterator.getEndIndex() for String.
330         * @see CharacterIterator#getEndIndex
331         */
332         @Override
getEndIndex()333         public int getEndIndex()
334         {
335             return end;
336         }
337 
338         /**
339         * Implements CharacterIterator.getIndex() for String.
340         * @see CharacterIterator#getIndex
341         */
342         @Override
getIndex()343         public int getIndex()
344         {
345             return pos;
346         }
347 
348         /**
349         * Compares the equality of two ReplaceableCharacterIterator objects.
350         * @param obj the ReplaceableCharacterIterator object to be compared with.
351         * @return true if the given obj is the same as this
352         * ReplaceableCharacterIterator object; false otherwise.
353         */
354         @Override
equals(Object obj)355         public boolean equals(Object obj)
356         {
357             if (this == obj) {
358                 return true;
359             }
360             if (!(obj instanceof ReplaceableCharacterIterator)) {
361                 return false;
362             }
363 
364             ReplaceableCharacterIterator that = (ReplaceableCharacterIterator) obj;
365 
366             if (hashCode() != that.hashCode()) {
367                 return false;
368             }
369             if (!text.equals(that.text)) {
370                 return false;
371             }
372             if (pos != that.pos || begin != that.begin || end != that.end) {
373                 return false;
374             }
375             return true;
376         }
377 
378         /**
379         * Computes a hashcode for this iterator.
380         * @return A hash code
381         */
382         @Override
hashCode()383         public int hashCode()
384         {
385             return text.hashCode() ^ pos ^ begin ^ end;
386         }
387 
388         /**
389         * Creates a copy of this iterator.
390         * @return A copy of this
391         */
392         @Override
clone()393         public Object clone()
394         {
395             try {
396                 ReplaceableCharacterIterator other
397                 = (ReplaceableCharacterIterator) super.clone();
398                 return other;
399             }
400             catch (CloneNotSupportedException e) {
401                 throw new ICUCloneNotSupportedException();
402             }
403         }
404 
405     }
406     /* (non-Javadoc)
407      * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
408      */
409     @Override
addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet)410     public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
411         UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
412         // Doesn't actually modify the source characters, so leave them alone.
413         // add the characters inserted
414         if (myFilter.size() != 0) {
415             targetSet.addAll(insertion);
416         }
417     }
418 
419 }
420