1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2008-2015, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   Date        Name        Description
9 *   05/11/2008  Andy Heninger  Port from Java
10 **********************************************************************
11 */
12 
13 #include "unicode/utypes.h"
14 
15 #if  !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION
16 
17 #include "unicode/brkiter.h"
18 #include "unicode/localpointer.h"
19 #include "unicode/uchar.h"
20 #include "unicode/unifilt.h"
21 #include "unicode/uniset.h"
22 
23 #include "brktrans.h"
24 #include "cmemory.h"
25 #include "mutex.h"
26 #include "uprops.h"
27 #include "uinvchar.h"
28 #include "util.h"
29 #include "uvectr32.h"
30 
31 U_NAMESPACE_BEGIN
32 
33 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
34 
35 static const UChar SPACE       = 32;  // ' '
36 
37 
38 /**
39  * Constructs a transliterator with the default delimiters '{' and
40  * '}'.
41  */
BreakTransliterator(UnicodeFilter * adoptedFilter)42 BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
43         Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
44         cachedBI(NULL), cachedBoundaries(NULL), fInsertion(SPACE) {
45     }
46 
47 
48 /**
49  * Destructor.
50  */
~BreakTransliterator()51 BreakTransliterator::~BreakTransliterator() {
52 }
53 
54 /**
55  * Copy constructor.
56  */
BreakTransliterator(const BreakTransliterator & o)57 BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
58         Transliterator(o), cachedBI(NULL), cachedBoundaries(NULL), fInsertion(o.fInsertion) {
59 }
60 
61 
62 /**
63  * Transliterator API.
64  */
clone(void) const65 Transliterator* BreakTransliterator::clone(void) const {
66     return new BreakTransliterator(*this);
67 }
68 
69 /**
70  * Implements {@link Transliterator#handleTransliterate}.
71  */
handleTransliterate(Replaceable & text,UTransPosition & offsets,UBool isIncremental) const72 void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
73                                                     UBool isIncremental ) const {
74 
75         UErrorCode status = U_ZERO_ERROR;
76         LocalPointer<BreakIterator> bi;
77         LocalPointer<UVector32> boundaries;
78 
79         {
80             Mutex m;
81             BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
82             boundaries.moveFrom(nonConstThis->cachedBoundaries);
83             bi.moveFrom(nonConstThis->cachedBI);
84         }
85         if (bi.isNull()) {
86             bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status));
87         }
88         if (boundaries.isNull()) {
89             boundaries.adoptInstead(new UVector32(status));
90         }
91 
92         if (bi.isNull() || boundaries.isNull() || U_FAILURE(status)) {
93             return;
94         }
95 
96         boundaries->removeAllElements();
97         UnicodeString sText = replaceableAsString(text);
98         bi->setText(sText);
99         bi->preceding(offsets.start);
100 
101         // To make things much easier, we will stack the boundaries, and then insert at the end.
102         // generally, we won't need too many, since we will be filtered.
103 
104         int32_t boundary;
105         for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
106             if (boundary == 0) continue;
107             // HACK: Check to see that preceeding item was a letter
108 
109             UChar32 cp = sText.char32At(boundary-1);
110             int type = u_charType(cp);
111             //System.out.println(Integer.toString(cp,16) + " (before): " + type);
112             if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
113 
114             cp = sText.char32At(boundary);
115             type = u_charType(cp);
116             //System.out.println(Integer.toString(cp,16) + " (after): " + type);
117             if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
118 
119             boundaries->addElement(boundary, status);
120             // printf("Boundary at %d\n", boundary);
121         }
122 
123         int delta = 0;
124         int lastBoundary = 0;
125 
126         if (boundaries->size() != 0) { // if we found something, adjust
127             delta = boundaries->size() * fInsertion.length();
128             lastBoundary = boundaries->lastElementi();
129 
130             // we do this from the end backwards, so that we don't have to keep updating.
131 
132             while (boundaries->size() > 0) {
133                 boundary = boundaries->popi();
134                 text.handleReplaceBetween(boundary, boundary, fInsertion);
135             }
136         }
137 
138         // Now fix up the return values
139         offsets.contextLimit += delta;
140         offsets.limit += delta;
141         offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
142 
143         // Return break iterator & boundaries vector to the cache.
144         {
145             Mutex m;
146             BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
147             if (nonConstThis->cachedBI.isNull()) {
148                 nonConstThis->cachedBI.moveFrom(bi);
149             }
150             if (nonConstThis->cachedBoundaries.isNull()) {
151                 nonConstThis->cachedBoundaries.moveFrom(boundaries);
152             }
153         }
154 
155         // TODO:  do something with U_FAILURE(status);
156         //        (need to look at transliterators overall, not just here.)
157 }
158 
159 //
160 //  getInsertion()
161 //
getInsertion() const162 const UnicodeString &BreakTransliterator::getInsertion() const {
163     return fInsertion;
164 }
165 
166 //
167 //  setInsertion()
168 //
setInsertion(const UnicodeString & insertion)169 void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
170     this->fInsertion = insertion;
171 }
172 
173 //
174 //   replaceableAsString   Hack to let break iterators work
175 //                         on the replaceable text from transliterators.
176 //                         In practice, the only real Replaceable type that we
177 //                         will be seeing is UnicodeString, so this function
178 //                         will normally be efficient.
179 //
replaceableAsString(Replaceable & r)180 UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
181     UnicodeString s;
182     UnicodeString *rs = dynamic_cast<UnicodeString *>(&r);
183     if (rs != NULL) {
184         s = *rs;
185     } else {
186         r.extractBetween(0, r.length(), s);
187     }
188     return s;
189 }
190 
191 U_NAMESPACE_END
192 
#endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */
194