1 /*
2 **********************************************************************
3 *   Copyright (C) 2008-2015, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   Date        Name        Description
7 *   05/11/2008  Andy Heninger  Port from Java
8 **********************************************************************
9 */
10 
11 #include "unicode/utypes.h"
12 
13 #if  !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION
14 
15 #include "unicode/brkiter.h"
16 #include "unicode/localpointer.h"
17 #include "unicode/uchar.h"
18 #include "unicode/unifilt.h"
19 #include "unicode/uniset.h"
20 
21 #include "brktrans.h"
22 #include "cmemory.h"
23 #include "mutex.h"
24 #include "uprops.h"
25 #include "uinvchar.h"
26 #include "util.h"
27 #include "uvectr32.h"
28 
29 U_NAMESPACE_BEGIN
30 
31 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
32 
33 static const UChar SPACE       = 32;  // ' '
34 
35 
36 /**
37  * Constructs a transliterator with the default delimiters '{' and
38  * '}'.
39  */
BreakTransliterator(UnicodeFilter * adoptedFilter)40 BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
41         Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
42         cachedBI(NULL), cachedBoundaries(NULL), fInsertion(SPACE) {
43     }
44 
45 
46 /**
47  * Destructor.
48  */
~BreakTransliterator()49 BreakTransliterator::~BreakTransliterator() {
50 }
51 
52 /**
53  * Copy constructor.
54  */
BreakTransliterator(const BreakTransliterator & o)55 BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
56         Transliterator(o), cachedBI(NULL), cachedBoundaries(NULL), fInsertion(o.fInsertion) {
57 }
58 
59 
60 /**
61  * Transliterator API.
62  */
clone(void) const63 Transliterator* BreakTransliterator::clone(void) const {
64     return new BreakTransliterator(*this);
65 }
66 
67 /**
68  * Implements {@link Transliterator#handleTransliterate}.
69  */
handleTransliterate(Replaceable & text,UTransPosition & offsets,UBool isIncremental) const70 void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
71                                                     UBool isIncremental ) const {
72 
73         UErrorCode status = U_ZERO_ERROR;
74         LocalPointer<BreakIterator> bi;
75         LocalPointer<UVector32> boundaries;
76 
77         {
78             Mutex m;
79             BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
80             boundaries.moveFrom(nonConstThis->cachedBoundaries);
81             bi.moveFrom(nonConstThis->cachedBI);
82         }
83         if (bi.isNull()) {
84             bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status));
85         }
86         if (boundaries.isNull()) {
87             boundaries.adoptInstead(new UVector32(status));
88         }
89 
90         if (bi.isNull() || boundaries.isNull() || U_FAILURE(status)) {
91             return;
92         }
93 
94         boundaries->removeAllElements();
95         UnicodeString sText = replaceableAsString(text);
96         bi->setText(sText);
97         bi->preceding(offsets.start);
98 
99         // To make things much easier, we will stack the boundaries, and then insert at the end.
100         // generally, we won't need too many, since we will be filtered.
101 
102         int32_t boundary;
103         for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
104             if (boundary == 0) continue;
105             // HACK: Check to see that preceeding item was a letter
106 
107             UChar32 cp = sText.char32At(boundary-1);
108             int type = u_charType(cp);
109             //System.out.println(Integer.toString(cp,16) + " (before): " + type);
110             if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
111 
112             cp = sText.char32At(boundary);
113             type = u_charType(cp);
114             //System.out.println(Integer.toString(cp,16) + " (after): " + type);
115             if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
116 
117             boundaries->addElement(boundary, status);
118             // printf("Boundary at %d\n", boundary);
119         }
120 
121         int delta = 0;
122         int lastBoundary = 0;
123 
124         if (boundaries->size() != 0) { // if we found something, adjust
125             delta = boundaries->size() * fInsertion.length();
126             lastBoundary = boundaries->lastElementi();
127 
128             // we do this from the end backwards, so that we don't have to keep updating.
129 
130             while (boundaries->size() > 0) {
131                 boundary = boundaries->popi();
132                 text.handleReplaceBetween(boundary, boundary, fInsertion);
133             }
134         }
135 
136         // Now fix up the return values
137         offsets.contextLimit += delta;
138         offsets.limit += delta;
139         offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
140 
141         // Return break iterator & boundaries vector to the cache.
142         {
143             Mutex m;
144             BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
145             if (nonConstThis->cachedBI.isNull()) {
146                 nonConstThis->cachedBI.moveFrom(bi);
147             }
148             if (nonConstThis->cachedBoundaries.isNull()) {
149                 nonConstThis->cachedBoundaries.moveFrom(boundaries);
150             }
151         }
152 
153         // TODO:  do something with U_FAILURE(status);
154         //        (need to look at transliterators overall, not just here.)
155 }
156 
157 //
158 //  getInsertion()
159 //
getInsertion() const160 const UnicodeString &BreakTransliterator::getInsertion() const {
161     return fInsertion;
162 }
163 
164 //
165 //  setInsertion()
166 //
setInsertion(const UnicodeString & insertion)167 void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
168     this->fInsertion = insertion;
169 }
170 
171 //
172 //   replaceableAsString   Hack to let break iterators work
173 //                         on the replaceable text from transliterators.
174 //                         In practice, the only real Replaceable type that we
175 //                         will be seeing is UnicodeString, so this function
176 //                         will normally be efficient.
177 //
replaceableAsString(Replaceable & r)178 UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
179     UnicodeString s;
180     UnicodeString *rs = dynamic_cast<UnicodeString *>(&r);
181     if (rs != NULL) {
182         s = *rs;
183     } else {
184         r.extractBetween(0, r.length(), s);
185     }
186     return s;
187 }
188 
189 U_NAMESPACE_END
190 
#endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */
192