1 /*
2 **********************************************************************
3 *   Copyright (C) 2008-2010, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   Date        Name        Description
7 *   05/11/2008  Andy Heninger  Port from Java
8 **********************************************************************
9 */
10 
11 #include "unicode/utypes.h"
12 
13 #if  !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION
14 
15 #include "unicode/unifilt.h"
16 #include "unicode/uchar.h"
17 #include "unicode/uniset.h"
18 #include "unicode/brkiter.h"
19 #include "brktrans.h"
20 #include "unicode/uchar.h"
21 #include "cmemory.h"
22 #include "uprops.h"
23 #include "uinvchar.h"
24 #include "util.h"
25 #include "uvectr32.h"
26 
27 U_NAMESPACE_BEGIN
28 
29 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
30 
31 static const UChar SPACE       = 32;  // ' '
32 
33 
34 /**
35  * Constructs a transliterator with the default delimiters '{' and
36  * '}'.
37  */
BreakTransliterator(UnicodeFilter * adoptedFilter)38 BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
39     Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
40     fInsertion(SPACE) {
41         bi = NULL;
42         UErrorCode status = U_ZERO_ERROR;
43         boundaries = new UVector32(status);
44     }
45 
46 
47 /**
48  * Destructor.
49  */
~BreakTransliterator()50 BreakTransliterator::~BreakTransliterator() {
51     delete bi;
52     bi = NULL;
53     delete boundaries;
54     boundaries = NULL;
55 }
56 
57 /**
58  * Copy constructor.
59  */
BreakTransliterator(const BreakTransliterator & o)60 BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
61     Transliterator(o) {
62         bi = NULL;
63         if (o.bi != NULL) {
64             bi = o.bi->clone();
65         }
66         fInsertion = o.fInsertion;
67         UErrorCode status = U_ZERO_ERROR;
68         boundaries = new UVector32(status);
69     }
70 
71 
72 /**
73  * Transliterator API.
74  */
clone(void) const75 Transliterator* BreakTransliterator::clone(void) const {
76     return new BreakTransliterator(*this);
77 }
78 
79 /**
80  * Implements {@link Transliterator#handleTransliterate}.
81  */
handleTransliterate(Replaceable & text,UTransPosition & offsets,UBool isIncremental) const82 void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
83                                                     UBool isIncremental ) const {
84 
85         UErrorCode status = U_ZERO_ERROR;
86         boundaries->removeAllElements();
87         BreakTransliterator *nonConstThis = (BreakTransliterator *)this;
88         nonConstThis->getBreakIterator(); // Lazy-create it if necessary
89         UnicodeString sText = replaceableAsString(text);
90         bi->setText(sText);
91         bi->preceding(offsets.start);
92 
93         // To make things much easier, we will stack the boundaries, and then insert at the end.
94         // generally, we won't need too many, since we will be filtered.
95 
96         int32_t boundary;
97         for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
98             if (boundary == 0) continue;
99             // HACK: Check to see that preceeding item was a letter
100 
101             UChar32 cp = sText.char32At(boundary-1);
102             int type = u_charType(cp);
103             //System.out.println(Integer.toString(cp,16) + " (before): " + type);
104             if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
105 
106             cp = sText.char32At(boundary);
107             type = u_charType(cp);
108             //System.out.println(Integer.toString(cp,16) + " (after): " + type);
109             if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
110 
111             boundaries->addElement(boundary, status);
112             // printf("Boundary at %d\n", boundary);
113         }
114 
115         int delta = 0;
116         int lastBoundary = 0;
117 
118         if (boundaries->size() != 0) { // if we found something, adjust
119             delta = boundaries->size() * fInsertion.length();
120             lastBoundary = boundaries->lastElementi();
121 
122             // we do this from the end backwards, so that we don't have to keep updating.
123 
124             while (boundaries->size() > 0) {
125                 boundary = boundaries->popi();
126                 text.handleReplaceBetween(boundary, boundary, fInsertion);
127             }
128         }
129 
130         // Now fix up the return values
131         offsets.contextLimit += delta;
132         offsets.limit += delta;
133         offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
134 
135         // TODO:  do something with U_FAILURE(status);
136         //        (need to look at transliterators overall, not just here.)
137 }
138 
139 //
140 //  getInsertion()
141 //
getInsertion() const142 const UnicodeString &BreakTransliterator::getInsertion() const {
143     return fInsertion;
144 }
145 
146 //
147 //  setInsertion()
148 //
setInsertion(const UnicodeString & insertion)149 void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
150     this->fInsertion = insertion;
151 }
152 
153 //
154 //  getBreakIterator     Lazily create the break iterator if it does
155 //                       not already exist.  Copied from Java, probably
156 //                       better to just create it in the constructor.
157 //
getBreakIterator()158 BreakIterator *BreakTransliterator::getBreakIterator() {
159     UErrorCode status = U_ZERO_ERROR;
160     if (bi == NULL) {
161         // Note:  Thai breaking behavior is universal, it is not
162         //        tied to the Thai locale.
163         bi = BreakIterator::createWordInstance(Locale::getEnglish(), status);
164     }
165     return bi;
166 }
167 
168 //
169 //   replaceableAsString   Hack to let break iterators work
170 //                         on the replaceable text from transliterators.
171 //                         In practice, the only real Replaceable type that we
172 //                         will be seeing is UnicodeString, so this function
173 //                         will normally be efficient.
174 //
replaceableAsString(Replaceable & r)175 UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
176     UnicodeString s;
177     UnicodeString *rs = dynamic_cast<UnicodeString *>(&r);
178     if (rs != NULL) {
179         s = *rs;
180     } else {
181         r.extractBetween(0, r.length(), s);
182     }
183     return s;
184 }
185 
186 U_NAMESPACE_END
187 
#endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */
189