1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
/**
 * A wrapper around ICU's line break iterator that provides customized line
 * break opportunities and identifies words for the purpose of hyphenation.
 */
22 
23 #ifndef MINIKIN_WORD_BREAKER_H
24 #define MINIKIN_WORD_BREAKER_H
25 
26 #include "unicode/brkiter.h"
27 #include <memory>
28 
29 namespace minikin {
30 
31 class WordBreaker {
32 public:
~WordBreaker()33     ~WordBreaker() {
34         finish();
35     }
36 
37     void setLocale(const icu::Locale& locale);
38 
39     void setText(const uint16_t* data, size_t size);
40 
41     // Advance iterator to next word break. Return offset, or -1 if EOT
42     ssize_t next();
43 
44     // Current offset of iterator, equal to 0 at BOT or last return from next()
45     ssize_t current() const;
46 
47     // After calling next(), wordStart() and wordEnd() are offsets defining the previous
48     // word. If wordEnd <= wordStart, it's not a word for the purpose of hyphenation.
49     ssize_t wordStart() const;
50 
51     ssize_t wordEnd() const;
52 
53     int breakBadness() const;
54 
55     void finish();
56 
57 private:
58     int32_t iteratorNext();
59     void detectEmailOrUrl();
60     ssize_t findNextBreakInEmailOrUrl();
61 
62     std::unique_ptr<icu::BreakIterator> mBreakIterator;
63     UText mUText = UTEXT_INITIALIZER;
64     const uint16_t* mText = nullptr;
65     size_t mTextSize;
66     ssize_t mLast;
67     ssize_t mCurrent;
68     bool mIteratorWasReset;
69 
70     // state for the email address / url detector
71     ssize_t mScanOffset;
72     bool mInEmailOrUrl;
73 };
74 
75 }  // namespace minikin
76 
77 #endif  // MINIKIN_WORD_BREAKER_H
78