1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /**
18  * A wrapper around ICU's line break iterator, that gives customized line
19  * break opportunities, as well as identifying words for the purpose of
20  * hyphenation.
21  */
22 
23 #ifndef MINIKIN_WORD_BREAKER_H
24 #define MINIKIN_WORD_BREAKER_H
25 
#include <cstddef>
#include <cstdint>
#include <list>
#include <memory>
#include <mutex>

#include <unicode/brkiter.h>

#include "minikin/Macros.h"
#include "minikin/Range.h"

#include "Locale.h"
35 
36 namespace minikin {
37 
38 // A class interface for providing pooling implementation of ICU's line breaker.
39 // The implementation can be customized for testing purposes.
40 class ICULineBreakerPool {
41 public:
42     struct Slot {
SlotSlot43         Slot() : localeId(0), breaker(nullptr) {}
SlotSlot44         Slot(uint64_t localeId, std::unique_ptr<icu::BreakIterator>&& breaker)
45                 : localeId(localeId), breaker(std::move(breaker)) {}
46 
47         Slot(Slot&& other) = default;
48         Slot& operator=(Slot&& other) = default;
49 
50         // Forbid copy and assignment.
51         Slot(const Slot&) = delete;
52         Slot& operator=(const Slot&) = delete;
53 
54         uint64_t localeId;
55         std::unique_ptr<icu::BreakIterator> breaker;
56     };
~ICULineBreakerPool()57     virtual ~ICULineBreakerPool() {}
58     virtual Slot acquire(const Locale& locale) = 0;
59     virtual void release(Slot&& slot) = 0;
60 };
61 
// A singleton implementation of the ICU line breaker pool.
// Creating an ICU line breaker instance takes some time, so instances are pooled for later use.
64 class ICULineBreakerPoolImpl : public ICULineBreakerPool {
65 public:
66     Slot acquire(const Locale& locale) override;
67     void release(Slot&& slot) override;
68 
getInstance()69     static ICULineBreakerPoolImpl& getInstance() {
70         static ICULineBreakerPoolImpl pool;
71         return pool;
72     }
73 
74 protected:
75     // protected for testing purposes.
76     static constexpr size_t MAX_POOL_SIZE = 4;
ICULineBreakerPoolImpl()77     ICULineBreakerPoolImpl(){};  // singleton.
getPoolSize()78     size_t getPoolSize() const {
79         std::lock_guard<std::mutex> lock(mMutex);
80         return mPool.size();
81     }
82 
83 private:
84     std::list<Slot> mPool GUARDED_BY(mMutex);
85     mutable std::mutex mMutex;
86 };
87 
88 class WordBreaker {
89 public:
~WordBreaker()90     virtual ~WordBreaker() { finish(); }
91 
92     WordBreaker();
93 
94     void setText(const uint16_t* data, size_t size);
95 
96     // Advance iterator to next word break with current locale. Return offset, or -1 if EOT
97     ssize_t next();
98 
99     // Advance iterator to the break just after "from" with using the new provided locale.
100     // Return offset, or -1 if EOT
101     ssize_t followingWithLocale(const Locale& locale, size_t from);
102 
103     // Current offset of iterator, equal to 0 at BOT or last return from next()
104     ssize_t current() const;
105 
106     // After calling next(), wordStart() and wordEnd() are offsets defining the previous
107     // word. If wordEnd <= wordStart, it's not a word for the purpose of hyphenation.
108     ssize_t wordStart() const;
109 
110     ssize_t wordEnd() const;
111 
112     // Returns the range from wordStart() to wordEnd().
113     // If wordEnd() <= wordStart(), returns empty range.
wordRange()114     inline Range wordRange() const {
115         const uint32_t start = wordStart();
116         const uint32_t end = wordEnd();
117         return start < end ? Range(start, end) : Range(end, end);
118     }
119 
120     int breakBadness() const;
121 
122     void finish();
123 
124 protected:
125     // protected virtual for testing purpose.
126     // Caller must release the pool.
127     WordBreaker(ICULineBreakerPool* pool);
128 
129 private:
130     int32_t iteratorNext();
131     void detectEmailOrUrl();
132     ssize_t findNextBreakInEmailOrUrl();
133 
134     // Doesn't take ownership. Must not be nullptr. Must be set in constructor.
135     ICULineBreakerPool* mPool;
136 
137     ICULineBreakerPool::Slot mIcuBreaker;
138 
139     UText mUText = UTEXT_INITIALIZER;
140     const uint16_t* mText = nullptr;
141     size_t mTextSize;
142     ssize_t mLast;
143     ssize_t mCurrent;
144 
145     // state for the email address / url detector
146     ssize_t mScanOffset;
147     bool mInEmailOrUrl;
148 };
149 
150 }  // namespace minikin
151 
152 #endif  // MINIKIN_WORD_BREAKER_H
153