1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdftext/cpdf_textpagefind.h"
8 
9 #include <cwchar>
10 #include <cwctype>
11 #include <vector>
12 
13 #include "core/fpdftext/cpdf_textpage.h"
14 #include "core/fxcrt/fx_extension.h"
15 #include "core/fxcrt/fx_string.h"
16 #include "core/fxcrt/fx_system.h"
17 #include "third_party/base/ptr_util.h"
18 #include "third_party/base/stl_util.h"
19 
20 namespace {
21 
// Code point 160 (U+00A0, NO-BREAK SPACE). FindNext() accepts it, alongside
// ' ', '\n' and '\r', as a separator between query words in the page text.
22 constexpr wchar_t kNonBreakingSpace = 160;
23 
// Returns false for code points below 255, for U+2113 (decimal 8467), and for
// any code point inside one of the ranges listed in |kExemptRanges|; returns
// true for everything else. ExtractFindWhat() turns each character for which
// this returns true into its own single-character search token.
//
// The listed ranges line up with the Unicode Arabic, Arabic Presentation
// Forms-A/B, Cyrillic (plus Supplement and Extended-A/B) and General
// Punctuation blocks.
bool IsIgnoreSpaceCharacter(wchar_t curChar) {
  struct CodePointRange {
    wchar_t first;
    wchar_t last;
  };
  static constexpr CodePointRange kExemptRanges[] = {
      {0x0600, 0x06FF}, {0xFE70, 0xFEFF}, {0xFB50, 0xFDFF},
      {0x0400, 0x04FF}, {0x0500, 0x052F}, {0xA640, 0xA69F},
      {0x2DE0, 0x2DFF}, {0x2000, 0x206F},
  };

  // Note the strict bound: 255 itself is not exempt.
  if (curChar < 255 || curChar == 8467)
    return false;

  for (const CodePointRange& range : kExemptRanges) {
    if (curChar >= range.first && curChar <= range.last)
      return false;
  }
  return true;
}
37 
IsMatchWholeWord(const WideString & csPageText,size_t startPos,size_t endPos)38 bool IsMatchWholeWord(const WideString& csPageText,
39                       size_t startPos,
40                       size_t endPos) {
41   if (startPos > endPos)
42     return false;
43   wchar_t char_left = 0;
44   wchar_t char_right = 0;
45   size_t char_count = endPos - startPos + 1;
46   if (char_count == 0)
47     return false;
48   if (char_count == 1 && csPageText[startPos] > 255)
49     return true;
50   if (startPos >= 1)
51     char_left = csPageText[startPos - 1];
52   if (startPos + char_count < csPageText.GetLength())
53     char_right = csPageText[startPos + char_count];
54   if ((char_left > 'A' && char_left < 'a') ||
55       (char_left > 'a' && char_left < 'z') ||
56       (char_left > 0xfb00 && char_left < 0xfb06) ||
57       FXSYS_IsDecimalDigit(char_left) ||
58       (char_right > 'A' && char_right < 'a') ||
59       (char_right > 'a' && char_right < 'z') ||
60       (char_right > 0xfb00 && char_right < 0xfb06) ||
61       FXSYS_IsDecimalDigit(char_right)) {
62     return false;
63   }
64   if (!(('A' > char_left || char_left > 'Z') &&
65         ('a' > char_left || char_left > 'z') &&
66         ('A' > char_right || char_right > 'Z') &&
67         ('a' > char_right || char_right > 'z'))) {
68     return false;
69   }
70   if (char_count > 0) {
71     if (FXSYS_IsDecimalDigit(char_left) &&
72         FXSYS_IsDecimalDigit(csPageText[startPos])) {
73       return false;
74     }
75     if (FXSYS_IsDecimalDigit(char_right) &&
76         FXSYS_IsDecimalDigit(csPageText[endPos])) {
77       return false;
78     }
79   }
80   return true;
81 }
82 
// Returns |wsOriginal| unchanged when |bMatchCase| is true; otherwise returns
// a copy on which MakeLower() has been applied.
WideString GetStringCase(const WideString& wsOriginal, bool bMatchCase) {
  WideString result(wsOriginal);
  if (!bMatchCase)
    result.MakeLower();
  return result;
}
91 
ExtractSubString(const wchar_t * lpszFullString,int iSubString)92 Optional<WideString> ExtractSubString(const wchar_t* lpszFullString,
93                                       int iSubString) {
94   ASSERT(lpszFullString);
95 
96   while (iSubString--) {
97     lpszFullString = std::wcschr(lpszFullString, L' ');
98     if (!lpszFullString)
99       return {};
100 
101     lpszFullString++;
102     while (*lpszFullString == L' ')
103       lpszFullString++;
104   }
105 
106   const wchar_t* lpchEnd = std::wcschr(lpszFullString, L' ');
107   int nLen = lpchEnd ? static_cast<int>(lpchEnd - lpszFullString)
108                      : static_cast<int>(wcslen(lpszFullString));
109   if (nLen < 0)
110     return {};
111 
112   return WideString(lpszFullString, static_cast<size_t>(nLen));
113 }
114 
ExtractFindWhat(const WideString & findwhat)115 std::vector<WideString> ExtractFindWhat(const WideString& findwhat) {
116   std::vector<WideString> findwhat_array;
117 
118   size_t len = findwhat.GetLength();
119   size_t i = 0;
120   for (i = 0; i < len; ++i)
121     if (findwhat[i] != ' ')
122       break;
123   if (i == len) {
124     findwhat_array.push_back(findwhat);
125     return findwhat_array;
126   }
127 
128   int index = 0;
129   while (1) {
130     Optional<WideString> word = ExtractSubString(findwhat.c_str(), index);
131     if (!word)
132       break;
133 
134     if (word->IsEmpty()) {
135       findwhat_array.push_back(L"");
136       index++;
137       continue;
138     }
139 
140     size_t pos = 0;
141     while (pos < word->GetLength()) {
142       WideString curStr = word->Substr(pos, 1);
143       wchar_t curChar = (*word)[pos];
144       if (IsIgnoreSpaceCharacter(curChar)) {
145         if (pos > 0 && curChar == 0x2019) {
146           pos++;
147           continue;
148         }
149         if (pos > 0)
150           findwhat_array.push_back(word->First(pos));
151         findwhat_array.push_back(curStr);
152         if (pos == word->GetLength() - 1) {
153           word->clear();
154           break;
155         }
156         word.emplace(word->Last(word->GetLength() - pos - 1));
157         pos = 0;
158         continue;
159       }
160       pos++;
161     }
162 
163     if (!word->IsEmpty())
164       findwhat_array.push_back(word.value());
165     index++;
166   }
167   return findwhat_array;
168 }
169 
170 }  // namespace
171 
172 // static
Create(const CPDF_TextPage * pTextPage,const WideString & findwhat,const Options & options,Optional<size_t> startPos)173 std::unique_ptr<CPDF_TextPageFind> CPDF_TextPageFind::Create(
174     const CPDF_TextPage* pTextPage,
175     const WideString& findwhat,
176     const Options& options,
177     Optional<size_t> startPos) {
178   std::vector<WideString> findwhat_array =
179       ExtractFindWhat(GetStringCase(findwhat, options.bMatchCase));
180   auto find = pdfium::WrapUnique(
181       new CPDF_TextPageFind(pTextPage, findwhat_array, options, startPos));
182   find->FindFirst();
183   return find;
184 }
185 
CPDF_TextPageFind(const CPDF_TextPage * pTextPage,const std::vector<WideString> & findwhat_array,const Options & options,Optional<size_t> startPos)186 CPDF_TextPageFind::CPDF_TextPageFind(
187     const CPDF_TextPage* pTextPage,
188     const std::vector<WideString>& findwhat_array,
189     const Options& options,
190     Optional<size_t> startPos)
191     : m_pTextPage(pTextPage),
192       m_strText(GetStringCase(pTextPage->GetAllPageText(), options.bMatchCase)),
193       m_csFindWhatArray(findwhat_array),
194       m_options(options) {
195   if (!m_strText.IsEmpty()) {
196     m_findNextStart = startPos;
197     m_findPreStart = startPos.value_or(m_strText.GetLength() - 1);
198   }
199 }
200 
// Defaulted destructor, defined out of line in this file.
201 CPDF_TextPageFind::~CPDF_TextPageFind() = default;
202 
GetCharIndex(int index) const203 int CPDF_TextPageFind::GetCharIndex(int index) const {
204   return m_pTextPage->CharIndexFromTextIndex(index);
205 }
206 
FindFirst()207 bool CPDF_TextPageFind::FindFirst() {
208   return m_strText.IsEmpty() || !m_csFindWhatArray.empty();
209 }
210 
FindNext()211 bool CPDF_TextPageFind::FindNext() {
212   if (m_strText.IsEmpty() || !m_findNextStart.has_value())
213     return false;
214 
215   size_t strLen = m_strText.GetLength();
216   if (m_findNextStart.value() > strLen - 1)
217     return false;
218 
219   int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray);
220   Optional<size_t> nResultPos = 0;
221   size_t nStartPos = m_findNextStart.value();
222   bool bSpaceStart = false;
223   for (int iWord = 0; iWord < nCount; iWord++) {
224     WideString csWord = m_csFindWhatArray[iWord];
225     if (csWord.IsEmpty()) {
226       if (iWord == nCount - 1) {
227         wchar_t strInsert = m_strText[nStartPos];
228         if (strInsert == L'\n' || strInsert == L' ' || strInsert == L'\r' ||
229             strInsert == kNonBreakingSpace) {
230           nResultPos = nStartPos + 1;
231           break;
232         }
233         iWord = -1;
234       } else if (iWord == 0) {
235         bSpaceStart = true;
236       }
237       continue;
238     }
239     nResultPos = m_strText.Find(csWord.AsStringView(), nStartPos);
240     if (!nResultPos.has_value())
241       return false;
242 
243     size_t endIndex = nResultPos.value() + csWord.GetLength() - 1;
244     if (iWord == 0)
245       m_resStart = nResultPos.value();
246     bool bMatch = true;
247     if (iWord != 0 && !bSpaceStart) {
248       size_t PreResEndPos = nStartPos;
249       int curChar = csWord[0];
250       WideString lastWord = m_csFindWhatArray[iWord - 1];
251       int lastChar = lastWord.Back();
252       if (nStartPos == nResultPos.value() &&
253           !(IsIgnoreSpaceCharacter(lastChar) ||
254             IsIgnoreSpaceCharacter(curChar))) {
255         bMatch = false;
256       }
257       for (size_t d = PreResEndPos; d < nResultPos.value(); d++) {
258         wchar_t strInsert = m_strText[d];
259         if (strInsert != L'\n' && strInsert != L' ' && strInsert != L'\r' &&
260             strInsert != kNonBreakingSpace) {
261           bMatch = false;
262           break;
263         }
264       }
265     } else if (bSpaceStart) {
266       if (nResultPos.value() > 0) {
267         wchar_t strInsert = m_strText[nResultPos.value() - 1];
268         if (strInsert != L'\n' && strInsert != L' ' && strInsert != L'\r' &&
269             strInsert != kNonBreakingSpace) {
270           bMatch = false;
271           m_resStart = nResultPos.value();
272         } else {
273           m_resStart = nResultPos.value() - 1;
274         }
275       }
276     }
277     if (m_options.bMatchWholeWord && bMatch)
278       bMatch = IsMatchWholeWord(m_strText, nResultPos.value(), endIndex);
279 
280     nStartPos = endIndex + 1;
281     if (!bMatch) {
282       iWord = -1;
283       size_t index = bSpaceStart ? 1 : 0;
284       nStartPos = m_resStart + m_csFindWhatArray[index].GetLength();
285     }
286   }
287   m_resEnd = nResultPos.value() + m_csFindWhatArray.back().GetLength() - 1;
288   if (m_options.bConsecutive) {
289     m_findNextStart = m_resStart + 1;
290     m_findPreStart = m_resEnd - 1;
291   } else {
292     m_findNextStart = m_resEnd + 1;
293     m_findPreStart = m_resStart - 1;
294   }
295   return true;
296 }
297 
FindPrev()298 bool CPDF_TextPageFind::FindPrev() {
299   if (m_strText.IsEmpty() || !m_findPreStart.has_value())
300     return false;
301 
302   CPDF_TextPageFind find_engine(m_pTextPage.Get(), m_csFindWhatArray, m_options,
303                                 0);
304   if (!find_engine.FindFirst())
305     return false;
306 
307   int order = -1;
308   int matches = 0;
309   while (find_engine.FindNext()) {
310     int cur_order = find_engine.GetCurOrder();
311     int cur_match = find_engine.GetMatchedCount();
312     int temp = cur_order + cur_match;
313     if (temp < 0 || static_cast<size_t>(temp) > m_findPreStart.value() + 1)
314       break;
315 
316     order = cur_order;
317     matches = cur_match;
318   }
319   if (order == -1)
320     return false;
321 
322   m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
323   m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + matches - 1);
324   if (m_options.bConsecutive) {
325     m_findNextStart = m_resStart + 1;
326     m_findPreStart = m_resEnd - 1;
327   } else {
328     m_findNextStart = m_resEnd + 1;
329     m_findPreStart = m_resStart - 1;
330   }
331   return true;
332 }
333 
GetCurOrder() const334 int CPDF_TextPageFind::GetCurOrder() const {
335   return GetCharIndex(m_resStart);
336 }
337 
GetMatchedCount() const338 int CPDF_TextPageFind::GetMatchedCount() const {
339   int resStart = GetCharIndex(m_resStart);
340   int resEnd = GetCharIndex(m_resEnd);
341   return resEnd - resStart + 1;
342 }
343