1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdftext/cpdf_textpagefind.h"
8 
9 #include <cwchar>
10 #include <cwctype>
11 #include <vector>
12 
13 #include "core/fpdftext/cpdf_textpage.h"
14 #include "core/fxcrt/fx_string.h"
15 #include "core/fxcrt/fx_system.h"
16 #include "third_party/base/stl_util.h"
17 
18 namespace {
19 
// Returns false for characters from scripts listed below and true for every
// other character. Callers use a true result to split a search term around the
// character and to accept two adjacent terms with no whitespace between them.
bool IsIgnoreSpaceCharacter(wchar_t curChar) {
  // Everything below 255 and U+2113 (decimal 8467) is never "ignore-space".
  if (curChar < 255 || curChar == 8467)
    return false;

  // Inclusive code point ranges that are likewise never "ignore-space".
  static const struct {
    int first;
    int last;
  } kExcludedRanges[] = {
      {0x0600, 0x06FF},  // Arabic
      {0xFE70, 0xFEFF},  // Arabic Presentation Forms-B
      {0xFB50, 0xFDFF},  // Arabic Presentation Forms-A
      {0x0400, 0x04FF},  // Cyrillic
      {0x0500, 0x052F},  // Cyrillic Supplement
      {0xA640, 0xA69F},  // Cyrillic Extended-B
      {0x2DE0, 0x2DFF},  // Cyrillic Extended-A
      {0x2000, 0x206F},  // General Punctuation
  };
  for (const auto& range : kExcludedRanges) {
    if (curChar >= range.first && curChar <= range.last)
      return false;
  }
  return true;
}
33 
34 }  // namespace
35 
CPDF_TextPageFind(const CPDF_TextPage * pTextPage)36 CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage)
37     : m_pTextPage(pTextPage),
38       m_flags(0),
39       m_bMatchCase(false),
40       m_bMatchWholeWord(false),
41       m_resStart(0),
42       m_resEnd(-1),
43       m_IsFind(false) {
44   m_strText = m_pTextPage->GetAllPageText();
45   int nCount = pTextPage->CountChars();
46   if (nCount)
47     m_CharIndex.push_back(0);
48   for (int i = 0; i < nCount; i++) {
49     FPDF_CHAR_INFO info;
50     pTextPage->GetCharInfo(i, &info);
51     int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
52     if (info.m_Flag == FPDFTEXT_CHAR_NORMAL ||
53         info.m_Flag == FPDFTEXT_CHAR_GENERATED) {
54       if (indexSize % 2) {
55         m_CharIndex.push_back(1);
56       } else {
57         if (indexSize <= 0)
58           continue;
59         m_CharIndex[indexSize - 1] += 1;
60       }
61     } else {
62       if (indexSize % 2) {
63         if (indexSize <= 0)
64           continue;
65         m_CharIndex[indexSize - 1] = i + 1;
66       } else {
67         m_CharIndex.push_back(i + 1);
68       }
69     }
70   }
71   int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
72   if (indexSize % 2)
73     m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1);
74 }
75 
~CPDF_TextPageFind()76 CPDF_TextPageFind::~CPDF_TextPageFind() {}
77 
GetCharIndex(int index) const78 int CPDF_TextPageFind::GetCharIndex(int index) const {
79   return m_pTextPage->CharIndexFromTextIndex(index);
80 }
81 
FindFirst(const WideString & findwhat,int flags,Optional<size_t> startPos)82 bool CPDF_TextPageFind::FindFirst(const WideString& findwhat,
83                                   int flags,
84                                   Optional<size_t> startPos) {
85   if (!m_pTextPage)
86     return false;
87   if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE))
88     m_strText = m_pTextPage->GetAllPageText();
89   WideString findwhatStr = findwhat;
90   m_findWhat = findwhatStr;
91   m_flags = flags;
92   m_bMatchCase = flags & FPDFTEXT_MATCHCASE;
93   if (m_strText.IsEmpty()) {
94     m_IsFind = false;
95     return true;
96   }
97   size_t len = findwhatStr.GetLength();
98   if (!m_bMatchCase) {
99     findwhatStr.MakeLower();
100     m_strText.MakeLower();
101   }
102   m_bMatchWholeWord = !!(flags & FPDFTEXT_MATCHWHOLEWORD);
103   m_findNextStart = startPos;
104   if (!startPos.has_value()) {
105     if (!m_strText.IsEmpty())
106       m_findPreStart = m_strText.GetLength() - 1;
107   } else {
108     m_findPreStart = startPos;
109   }
110 
111   m_csFindWhatArray.clear();
112   size_t i = 0;
113   for (i = 0; i < len; ++i)
114     if (findwhatStr[i] != ' ')
115       break;
116   if (i < len)
117     ExtractFindWhat(findwhatStr);
118   else
119     m_csFindWhatArray.push_back(findwhatStr);
120   if (m_csFindWhatArray.empty())
121     return false;
122 
123   m_IsFind = true;
124   m_resStart = 0;
125   m_resEnd = -1;
126   return true;
127 }
128 
FindNext()129 bool CPDF_TextPageFind::FindNext() {
130   if (!m_pTextPage)
131     return false;
132   m_resArray.clear();
133   if (!m_findNextStart.has_value())
134     return false;
135   if (m_strText.IsEmpty()) {
136     m_IsFind = false;
137     return m_IsFind;
138   }
139   size_t strLen = m_strText.GetLength();
140   if (m_findNextStart.value() > strLen - 1) {
141     m_IsFind = false;
142     return m_IsFind;
143   }
144   int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray);
145   Optional<size_t> nResultPos = 0;
146   size_t nStartPos = m_findNextStart.value();
147   bool bSpaceStart = false;
148   for (int iWord = 0; iWord < nCount; iWord++) {
149     WideString csWord = m_csFindWhatArray[iWord];
150     if (csWord.IsEmpty()) {
151       if (iWord == nCount - 1) {
152         wchar_t strInsert = m_strText[nStartPos];
153         if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR ||
154             strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
155           nResultPos = nStartPos + 1;
156           break;
157         }
158         iWord = -1;
159       } else if (iWord == 0) {
160         bSpaceStart = true;
161       }
162       continue;
163     }
164     nResultPos = m_strText.Find(csWord.c_str(), nStartPos);
165     if (!nResultPos.has_value()) {
166       m_IsFind = false;
167       return m_IsFind;
168     }
169     size_t endIndex = nResultPos.value() + csWord.GetLength() - 1;
170     if (iWord == 0)
171       m_resStart = nResultPos.value();
172     bool bMatch = true;
173     if (iWord != 0 && !bSpaceStart) {
174       size_t PreResEndPos = nStartPos;
175       int curChar = csWord[0];
176       WideString lastWord = m_csFindWhatArray[iWord - 1];
177       int lastChar = lastWord[lastWord.GetLength() - 1];
178       if (nStartPos == nResultPos.value() &&
179           !(IsIgnoreSpaceCharacter(lastChar) ||
180             IsIgnoreSpaceCharacter(curChar))) {
181         bMatch = false;
182       }
183       for (size_t d = PreResEndPos; d < nResultPos.value(); d++) {
184         wchar_t strInsert = m_strText[d];
185         if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
186             strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
187           bMatch = false;
188           break;
189         }
190       }
191     } else if (bSpaceStart) {
192       if (nResultPos.value() > 0) {
193         wchar_t strInsert = m_strText[nResultPos.value() - 1];
194         if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
195             strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
196           bMatch = false;
197           m_resStart = nResultPos.value();
198         } else {
199           m_resStart = nResultPos.value() - 1;
200         }
201       }
202     }
203     if (m_bMatchWholeWord && bMatch) {
204       bMatch = IsMatchWholeWord(m_strText, nResultPos.value(), endIndex);
205     }
206     nStartPos = endIndex + 1;
207     if (!bMatch) {
208       iWord = -1;
209       if (bSpaceStart)
210         nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
211       else
212         nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
213     }
214   }
215   m_resEnd = nResultPos.value() + m_csFindWhatArray.back().GetLength() - 1;
216   m_IsFind = true;
217   int resStart = GetCharIndex(m_resStart);
218   int resEnd = GetCharIndex(m_resEnd);
219   m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1);
220   if (m_flags & FPDFTEXT_CONSECUTIVE) {
221     m_findNextStart = m_resStart + 1;
222     m_findPreStart = m_resEnd - 1;
223   } else {
224     m_findNextStart = m_resEnd + 1;
225     m_findPreStart = m_resStart - 1;
226   }
227   return m_IsFind;
228 }
229 
FindPrev()230 bool CPDF_TextPageFind::FindPrev() {
231   if (!m_pTextPage)
232     return false;
233   m_resArray.clear();
234   if (m_strText.IsEmpty() || !m_findPreStart.has_value()) {
235     m_IsFind = false;
236     return m_IsFind;
237   }
238   CPDF_TextPageFind findEngine(m_pTextPage.Get());
239   bool ret = findEngine.FindFirst(m_findWhat, m_flags, Optional<size_t>(0));
240   if (!ret) {
241     m_IsFind = false;
242     return m_IsFind;
243   }
244   int order = -1;
245   int MatchedCount = 0;
246   while (ret) {
247     ret = findEngine.FindNext();
248     if (ret) {
249       int order1 = findEngine.GetCurOrder();
250       int MatchedCount1 = findEngine.GetMatchedCount();
251       int temp = order1 + MatchedCount1;
252       if (temp < 0 || static_cast<size_t>(temp) > m_findPreStart.value() + 1)
253         break;
254       order = order1;
255       MatchedCount = MatchedCount1;
256     }
257   }
258   if (order == -1) {
259     m_IsFind = false;
260     return m_IsFind;
261   }
262   m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
263   m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1);
264   m_IsFind = true;
265   m_resArray = m_pTextPage->GetRectArray(order, MatchedCount);
266   if (m_flags & FPDFTEXT_CONSECUTIVE) {
267     m_findNextStart = m_resStart + 1;
268     m_findPreStart = m_resEnd - 1;
269   } else {
270     m_findNextStart = m_resEnd + 1;
271     m_findPreStart = m_resStart - 1;
272   }
273   return m_IsFind;
274 }
275 
ExtractFindWhat(const WideString & findwhat)276 void CPDF_TextPageFind::ExtractFindWhat(const WideString& findwhat) {
277   if (findwhat.IsEmpty())
278     return;
279   int index = 0;
280   while (1) {
281     Optional<WideString> word =
282         ExtractSubString(findwhat.c_str(), index, TEXT_SPACE_CHAR);
283     if (!word)
284       break;
285 
286     if (word->IsEmpty()) {
287       m_csFindWhatArray.push_back(L"");
288       index++;
289       continue;
290     }
291 
292     size_t pos = 0;
293     while (pos < word->GetLength()) {
294       WideString curStr = word->Mid(pos, 1);
295       wchar_t curChar = word->operator[](pos);
296       if (IsIgnoreSpaceCharacter(curChar)) {
297         if (pos > 0 && curChar == 0x2019) {
298           pos++;
299           continue;
300         }
301         if (pos > 0)
302           m_csFindWhatArray.push_back(word->Left(pos));
303         m_csFindWhatArray.push_back(curStr);
304         if (pos == word->GetLength() - 1) {
305           word->clear();
306           break;
307         }
308         word.emplace(word->Right(word->GetLength() - pos - 1));
309         pos = 0;
310         continue;
311       }
312       pos++;
313     }
314 
315     if (!word->IsEmpty())
316       m_csFindWhatArray.push_back(word.value());
317     index++;
318   }
319 }
320 
IsMatchWholeWord(const WideString & csPageText,size_t startPos,size_t endPos)321 bool CPDF_TextPageFind::IsMatchWholeWord(const WideString& csPageText,
322                                          size_t startPos,
323                                          size_t endPos) {
324   if (startPos > endPos)
325     return false;
326   wchar_t char_left = 0;
327   wchar_t char_right = 0;
328   size_t char_count = endPos - startPos + 1;
329   if (char_count == 0)
330     return false;
331   if (char_count == 1 && csPageText[startPos] > 255)
332     return true;
333   if (startPos >= 1)
334     char_left = csPageText[startPos - 1];
335   if (startPos + char_count < csPageText.GetLength())
336     char_right = csPageText[startPos + char_count];
337   if ((char_left > 'A' && char_left < 'a') ||
338       (char_left > 'a' && char_left < 'z') ||
339       (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) ||
340       (char_right > 'A' && char_right < 'a') ||
341       (char_right > 'a' && char_right < 'z') ||
342       (char_right > 0xfb00 && char_right < 0xfb06) ||
343       std::iswdigit(char_right)) {
344     return false;
345   }
346   if (!(('A' > char_left || char_left > 'Z') &&
347         ('a' > char_left || char_left > 'z') &&
348         ('A' > char_right || char_right > 'Z') &&
349         ('a' > char_right || char_right > 'z'))) {
350     return false;
351   }
352   if (char_count > 0) {
353     if (std::iswdigit(char_left) && std::iswdigit(csPageText[startPos]))
354       return false;
355     if (std::iswdigit(char_right) && std::iswdigit(csPageText[endPos]))
356       return false;
357   }
358   return true;
359 }
360 
ExtractSubString(const wchar_t * lpszFullString,int iSubString,wchar_t chSep)361 Optional<WideString> CPDF_TextPageFind::ExtractSubString(
362     const wchar_t* lpszFullString,
363     int iSubString,
364     wchar_t chSep) {
365   if (!lpszFullString)
366     return {};
367 
368   while (iSubString--) {
369     lpszFullString = std::wcschr(lpszFullString, chSep);
370     if (!lpszFullString)
371       return {};
372 
373     lpszFullString++;
374     while (*lpszFullString == chSep)
375       lpszFullString++;
376   }
377 
378   const wchar_t* lpchEnd = std::wcschr(lpszFullString, chSep);
379   int nLen = lpchEnd ? static_cast<int>(lpchEnd - lpszFullString)
380                      : static_cast<int>(wcslen(lpszFullString));
381   if (nLen < 0)
382     return {};
383 
384   return {WideString(lpszFullString, static_cast<size_t>(nLen))};
385 }
386 
GetCurOrder() const387 int CPDF_TextPageFind::GetCurOrder() const {
388   return GetCharIndex(m_resStart);
389 }
390 
GetMatchedCount() const391 int CPDF_TextPageFind::GetMatchedCount() const {
392   int resStart = GetCharIndex(m_resStart);
393   int resEnd = GetCharIndex(m_resEnd);
394   return resEnd - resStart + 1;
395 }
396