1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdftext/cpdf_textpagefind.h"
8 
9 #include <cwchar>
10 #include <cwctype>
11 #include <vector>
12 
13 #include "core/fpdftext/cpdf_textpage.h"
14 #include "core/fxcrt/fx_string.h"
15 #include "core/fxcrt/fx_system.h"
16 #include "third_party/base/stl_util.h"
17 
18 namespace {
19 
IsIgnoreSpaceCharacter(FX_WCHAR curChar)20 bool IsIgnoreSpaceCharacter(FX_WCHAR curChar) {
21   if (curChar < 255 || (curChar >= 0x0600 && curChar <= 0x06FF) ||
22       (curChar >= 0xFE70 && curChar <= 0xFEFF) ||
23       (curChar >= 0xFB50 && curChar <= 0xFDFF) ||
24       (curChar >= 0x0400 && curChar <= 0x04FF) ||
25       (curChar >= 0x0500 && curChar <= 0x052F) ||
26       (curChar >= 0xA640 && curChar <= 0xA69F) ||
27       (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 ||
28       (curChar >= 0x2000 && curChar <= 0x206F)) {
29     return false;
30   }
31   return true;
32 }
33 
34 }  // namespace
35 
CPDF_TextPageFind(const CPDF_TextPage * pTextPage)36 CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage)
37     : m_pTextPage(pTextPage),
38       m_flags(0),
39       m_findNextStart(-1),
40       m_findPreStart(-1),
41       m_bMatchCase(false),
42       m_bMatchWholeWord(false),
43       m_resStart(0),
44       m_resEnd(-1),
45       m_IsFind(false) {
46   m_strText = m_pTextPage->GetPageText();
47   int nCount = pTextPage->CountChars();
48   if (nCount)
49     m_CharIndex.push_back(0);
50   for (int i = 0; i < nCount; i++) {
51     FPDF_CHAR_INFO info;
52     pTextPage->GetCharInfo(i, &info);
53     int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
54     if (info.m_Flag == FPDFTEXT_CHAR_NORMAL ||
55         info.m_Flag == FPDFTEXT_CHAR_GENERATED) {
56       if (indexSize % 2) {
57         m_CharIndex.push_back(1);
58       } else {
59         if (indexSize <= 0)
60           continue;
61         m_CharIndex[indexSize - 1] += 1;
62       }
63     } else {
64       if (indexSize % 2) {
65         if (indexSize <= 0)
66           continue;
67         m_CharIndex[indexSize - 1] = i + 1;
68       } else {
69         m_CharIndex.push_back(i + 1);
70       }
71     }
72   }
73   int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
74   if (indexSize % 2)
75     m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1);
76 }
77 
~CPDF_TextPageFind()78 CPDF_TextPageFind::~CPDF_TextPageFind() {}
79 
GetCharIndex(int index) const80 int CPDF_TextPageFind::GetCharIndex(int index) const {
81   return m_pTextPage->CharIndexFromTextIndex(index);
82 }
83 
FindFirst(const CFX_WideString & findwhat,int flags,int startPos)84 bool CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat,
85                                   int flags,
86                                   int startPos) {
87   if (!m_pTextPage)
88     return false;
89   if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE))
90     m_strText = m_pTextPage->GetPageText();
91   CFX_WideString findwhatStr = findwhat;
92   m_findWhat = findwhatStr;
93   m_flags = flags;
94   m_bMatchCase = flags & FPDFTEXT_MATCHCASE;
95   if (m_strText.IsEmpty()) {
96     m_IsFind = false;
97     return true;
98   }
99   FX_STRSIZE len = findwhatStr.GetLength();
100   if (!m_bMatchCase) {
101     findwhatStr.MakeLower();
102     m_strText.MakeLower();
103   }
104   m_bMatchWholeWord = !!(flags & FPDFTEXT_MATCHWHOLEWORD);
105   m_findNextStart = startPos;
106   if (startPos == -1)
107     m_findPreStart = m_strText.GetLength() - 1;
108   else
109     m_findPreStart = startPos;
110   m_csFindWhatArray.clear();
111   int i = 0;
112   while (i < len) {
113     if (findwhatStr.GetAt(i) != ' ')
114       break;
115     i++;
116   }
117   if (i < len)
118     ExtractFindWhat(findwhatStr);
119   else
120     m_csFindWhatArray.push_back(findwhatStr);
121   if (m_csFindWhatArray.empty())
122     return false;
123   m_IsFind = true;
124   m_resStart = 0;
125   m_resEnd = -1;
126   return true;
127 }
128 
FindNext()129 bool CPDF_TextPageFind::FindNext() {
130   if (!m_pTextPage)
131     return false;
132   m_resArray.clear();
133   if (m_findNextStart == -1)
134     return false;
135   if (m_strText.IsEmpty()) {
136     m_IsFind = false;
137     return m_IsFind;
138   }
139   int strLen = m_strText.GetLength();
140   if (m_findNextStart > strLen - 1) {
141     m_IsFind = false;
142     return m_IsFind;
143   }
144   int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray);
145   int nResultPos = 0;
146   int nStartPos = 0;
147   nStartPos = m_findNextStart;
148   bool bSpaceStart = false;
149   for (int iWord = 0; iWord < nCount; iWord++) {
150     CFX_WideString csWord = m_csFindWhatArray[iWord];
151     if (csWord.IsEmpty()) {
152       if (iWord == nCount - 1) {
153         FX_WCHAR strInsert = m_strText.GetAt(nStartPos);
154         if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR ||
155             strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
156           nResultPos = nStartPos + 1;
157           break;
158         }
159         iWord = -1;
160       } else if (iWord == 0) {
161         bSpaceStart = true;
162       }
163       continue;
164     }
165     int endIndex;
166     nResultPos = m_strText.Find(csWord.c_str(), nStartPos);
167     if (nResultPos == -1) {
168       m_IsFind = false;
169       return m_IsFind;
170     }
171     endIndex = nResultPos + csWord.GetLength() - 1;
172     if (iWord == 0)
173       m_resStart = nResultPos;
174     bool bMatch = true;
175     if (iWord != 0 && !bSpaceStart) {
176       int PreResEndPos = nStartPos;
177       int curChar = csWord.GetAt(0);
178       CFX_WideString lastWord = m_csFindWhatArray[iWord - 1];
179       int lastChar = lastWord.GetAt(lastWord.GetLength() - 1);
180       if (nStartPos == nResultPos &&
181           !(IsIgnoreSpaceCharacter(lastChar) ||
182             IsIgnoreSpaceCharacter(curChar))) {
183         bMatch = false;
184       }
185       for (int d = PreResEndPos; d < nResultPos; d++) {
186         FX_WCHAR strInsert = m_strText.GetAt(d);
187         if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
188             strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
189           bMatch = false;
190           break;
191         }
192       }
193     } else if (bSpaceStart) {
194       if (nResultPos > 0) {
195         FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1);
196         if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
197             strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
198           bMatch = false;
199           m_resStart = nResultPos;
200         } else {
201           m_resStart = nResultPos - 1;
202         }
203       }
204     }
205     if (m_bMatchWholeWord && bMatch) {
206       bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex);
207     }
208     nStartPos = endIndex + 1;
209     if (!bMatch) {
210       iWord = -1;
211       if (bSpaceStart)
212         nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
213       else
214         nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
215     }
216   }
217   m_resEnd = nResultPos + m_csFindWhatArray.back().GetLength() - 1;
218   m_IsFind = true;
219   int resStart = GetCharIndex(m_resStart);
220   int resEnd = GetCharIndex(m_resEnd);
221   m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1);
222   if (m_flags & FPDFTEXT_CONSECUTIVE) {
223     m_findNextStart = m_resStart + 1;
224     m_findPreStart = m_resEnd - 1;
225   } else {
226     m_findNextStart = m_resEnd + 1;
227     m_findPreStart = m_resStart - 1;
228   }
229   return m_IsFind;
230 }
231 
FindPrev()232 bool CPDF_TextPageFind::FindPrev() {
233   if (!m_pTextPage)
234     return false;
235   m_resArray.clear();
236   if (m_strText.IsEmpty() || m_findPreStart < 0) {
237     m_IsFind = false;
238     return m_IsFind;
239   }
240   CPDF_TextPageFind findEngine(m_pTextPage);
241   bool ret = findEngine.FindFirst(m_findWhat, m_flags);
242   if (!ret) {
243     m_IsFind = false;
244     return m_IsFind;
245   }
246   int order = -1, MatchedCount = 0;
247   while (ret) {
248     ret = findEngine.FindNext();
249     if (ret) {
250       int order1 = findEngine.GetCurOrder();
251       int MatchedCount1 = findEngine.GetMatchedCount();
252       if (((order1 + MatchedCount1) - 1) > m_findPreStart)
253         break;
254       order = order1;
255       MatchedCount = MatchedCount1;
256     }
257   }
258   if (order == -1) {
259     m_IsFind = false;
260     return m_IsFind;
261   }
262   m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
263   m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1);
264   m_IsFind = true;
265   m_resArray = m_pTextPage->GetRectArray(order, MatchedCount);
266   if (m_flags & FPDFTEXT_CONSECUTIVE) {
267     m_findNextStart = m_resStart + 1;
268     m_findPreStart = m_resEnd - 1;
269   } else {
270     m_findNextStart = m_resEnd + 1;
271     m_findPreStart = m_resStart - 1;
272   }
273   return m_IsFind;
274 }
275 
ExtractFindWhat(const CFX_WideString & findwhat)276 void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) {
277   if (findwhat.IsEmpty())
278     return;
279   int index = 0;
280   while (1) {
281     CFX_WideString csWord = TEXT_EMPTY;
282     int ret =
283         ExtractSubString(csWord, findwhat.c_str(), index, TEXT_SPACE_CHAR);
284     if (csWord.IsEmpty()) {
285       if (ret) {
286         m_csFindWhatArray.push_back(L"");
287         index++;
288         continue;
289       } else {
290         break;
291       }
292     }
293     int pos = 0;
294     while (pos < csWord.GetLength()) {
295       CFX_WideString curStr = csWord.Mid(pos, 1);
296       FX_WCHAR curChar = csWord.GetAt(pos);
297       if (IsIgnoreSpaceCharacter(curChar)) {
298         if (pos > 0 && curChar == 0x2019) {
299           pos++;
300           continue;
301         }
302         if (pos > 0)
303           m_csFindWhatArray.push_back(csWord.Mid(0, pos));
304         m_csFindWhatArray.push_back(curStr);
305         if (pos == csWord.GetLength() - 1) {
306           csWord.clear();
307           break;
308         }
309         csWord = csWord.Right(csWord.GetLength() - pos - 1);
310         pos = 0;
311         continue;
312       }
313       pos++;
314     }
315     if (!csWord.IsEmpty())
316       m_csFindWhatArray.push_back(csWord);
317     index++;
318   }
319 }
320 
IsMatchWholeWord(const CFX_WideString & csPageText,int startPos,int endPos)321 bool CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText,
322                                          int startPos,
323                                          int endPos) {
324   FX_WCHAR char_left = 0;
325   FX_WCHAR char_right = 0;
326   int char_count = endPos - startPos + 1;
327   if (char_count < 1)
328     return false;
329   if (char_count == 1 && csPageText.GetAt(startPos) > 255)
330     return true;
331   if (startPos - 1 >= 0)
332     char_left = csPageText.GetAt(startPos - 1);
333   if (startPos + char_count < csPageText.GetLength())
334     char_right = csPageText.GetAt(startPos + char_count);
335   if ((char_left > 'A' && char_left < 'a') ||
336       (char_left > 'a' && char_left < 'z') ||
337       (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) ||
338       (char_right > 'A' && char_right < 'a') ||
339       (char_right > 'a' && char_right < 'z') ||
340       (char_right > 0xfb00 && char_right < 0xfb06) ||
341       std::iswdigit(char_right)) {
342     return false;
343   }
344   if (!(('A' > char_left || char_left > 'Z') &&
345         ('a' > char_left || char_left > 'z') &&
346         ('A' > char_right || char_right > 'Z') &&
347         ('a' > char_right || char_right > 'z'))) {
348     return false;
349   }
350   if (char_count > 0) {
351     if (csPageText.GetAt(startPos) >= L'0' &&
352         csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' &&
353         char_left <= L'9') {
354       return false;
355     }
356     if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' &&
357         char_right >= L'0' && char_right <= L'9') {
358       return false;
359     }
360   }
361   return true;
362 }
363 
ExtractSubString(CFX_WideString & rString,const FX_WCHAR * lpszFullString,int iSubString,FX_WCHAR chSep)364 bool CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString,
365                                          const FX_WCHAR* lpszFullString,
366                                          int iSubString,
367                                          FX_WCHAR chSep) {
368   if (!lpszFullString)
369     return false;
370   while (iSubString--) {
371     lpszFullString = std::wcschr(lpszFullString, chSep);
372     if (!lpszFullString) {
373       rString.clear();
374       return false;
375     }
376     lpszFullString++;
377     while (*lpszFullString == chSep)
378       lpszFullString++;
379   }
380   const FX_WCHAR* lpchEnd = std::wcschr(lpszFullString, chSep);
381   int nLen = lpchEnd ? (int)(lpchEnd - lpszFullString)
382                      : (int)FXSYS_wcslen(lpszFullString);
383   ASSERT(nLen >= 0);
384   FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString,
385                nLen * sizeof(FX_WCHAR));
386   rString.ReleaseBuffer();
387   return true;
388 }
389 
// Returns a copy of |str| with its characters in reverse order. The reversal
// works on individual FX_WCHAR units.
CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) {
  CFX_WideString reversed;
  int remaining = str.GetLength();
  while (remaining-- > 0)
    reversed += str.GetAt(remaining);
  return reversed;
}
398 
GetCurOrder() const399 int CPDF_TextPageFind::GetCurOrder() const {
400   return GetCharIndex(m_resStart);
401 }
402 
GetMatchedCount() const403 int CPDF_TextPageFind::GetMatchedCount() const {
404   int resStart = GetCharIndex(m_resStart);
405   int resEnd = GetCharIndex(m_resEnd);
406   return resEnd - resStart + 1;
407 }
408