1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include <algorithm>
8 #include <cctype>
9 #include <cwctype>
10 #include <memory>
11
12 #include "core/include/fpdfapi/fpdf_module.h"
13 #include "core/include/fpdfapi/fpdf_page.h"
14 #include "core/include/fpdfapi/fpdf_pageobj.h"
15 #include "core/include/fpdfapi/fpdf_resource.h"
16 #include "core/include/fpdftext/fpdf_text.h"
17 #include "core/include/fxcrt/fx_bidi.h"
18 #include "core/include/fxcrt/fx_ext.h"
19 #include "core/include/fxcrt/fx_ucd.h"
20 #include "text_int.h"
21
22 namespace {
23
_IsIgnoreSpaceCharacter(FX_WCHAR curChar)24 FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) {
25 if (curChar < 255) {
26 return FALSE;
27 }
28 if ((curChar >= 0x0600 && curChar <= 0x06FF) ||
29 (curChar >= 0xFE70 && curChar <= 0xFEFF) ||
30 (curChar >= 0xFB50 && curChar <= 0xFDFF) ||
31 (curChar >= 0x0400 && curChar <= 0x04FF) ||
32 (curChar >= 0x0500 && curChar <= 0x052F) ||
33 (curChar >= 0xA640 && curChar <= 0xA69F) ||
34 (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 ||
35 (curChar >= 0x2000 && curChar <= 0x206F)) {
36 return FALSE;
37 }
38 return TRUE;
39 }
40
_NormalizeThreshold(FX_FLOAT threshold)41 FX_FLOAT _NormalizeThreshold(FX_FLOAT threshold) {
42 if (threshold < 300) {
43 return threshold / 2.0f;
44 }
45 if (threshold < 500) {
46 return threshold / 4.0f;
47 }
48 if (threshold < 700) {
49 return threshold / 5.0f;
50 }
51 return threshold / 6.0f;
52 }
53
_CalculateBaseSpace(const CPDF_TextObject * pTextObj,const CFX_Matrix & matrix)54 FX_FLOAT _CalculateBaseSpace(const CPDF_TextObject* pTextObj,
55 const CFX_Matrix& matrix) {
56 FX_FLOAT baseSpace = 0.0;
57 const int nItems = pTextObj->CountItems();
58 if (pTextObj->m_TextState.GetObject()->m_CharSpace && nItems >= 3) {
59 FX_BOOL bAllChar = TRUE;
60 FX_FLOAT spacing = matrix.TransformDistance(
61 pTextObj->m_TextState.GetObject()->m_CharSpace);
62 baseSpace = spacing;
63 for (int i = 0; i < nItems; i++) {
64 CPDF_TextObjectItem item;
65 pTextObj->GetItemInfo(i, &item);
66 if (item.m_CharCode == (FX_DWORD)-1) {
67 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
68 FX_FLOAT kerning = -fontsize_h * item.m_OriginX / 1000;
69 baseSpace = std::min(baseSpace, kerning + spacing);
70 bAllChar = FALSE;
71 }
72 }
73 if (baseSpace < 0.0 || (nItems == 3 && !bAllChar)) {
74 baseSpace = 0.0;
75 }
76 }
77 return baseSpace;
78 }
79
// Font size reported by CPDF_TextPage::GetCharInfo() when a character has no
// text object or its text object has no font.
const FX_FLOAT kDefaultFontSize = 1.0f;
81
82 } // namespace
83
CPDFText_ParseOptions()84 CPDFText_ParseOptions::CPDFText_ParseOptions()
85 : m_bGetCharCodeOnly(FALSE),
86 m_bNormalizeObjs(TRUE),
87 m_bOutputHyphen(FALSE) {}
88
CreateTextPage(const CPDF_Page * pPage,int flags)89 IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage,
90 int flags) {
91 return new CPDF_TextPage(pPage, flags);
92 }
93
CreatePageFind(const IPDF_TextPage * pTextPage)94 IPDF_TextPageFind* IPDF_TextPageFind::CreatePageFind(
95 const IPDF_TextPage* pTextPage) {
96 return pTextPage ? new CPDF_TextPageFind(pTextPage) : nullptr;
97 }
98
CreateLinkExtract()99 IPDF_LinkExtract* IPDF_LinkExtract::CreateLinkExtract() {
100 return new CPDF_LinkExtract();
101 }
102
// Wide-character constants: blank, line feed and carriage return.
#define TEXT_BLANK_CHAR L' '
#define TEXT_LINEFEED_CHAR L'\n'
#define TEXT_RETURN_CHAR L'\r'
// Wide-string constants: empty string, single blank, CR+LF and LF.
#define TEXT_EMPTY L""
#define TEXT_BLANK L" "
#define TEXT_RETURN_LINEFEED L"\r\n"
#define TEXT_LINEFEED L"\n"
// Numeric constant; its uses are outside the visible part of this file.
// NOTE(review): presumably a gap tolerance expressed as a ratio of the
// character size - confirm against the code that uses it.
#define TEXT_CHARRATIO_GAPDELTA 0.070
111
CPDF_TextPage(const CPDF_Page * pPage,int flags)112 CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, int flags)
113 : m_pPage(pPage),
114 m_charList(512),
115 m_TempCharList(50),
116 m_parserflag(flags),
117 m_pPreTextObj(nullptr),
118 m_bIsParsed(false),
119 m_TextlineDir(-1),
120 m_CurlineRect(0, 0, 0, 0) {
121 m_TextBuf.EstimateSize(0, 10240);
122 pPage->GetDisplayMatrix(m_DisplayMatrix, 0, 0, (int)pPage->GetPageWidth(),
123 (int)pPage->GetPageHeight(), 0);
124 }
125
NormalizeObjects(FX_BOOL bNormalize)126 void CPDF_TextPage::NormalizeObjects(FX_BOOL bNormalize) {
127 m_ParseOptions.m_bNormalizeObjs = bNormalize;
128 }
IsControlChar(const PAGECHAR_INFO & charInfo)129 bool CPDF_TextPage::IsControlChar(const PAGECHAR_INFO& charInfo) {
130 switch (charInfo.m_Unicode) {
131 case 0x2:
132 case 0x3:
133 case 0x93:
134 case 0x94:
135 case 0x96:
136 case 0x97:
137 case 0x98:
138 case 0xfffe:
139 return charInfo.m_Flag != FPDFTEXT_CHAR_HYPHEN;
140 default:
141 return false;
142 }
143 }
ParseTextPage()144 FX_BOOL CPDF_TextPage::ParseTextPage() {
145 m_bIsParsed = false;
146 if (!m_pPage)
147 return FALSE;
148
149 m_TextBuf.Clear();
150 m_charList.RemoveAll();
151 m_pPreTextObj = NULL;
152 ProcessObject();
153 m_bIsParsed = true;
154 if (!m_ParseOptions.m_bGetCharCodeOnly) {
155 m_CharIndex.RemoveAll();
156 int nCount = m_charList.GetSize();
157 if (nCount) {
158 m_CharIndex.Add(0);
159 }
160 for (int i = 0; i < nCount; i++) {
161 int indexSize = m_CharIndex.GetSize();
162 FX_BOOL bNormal = FALSE;
163 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(i);
164 if (charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED) {
165 bNormal = TRUE;
166 } else if (charinfo.m_Unicode == 0 || IsControlChar(charinfo)) {
167 bNormal = FALSE;
168 } else {
169 bNormal = TRUE;
170 }
171 if (bNormal) {
172 if (indexSize % 2) {
173 m_CharIndex.Add(1);
174 } else {
175 if (indexSize <= 0) {
176 continue;
177 }
178 m_CharIndex.SetAt(indexSize - 1,
179 m_CharIndex.GetAt(indexSize - 1) + 1);
180 }
181 } else {
182 if (indexSize % 2) {
183 if (indexSize <= 0) {
184 continue;
185 }
186 m_CharIndex.SetAt(indexSize - 1, i + 1);
187 } else {
188 m_CharIndex.Add(i + 1);
189 }
190 }
191 }
192 int indexSize = m_CharIndex.GetSize();
193 if (indexSize % 2) {
194 m_CharIndex.RemoveAt(indexSize - 1);
195 }
196 }
197 return TRUE;
198 }
CountChars() const199 int CPDF_TextPage::CountChars() const {
200 if (m_ParseOptions.m_bGetCharCodeOnly) {
201 return m_TextBuf.GetSize();
202 }
203 return m_charList.GetSize();
204 }
CharIndexFromTextIndex(int TextIndex) const205 int CPDF_TextPage::CharIndexFromTextIndex(int TextIndex) const {
206 int indexSize = m_CharIndex.GetSize();
207 int count = 0;
208 for (int i = 0; i < indexSize; i += 2) {
209 count += m_CharIndex.GetAt(i + 1);
210 if (count > TextIndex) {
211 return TextIndex - count + m_CharIndex.GetAt(i + 1) +
212 m_CharIndex.GetAt(i);
213 }
214 }
215 return -1;
216 }
TextIndexFromCharIndex(int CharIndex) const217 int CPDF_TextPage::TextIndexFromCharIndex(int CharIndex) const {
218 int indexSize = m_CharIndex.GetSize();
219 int count = 0;
220 for (int i = 0; i < indexSize; i += 2) {
221 count += m_CharIndex.GetAt(i + 1);
222 if (m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i) > CharIndex) {
223 if (CharIndex - m_CharIndex.GetAt(i) < 0) {
224 return -1;
225 }
226 return CharIndex - m_CharIndex.GetAt(i) + count -
227 m_CharIndex.GetAt(i + 1);
228 }
229 }
230 return -1;
231 }
GetRectArray(int start,int nCount,CFX_RectArray & rectArray) const232 void CPDF_TextPage::GetRectArray(int start,
233 int nCount,
234 CFX_RectArray& rectArray) const {
235 if (m_ParseOptions.m_bGetCharCodeOnly) {
236 return;
237 }
238 if (start < 0 || nCount == 0) {
239 return;
240 }
241 if (!m_bIsParsed) {
242 return;
243 }
244 PAGECHAR_INFO info_curchar;
245 CPDF_TextObject* pCurObj = NULL;
246 CFX_FloatRect rect;
247 int curPos = start;
248 FX_BOOL flagNewRect = TRUE;
249 if (nCount + start > m_charList.GetSize() || nCount == -1) {
250 nCount = m_charList.GetSize() - start;
251 }
252 while (nCount--) {
253 info_curchar = *(PAGECHAR_INFO*)m_charList.GetAt(curPos++);
254 if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) {
255 continue;
256 }
257 if (info_curchar.m_CharBox.Width() < 0.01 ||
258 info_curchar.m_CharBox.Height() < 0.01) {
259 continue;
260 }
261 if (!pCurObj) {
262 pCurObj = info_curchar.m_pTextObj;
263 }
264 if (pCurObj != info_curchar.m_pTextObj) {
265 rectArray.Add(rect);
266 pCurObj = info_curchar.m_pTextObj;
267 flagNewRect = TRUE;
268 }
269 if (flagNewRect) {
270 FX_FLOAT orgX = info_curchar.m_OriginX, orgY = info_curchar.m_OriginY;
271 CFX_Matrix matrix, matrix_reverse;
272 info_curchar.m_pTextObj->GetTextMatrix(&matrix);
273 matrix.Concat(info_curchar.m_Matrix);
274 matrix_reverse.SetReverse(matrix);
275 matrix_reverse.Transform(orgX, orgY);
276 rect.left = info_curchar.m_CharBox.left;
277 rect.right = info_curchar.m_CharBox.right;
278 if (pCurObj->GetFont()->GetTypeDescent()) {
279 rect.bottom = orgY +
280 pCurObj->GetFont()->GetTypeDescent() *
281 pCurObj->GetFontSize() / 1000;
282 FX_FLOAT xPosTemp = orgX;
283 matrix.Transform(xPosTemp, rect.bottom);
284 } else {
285 rect.bottom = info_curchar.m_CharBox.bottom;
286 }
287 if (pCurObj->GetFont()->GetTypeAscent()) {
288 rect.top =
289 orgY +
290 pCurObj->GetFont()->GetTypeAscent() * pCurObj->GetFontSize() / 1000;
291 FX_FLOAT xPosTemp =
292 orgX +
293 GetCharWidth(info_curchar.m_CharCode, pCurObj->GetFont()) *
294 pCurObj->GetFontSize() / 1000;
295 matrix.Transform(xPosTemp, rect.top);
296 } else {
297 rect.top = info_curchar.m_CharBox.top;
298 }
299 flagNewRect = FALSE;
300 rect = info_curchar.m_CharBox;
301 rect.Normalize();
302 } else {
303 info_curchar.m_CharBox.Normalize();
304 if (rect.left > info_curchar.m_CharBox.left) {
305 rect.left = info_curchar.m_CharBox.left;
306 }
307 if (rect.right < info_curchar.m_CharBox.right) {
308 rect.right = info_curchar.m_CharBox.right;
309 }
310 if (rect.top < info_curchar.m_CharBox.top) {
311 rect.top = info_curchar.m_CharBox.top;
312 }
313 if (rect.bottom > info_curchar.m_CharBox.bottom) {
314 rect.bottom = info_curchar.m_CharBox.bottom;
315 }
316 }
317 }
318 rectArray.Add(rect);
319 return;
320 }
GetIndexAtPos(CPDF_Point point,FX_FLOAT xTolerance,FX_FLOAT yTolerance) const321 int CPDF_TextPage::GetIndexAtPos(CPDF_Point point,
322 FX_FLOAT xTolerance,
323 FX_FLOAT yTolerance) const {
324 if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
325 return -3;
326
327 int pos = 0;
328 int NearPos = -1;
329 double xdif = 5000, ydif = 5000;
330 while (pos < m_charList.GetSize()) {
331 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)(m_charList.GetAt(pos));
332 CFX_FloatRect charrect = charinfo.m_CharBox;
333 if (charrect.Contains(point.x, point.y)) {
334 break;
335 }
336 if (xTolerance > 0 || yTolerance > 0) {
337 CFX_FloatRect charRectExt;
338 charrect.Normalize();
339 charRectExt.left = charrect.left - xTolerance / 2;
340 charRectExt.right = charrect.right + xTolerance / 2;
341 charRectExt.top = charrect.top + yTolerance / 2;
342 charRectExt.bottom = charrect.bottom - yTolerance / 2;
343 if (charRectExt.Contains(point.x, point.y)) {
344 double curXdif, curYdif;
345 curXdif = FXSYS_fabs(point.x - charrect.left) <
346 FXSYS_fabs(point.x - charrect.right)
347 ? FXSYS_fabs(point.x - charrect.left)
348 : FXSYS_fabs(point.x - charrect.right);
349 curYdif = FXSYS_fabs(point.y - charrect.bottom) <
350 FXSYS_fabs(point.y - charrect.top)
351 ? FXSYS_fabs(point.y - charrect.bottom)
352 : FXSYS_fabs(point.y - charrect.top);
353 if (curYdif + curXdif < xdif + ydif) {
354 ydif = curYdif;
355 xdif = curXdif;
356 NearPos = pos;
357 }
358 }
359 }
360 ++pos;
361 }
362 if (pos >= m_charList.GetSize()) {
363 pos = NearPos;
364 }
365 return pos;
366 }
GetTextByRect(const CFX_FloatRect & rect) const367 CFX_WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
368 CFX_WideString strText;
369 if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
370 return strText;
371
372 int nCount = m_charList.GetSize();
373 int pos = 0;
374 FX_FLOAT posy = 0;
375 FX_BOOL IsContainPreChar = FALSE;
376 FX_BOOL ISAddLineFeed = FALSE;
377 while (pos < nCount) {
378 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(pos++);
379 if (IsRectIntersect(rect, charinfo.m_CharBox)) {
380 if (FXSYS_fabs(posy - charinfo.m_OriginY) > 0 && !IsContainPreChar &&
381 ISAddLineFeed) {
382 posy = charinfo.m_OriginY;
383 if (strText.GetLength() > 0) {
384 strText += L"\r\n";
385 }
386 }
387 IsContainPreChar = TRUE;
388 ISAddLineFeed = FALSE;
389 if (charinfo.m_Unicode) {
390 strText += charinfo.m_Unicode;
391 }
392 } else if (charinfo.m_Unicode == 32) {
393 if (IsContainPreChar && charinfo.m_Unicode) {
394 strText += charinfo.m_Unicode;
395 IsContainPreChar = FALSE;
396 ISAddLineFeed = FALSE;
397 }
398 } else {
399 IsContainPreChar = FALSE;
400 ISAddLineFeed = TRUE;
401 }
402 }
403 return strText;
404 }
GetRectsArrayByRect(const CFX_FloatRect & rect,CFX_RectArray & resRectArray) const405 void CPDF_TextPage::GetRectsArrayByRect(const CFX_FloatRect& rect,
406 CFX_RectArray& resRectArray) const {
407 if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
408 return;
409
410 CFX_FloatRect curRect;
411 FX_BOOL flagNewRect = TRUE;
412 CPDF_TextObject* pCurObj = NULL;
413 int nCount = m_charList.GetSize();
414 int pos = 0;
415 while (pos < nCount) {
416 PAGECHAR_INFO info_curchar = *(PAGECHAR_INFO*)m_charList.GetAt(pos++);
417 if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) {
418 continue;
419 }
420 if (IsRectIntersect(rect, info_curchar.m_CharBox)) {
421 if (!pCurObj) {
422 pCurObj = info_curchar.m_pTextObj;
423 }
424 if (pCurObj != info_curchar.m_pTextObj) {
425 resRectArray.Add(curRect);
426 pCurObj = info_curchar.m_pTextObj;
427 flagNewRect = TRUE;
428 }
429 if (flagNewRect) {
430 curRect = info_curchar.m_CharBox;
431 flagNewRect = FALSE;
432 curRect.Normalize();
433 } else {
434 info_curchar.m_CharBox.Normalize();
435 if (curRect.left > info_curchar.m_CharBox.left) {
436 curRect.left = info_curchar.m_CharBox.left;
437 }
438 if (curRect.right < info_curchar.m_CharBox.right) {
439 curRect.right = info_curchar.m_CharBox.right;
440 }
441 if (curRect.top < info_curchar.m_CharBox.top) {
442 curRect.top = info_curchar.m_CharBox.top;
443 }
444 if (curRect.bottom > info_curchar.m_CharBox.bottom) {
445 curRect.bottom = info_curchar.m_CharBox.bottom;
446 }
447 }
448 }
449 }
450 resRectArray.Add(curRect);
451 return;
452 }
GetIndexAtPos(FX_FLOAT x,FX_FLOAT y,FX_FLOAT xTolerance,FX_FLOAT yTolerance) const453 int CPDF_TextPage::GetIndexAtPos(FX_FLOAT x,
454 FX_FLOAT y,
455 FX_FLOAT xTolerance,
456 FX_FLOAT yTolerance) const {
457 if (m_ParseOptions.m_bGetCharCodeOnly) {
458 return -3;
459 }
460 CPDF_Point point(x, y);
461 return GetIndexAtPos(point, xTolerance, yTolerance);
462 }
463
GetCharInfo(int index,FPDF_CHAR_INFO * info) const464 void CPDF_TextPage::GetCharInfo(int index, FPDF_CHAR_INFO* info) const {
465 if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
466 return;
467
468 if (index < 0 || index >= m_charList.GetSize())
469 return;
470
471 const PAGECHAR_INFO* charinfo =
472 static_cast<PAGECHAR_INFO*>(m_charList.GetAt(index));
473 info->m_Charcode = charinfo->m_CharCode;
474 info->m_OriginX = charinfo->m_OriginX;
475 info->m_OriginY = charinfo->m_OriginY;
476 info->m_Unicode = charinfo->m_Unicode;
477 info->m_Flag = charinfo->m_Flag;
478 info->m_CharBox = charinfo->m_CharBox;
479 info->m_pTextObj = charinfo->m_pTextObj;
480 if (charinfo->m_pTextObj && charinfo->m_pTextObj->GetFont()) {
481 info->m_FontSize = charinfo->m_pTextObj->GetFontSize();
482 } else {
483 info->m_FontSize = kDefaultFontSize;
484 }
485 info->m_Matrix.Copy(charinfo->m_Matrix);
486 }
487
CheckMarkedContentObject(int32_t & start,int32_t & nCount) const488 void CPDF_TextPage::CheckMarkedContentObject(int32_t& start,
489 int32_t& nCount) const {
490 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start);
491 PAGECHAR_INFO charinfo2 =
492 *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - 1);
493 if (FPDFTEXT_CHAR_PIECE != charinfo.m_Flag &&
494 FPDFTEXT_CHAR_PIECE != charinfo2.m_Flag) {
495 return;
496 }
497 if (FPDFTEXT_CHAR_PIECE == charinfo.m_Flag) {
498 PAGECHAR_INFO charinfo1 = charinfo;
499 int startIndex = start;
500 while (FPDFTEXT_CHAR_PIECE == charinfo1.m_Flag &&
501 charinfo1.m_Index == charinfo.m_Index) {
502 startIndex--;
503 if (startIndex < 0) {
504 break;
505 }
506 charinfo1 = *(PAGECHAR_INFO*)m_charList.GetAt(startIndex);
507 }
508 startIndex++;
509 start = startIndex;
510 }
511 if (FPDFTEXT_CHAR_PIECE == charinfo2.m_Flag) {
512 PAGECHAR_INFO charinfo3 = charinfo2;
513 int endIndex = start + nCount - 1;
514 while (FPDFTEXT_CHAR_PIECE == charinfo3.m_Flag &&
515 charinfo3.m_Index == charinfo2.m_Index) {
516 endIndex++;
517 if (endIndex >= m_charList.GetSize()) {
518 break;
519 }
520 charinfo3 = *(PAGECHAR_INFO*)m_charList.GetAt(endIndex);
521 }
522 endIndex--;
523 nCount = endIndex - start + 1;
524 }
525 }
GetPageText(int start,int nCount) const526 CFX_WideString CPDF_TextPage::GetPageText(int start, int nCount) const {
527 if (!m_bIsParsed || nCount == 0)
528 return L"";
529
530 if (start < 0)
531 start = 0;
532
533 if (nCount == -1) {
534 nCount = m_charList.GetSize() - start;
535 return m_TextBuf.GetWideString().Mid(start,
536 m_TextBuf.GetWideString().GetLength());
537 }
538 if (nCount <= 0 || m_charList.GetSize() <= 0) {
539 return L"";
540 }
541 if (nCount + start > m_charList.GetSize() - 1) {
542 nCount = m_charList.GetSize() - start;
543 }
544 if (nCount <= 0) {
545 return L"";
546 }
547 CheckMarkedContentObject(start, nCount);
548 int startindex = 0;
549 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start);
550 int startOffset = 0;
551 while (charinfo.m_Index == -1) {
552 startOffset++;
553 if (startOffset > nCount || start + startOffset >= m_charList.GetSize()) {
554 return L"";
555 }
556 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + startOffset);
557 }
558 startindex = charinfo.m_Index;
559 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - 1);
560 int nCountOffset = 0;
561 while (charinfo.m_Index == -1) {
562 nCountOffset++;
563 if (nCountOffset >= nCount) {
564 return L"";
565 }
566 charinfo =
567 *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - nCountOffset - 1);
568 }
569 nCount = start + nCount - nCountOffset - startindex;
570 if (nCount <= 0) {
571 return L"";
572 }
573 return m_TextBuf.GetWideString().Mid(startindex, nCount);
574 }
CountRects(int start,int nCount)575 int CPDF_TextPage::CountRects(int start, int nCount) {
576 if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed || start < 0)
577 return -1;
578
579 if (nCount == -1 || nCount + start > m_charList.GetSize()) {
580 nCount = m_charList.GetSize() - start;
581 }
582 m_SelRects.RemoveAll();
583 GetRectArray(start, nCount, m_SelRects);
584 return m_SelRects.GetSize();
585 }
GetRect(int rectIndex,FX_FLOAT & left,FX_FLOAT & top,FX_FLOAT & right,FX_FLOAT & bottom) const586 void CPDF_TextPage::GetRect(int rectIndex,
587 FX_FLOAT& left,
588 FX_FLOAT& top,
589 FX_FLOAT& right,
590 FX_FLOAT& bottom) const {
591 if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
592 return;
593
594 if (rectIndex < 0 || rectIndex >= m_SelRects.GetSize())
595 return;
596
597 left = m_SelRects.GetAt(rectIndex).left;
598 top = m_SelRects.GetAt(rectIndex).top;
599 right = m_SelRects.GetAt(rectIndex).right;
600 bottom = m_SelRects.GetAt(rectIndex).bottom;
601 }
602
GetBaselineRotate(int start,int end,int & Rotate)603 FX_BOOL CPDF_TextPage::GetBaselineRotate(int start, int end, int& Rotate) {
604 if (m_ParseOptions.m_bGetCharCodeOnly) {
605 return FALSE;
606 }
607 if (end == start) {
608 return FALSE;
609 }
610 FPDF_CHAR_INFO info_start;
611 FPDF_CHAR_INFO info_end;
612 GetCharInfo(start, &info_start);
613 GetCharInfo(end, &info_end);
614 while (info_end.m_CharBox.Width() == 0 || info_end.m_CharBox.Height() == 0) {
615 if (--end <= start)
616 return FALSE;
617
618 GetCharInfo(end, &info_end);
619 }
620 FX_FLOAT dx = (info_end.m_OriginX - info_start.m_OriginX);
621 FX_FLOAT dy = (info_end.m_OriginY - info_start.m_OriginY);
622 if (dx == 0) {
623 if (dy > 0) {
624 Rotate = 90;
625 } else if (dy < 0) {
626 Rotate = 270;
627 } else {
628 Rotate = 0;
629 }
630 } else {
631 float a = FXSYS_atan2(dy, dx);
632 Rotate = (int)(a * 180 / FX_PI + 0.5);
633 }
634 if (Rotate < 0) {
635 Rotate = -Rotate;
636 } else if (Rotate > 0) {
637 Rotate = 360 - Rotate;
638 }
639 return TRUE;
640 }
641
GetBaselineRotate(const CFX_FloatRect & rect,int & Rotate)642 FX_BOOL CPDF_TextPage::GetBaselineRotate(const CFX_FloatRect& rect,
643 int& Rotate) {
644 if (m_ParseOptions.m_bGetCharCodeOnly) {
645 return FALSE;
646 }
647 int start, end, count,
648 n = CountBoundedSegments(rect.left, rect.top, rect.right, rect.bottom,
649 TRUE);
650 if (n < 1) {
651 return FALSE;
652 }
653 if (n > 1) {
654 GetBoundedSegment(n - 1, start, count);
655 end = start + count - 1;
656 GetBoundedSegment(0, start, count);
657 } else {
658 GetBoundedSegment(0, start, count);
659 end = start + count - 1;
660 }
661 return GetBaselineRotate(start, end, Rotate);
662 }
GetBaselineRotate(int rectIndex,int & Rotate)663 FX_BOOL CPDF_TextPage::GetBaselineRotate(int rectIndex, int& Rotate) {
664 if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
665 return FALSE;
666
667 if (rectIndex < 0 || rectIndex > m_SelRects.GetSize())
668 return FALSE;
669
670 CFX_FloatRect rect = m_SelRects.GetAt(rectIndex);
671 return GetBaselineRotate(rect, Rotate);
672 }
CountBoundedSegments(FX_FLOAT left,FX_FLOAT top,FX_FLOAT right,FX_FLOAT bottom,FX_BOOL bContains)673 int CPDF_TextPage::CountBoundedSegments(FX_FLOAT left,
674 FX_FLOAT top,
675 FX_FLOAT right,
676 FX_FLOAT bottom,
677 FX_BOOL bContains) {
678 if (m_ParseOptions.m_bGetCharCodeOnly)
679 return -1;
680
681 m_Segment.RemoveAll();
682 if (!m_bIsParsed)
683 return -1;
684
685 CFX_FloatRect rect(left, bottom, right, top);
686 rect.Normalize();
687 int nCount = m_charList.GetSize();
688 int pos = 0;
689 FPDF_SEGMENT segment;
690 segment.m_Start = 0;
691 segment.m_nCount = 0;
692 int segmentStatus = 0;
693 FX_BOOL IsContainPreChar = FALSE;
694 while (pos < nCount) {
695 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(pos);
696 if (bContains && rect.Contains(charinfo.m_CharBox)) {
697 if (segmentStatus == 0 || segmentStatus == 2) {
698 segment.m_Start = pos;
699 segment.m_nCount = 1;
700 segmentStatus = 1;
701 } else if (segmentStatus == 1) {
702 segment.m_nCount++;
703 }
704 IsContainPreChar = TRUE;
705 } else if (!bContains &&
706 (IsRectIntersect(rect, charinfo.m_CharBox) ||
707 rect.Contains(charinfo.m_OriginX, charinfo.m_OriginY))) {
708 if (segmentStatus == 0 || segmentStatus == 2) {
709 segment.m_Start = pos;
710 segment.m_nCount = 1;
711 segmentStatus = 1;
712 } else if (segmentStatus == 1) {
713 segment.m_nCount++;
714 }
715 IsContainPreChar = TRUE;
716 } else if (charinfo.m_Unicode == 32) {
717 if (IsContainPreChar == TRUE) {
718 if (segmentStatus == 0 || segmentStatus == 2) {
719 segment.m_Start = pos;
720 segment.m_nCount = 1;
721 segmentStatus = 1;
722 } else if (segmentStatus == 1) {
723 segment.m_nCount++;
724 }
725 IsContainPreChar = FALSE;
726 } else {
727 if (segmentStatus == 1) {
728 segmentStatus = 2;
729 m_Segment.Add(segment);
730 segment.m_Start = 0;
731 segment.m_nCount = 0;
732 }
733 }
734 } else {
735 if (segmentStatus == 1) {
736 segmentStatus = 2;
737 m_Segment.Add(segment);
738 segment.m_Start = 0;
739 segment.m_nCount = 0;
740 }
741 IsContainPreChar = FALSE;
742 }
743 pos++;
744 }
745 if (segmentStatus == 1) {
746 segmentStatus = 2;
747 m_Segment.Add(segment);
748 segment.m_Start = 0;
749 segment.m_nCount = 0;
750 }
751 return m_Segment.GetSize();
752 }
GetBoundedSegment(int index,int & start,int & count) const753 void CPDF_TextPage::GetBoundedSegment(int index, int& start, int& count) const {
754 if (m_ParseOptions.m_bGetCharCodeOnly) {
755 return;
756 }
757 if (index < 0 || index >= m_Segment.GetSize()) {
758 return;
759 }
760 start = m_Segment.GetAt(index).m_Start;
761 count = m_Segment.GetAt(index).m_nCount;
762 }
GetWordBreak(int index,int direction) const763 int CPDF_TextPage::GetWordBreak(int index, int direction) const {
764 if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
765 return -1;
766
767 if (direction != FPDFTEXT_LEFT && direction != FPDFTEXT_RIGHT)
768 return -1;
769
770 if (index < 0 || index >= m_charList.GetSize())
771 return -1;
772
773 PAGECHAR_INFO charinfo;
774 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index);
775 if (charinfo.m_Index == -1 || charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED) {
776 return index;
777 }
778 if (!IsLetter(charinfo.m_Unicode)) {
779 return index;
780 }
781 int breakPos = index;
782 if (direction == FPDFTEXT_LEFT) {
783 while (--breakPos > 0) {
784 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(breakPos);
785 if (!IsLetter(charinfo.m_Unicode)) {
786 return breakPos;
787 }
788 }
789 } else if (direction == FPDFTEXT_RIGHT) {
790 while (++breakPos < m_charList.GetSize()) {
791 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(breakPos);
792 if (!IsLetter(charinfo.m_Unicode)) {
793 return breakPos;
794 }
795 }
796 }
797 return breakPos;
798 }
FindTextlineFlowDirection()799 int32_t CPDF_TextPage::FindTextlineFlowDirection() {
800 if (!m_pPage) {
801 return -1;
802 }
803 const int32_t nPageWidth = (int32_t)((CPDF_Page*)m_pPage)->GetPageWidth();
804 const int32_t nPageHeight = (int32_t)((CPDF_Page*)m_pPage)->GetPageHeight();
805 CFX_ByteArray nHorizontalMask;
806 if (!nHorizontalMask.SetSize(nPageWidth)) {
807 return -1;
808 }
809 uint8_t* pDataH = nHorizontalMask.GetData();
810 CFX_ByteArray nVerticalMask;
811 if (!nVerticalMask.SetSize(nPageHeight)) {
812 return -1;
813 }
814 uint8_t* pDataV = nVerticalMask.GetData();
815 int32_t index = 0;
816 FX_FLOAT fLineHeight = 0.0f;
817 CPDF_PageObject* pPageObj = NULL;
818 FX_POSITION pos = NULL;
819 pos = m_pPage->GetFirstObjectPosition();
820 if (!pos) {
821 return -1;
822 }
823 while (pos) {
824 pPageObj = m_pPage->GetNextObject(pos);
825 if (NULL == pPageObj) {
826 continue;
827 }
828 if (PDFPAGE_TEXT != pPageObj->m_Type) {
829 continue;
830 }
831 int32_t minH =
832 (int32_t)pPageObj->m_Left < 0 ? 0 : (int32_t)pPageObj->m_Left;
833 int32_t maxH = (int32_t)pPageObj->m_Right > nPageWidth
834 ? nPageWidth
835 : (int32_t)pPageObj->m_Right;
836 int32_t minV =
837 (int32_t)pPageObj->m_Bottom < 0 ? 0 : (int32_t)pPageObj->m_Bottom;
838 int32_t maxV = (int32_t)pPageObj->m_Top > nPageHeight
839 ? nPageHeight
840 : (int32_t)pPageObj->m_Top;
841 if (minH >= maxH || minV >= maxV) {
842 continue;
843 }
844 FXSYS_memset(pDataH + minH, 1, maxH - minH);
845 FXSYS_memset(pDataV + minV, 1, maxV - minV);
846 if (fLineHeight <= 0.0f) {
847 fLineHeight = pPageObj->m_Top - pPageObj->m_Bottom;
848 }
849 pPageObj = NULL;
850 }
851 int32_t nStartH = 0;
852 int32_t nEndH = 0;
853 FX_FLOAT nSumH = 0.0f;
854 for (index = 0; index < nPageWidth; index++)
855 if (1 == nHorizontalMask[index]) {
856 break;
857 }
858 nStartH = index;
859 for (index = nPageWidth; index > 0; index--)
860 if (1 == nHorizontalMask[index - 1]) {
861 break;
862 }
863 nEndH = index;
864 for (index = nStartH; index < nEndH; index++) {
865 nSumH += nHorizontalMask[index];
866 }
867 nSumH /= nEndH - nStartH;
868 int32_t nStartV = 0;
869 int32_t nEndV = 0;
870 FX_FLOAT nSumV = 0.0f;
871 for (index = 0; index < nPageHeight; index++)
872 if (1 == nVerticalMask[index]) {
873 break;
874 }
875 nStartV = index;
876 for (index = nPageHeight; index > 0; index--)
877 if (1 == nVerticalMask[index - 1]) {
878 break;
879 }
880 nEndV = index;
881 for (index = nStartV; index < nEndV; index++) {
882 nSumV += nVerticalMask[index];
883 }
884 nSumV /= nEndV - nStartV;
885 if ((nEndV - nStartV) < (int32_t)(2 * fLineHeight)) {
886 return 0;
887 }
888 if ((nEndH - nStartH) < (int32_t)(2 * fLineHeight)) {
889 return 1;
890 }
891 if (nSumH > 0.8f) {
892 return 0;
893 }
894 if (nSumH - nSumV > 0.0f) {
895 return 0;
896 }
897 if (nSumV - nSumH > 0.0f) {
898 return 1;
899 }
900 return -1;
901 }
ProcessObject()902 void CPDF_TextPage::ProcessObject() {
903 CPDF_PageObject* pPageObj = NULL;
904 if (!m_pPage) {
905 return;
906 }
907 FX_POSITION pos;
908 pos = m_pPage->GetFirstObjectPosition();
909 if (!pos) {
910 return;
911 }
912 m_TextlineDir = FindTextlineFlowDirection();
913 int nCount = 0;
914 while (pos) {
915 pPageObj = m_pPage->GetNextObject(pos);
916 if (pPageObj) {
917 if (pPageObj->m_Type == PDFPAGE_TEXT) {
918 CFX_Matrix matrix;
919 ProcessTextObject((CPDF_TextObject*)pPageObj, matrix, pos);
920 nCount++;
921 } else if (pPageObj->m_Type == PDFPAGE_FORM) {
922 CFX_Matrix formMatrix(1, 0, 0, 1, 0, 0);
923 ProcessFormObject((CPDF_FormObject*)pPageObj, formMatrix);
924 }
925 }
926 pPageObj = NULL;
927 }
928 int count = m_LineObj.GetSize();
929 for (int i = 0; i < count; i++) {
930 ProcessTextObject(m_LineObj.GetAt(i));
931 }
932 m_LineObj.RemoveAll();
933 CloseTempLine();
934 }
ProcessFormObject(CPDF_FormObject * pFormObj,const CFX_Matrix & formMatrix)935 void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj,
936 const CFX_Matrix& formMatrix) {
937 CPDF_PageObject* pPageObj = NULL;
938 FX_POSITION pos;
939 if (!pFormObj) {
940 return;
941 }
942 pos = pFormObj->m_pForm->GetFirstObjectPosition();
943 if (!pos) {
944 return;
945 }
946 CFX_Matrix curFormMatrix;
947 curFormMatrix.Copy(pFormObj->m_FormMatrix);
948 curFormMatrix.Concat(formMatrix);
949 while (pos) {
950 pPageObj = pFormObj->m_pForm->GetNextObject(pos);
951 if (pPageObj) {
952 if (pPageObj->m_Type == PDFPAGE_TEXT) {
953 ProcessTextObject((CPDF_TextObject*)pPageObj, curFormMatrix, pos);
954 } else if (pPageObj->m_Type == PDFPAGE_FORM) {
955 ProcessFormObject((CPDF_FormObject*)pPageObj, curFormMatrix);
956 }
957 }
958 pPageObj = NULL;
959 }
960 }
GetCharWidth(FX_DWORD charCode,CPDF_Font * pFont) const961 int CPDF_TextPage::GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const {
962 if (charCode == -1) {
963 return 0;
964 }
965 int w = pFont->GetCharWidthF(charCode);
966 if (w == 0) {
967 CFX_ByteString str;
968 pFont->AppendChar(str, charCode);
969 w = pFont->GetStringWidth(str, 1);
970 if (w == 0) {
971 FX_RECT BBox;
972 pFont->GetCharBBox(charCode, BBox);
973 w = BBox.right - BBox.left;
974 }
975 }
976 return w;
977 }
OnPiece(CFX_BidiChar * pBidi,CFX_WideString & str)978 void CPDF_TextPage::OnPiece(CFX_BidiChar* pBidi, CFX_WideString& str) {
979 int32_t start, count;
980 CFX_BidiChar::Direction ret = pBidi->GetBidiInfo(&start, &count);
981 if (ret == CFX_BidiChar::RIGHT) {
982 for (int i = start + count - 1; i >= start; i--) {
983 m_TextBuf.AppendChar(str.GetAt(i));
984 m_charList.Add(*(PAGECHAR_INFO*)m_TempCharList.GetAt(i));
985 }
986 } else {
987 int end = start + count;
988 for (int i = start; i < end; i++) {
989 m_TextBuf.AppendChar(str.GetAt(i));
990 m_charList.Add(*(PAGECHAR_INFO*)m_TempCharList.GetAt(i));
991 }
992 }
993 }
AddCharInfoByLRDirection(CFX_WideString & str,int i)994 void CPDF_TextPage::AddCharInfoByLRDirection(CFX_WideString& str, int i) {
995 PAGECHAR_INFO Info = *(PAGECHAR_INFO*)m_TempCharList.GetAt(i);
996 FX_WCHAR wChar = str.GetAt(i);
997 if (!IsControlChar(Info)) {
998 Info.m_Index = m_TextBuf.GetLength();
999 if (wChar >= 0xFB00 && wChar <= 0xFB06) {
1000 FX_WCHAR* pDst = NULL;
1001 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
1002 if (nCount >= 1) {
1003 pDst = FX_Alloc(FX_WCHAR, nCount);
1004 FX_Unicode_GetNormalization(wChar, pDst);
1005 for (int nIndex = 0; nIndex < nCount; nIndex++) {
1006 PAGECHAR_INFO Info2 = Info;
1007 Info2.m_Unicode = pDst[nIndex];
1008 Info2.m_Flag = FPDFTEXT_CHAR_PIECE;
1009 m_TextBuf.AppendChar(Info2.m_Unicode);
1010 if (!m_ParseOptions.m_bGetCharCodeOnly) {
1011 m_charList.Add(Info2);
1012 }
1013 }
1014 FX_Free(pDst);
1015 return;
1016 }
1017 }
1018 m_TextBuf.AppendChar(wChar);
1019 } else {
1020 Info.m_Index = -1;
1021 }
1022 if (!m_ParseOptions.m_bGetCharCodeOnly) {
1023 m_charList.Add(Info);
1024 }
1025 }
AddCharInfoByRLDirection(CFX_WideString & str,int i)1026 void CPDF_TextPage::AddCharInfoByRLDirection(CFX_WideString& str, int i) {
1027 PAGECHAR_INFO Info = *(PAGECHAR_INFO*)m_TempCharList.GetAt(i);
1028 if (!IsControlChar(Info)) {
1029 Info.m_Index = m_TextBuf.GetLength();
1030 FX_WCHAR wChar = FX_GetMirrorChar(str.GetAt(i), TRUE, FALSE);
1031 FX_WCHAR* pDst = NULL;
1032 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
1033 if (nCount >= 1) {
1034 pDst = FX_Alloc(FX_WCHAR, nCount);
1035 FX_Unicode_GetNormalization(wChar, pDst);
1036 for (int nIndex = 0; nIndex < nCount; nIndex++) {
1037 PAGECHAR_INFO Info2 = Info;
1038 Info2.m_Unicode = pDst[nIndex];
1039 Info2.m_Flag = FPDFTEXT_CHAR_PIECE;
1040 m_TextBuf.AppendChar(Info2.m_Unicode);
1041 if (!m_ParseOptions.m_bGetCharCodeOnly) {
1042 m_charList.Add(Info2);
1043 }
1044 }
1045 FX_Free(pDst);
1046 return;
1047 }
1048 Info.m_Unicode = wChar;
1049 m_TextBuf.AppendChar(Info.m_Unicode);
1050 } else {
1051 Info.m_Index = -1;
1052 }
1053 if (!m_ParseOptions.m_bGetCharCodeOnly) {
1054 m_charList.Add(Info);
1055 }
1056 }
CloseTempLine()1057 void CPDF_TextPage::CloseTempLine() {
1058 int count1 = m_TempCharList.GetSize();
1059 if (count1 <= 0) {
1060 return;
1061 }
1062 std::unique_ptr<CFX_BidiChar> pBidiChar(new CFX_BidiChar);
1063 CFX_WideString str = m_TempTextBuf.GetWideString();
1064 CFX_WordArray order;
1065 FX_BOOL bR2L = FALSE;
1066 int32_t start = 0, count = 0;
1067 int nR2L = 0, nL2R = 0;
1068 FX_BOOL bPrevSpace = FALSE;
1069 for (int i = 0; i < str.GetLength(); i++) {
1070 if (str.GetAt(i) == 32) {
1071 if (bPrevSpace) {
1072 m_TempTextBuf.Delete(i, 1);
1073 m_TempCharList.Delete(i);
1074 str.Delete(i);
1075 count1--;
1076 i--;
1077 continue;
1078 }
1079 bPrevSpace = TRUE;
1080 } else {
1081 bPrevSpace = FALSE;
1082 }
1083 if (pBidiChar->AppendChar(str.GetAt(i))) {
1084 CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
1085 order.Add(start);
1086 order.Add(count);
1087 order.Add(ret);
1088 if (!bR2L) {
1089 if (ret == CFX_BidiChar::RIGHT) {
1090 nR2L++;
1091 } else if (ret == CFX_BidiChar::LEFT) {
1092 nL2R++;
1093 }
1094 }
1095 }
1096 }
1097 if (pBidiChar->EndChar()) {
1098 CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
1099 order.Add(start);
1100 order.Add(count);
1101 order.Add(ret);
1102 if (!bR2L) {
1103 if (ret == CFX_BidiChar::RIGHT) {
1104 nR2L++;
1105 } else if (ret == CFX_BidiChar::LEFT) {
1106 nL2R++;
1107 }
1108 }
1109 }
1110 if (nR2L > 0 && nR2L >= nL2R) {
1111 bR2L = TRUE;
1112 }
1113 if (m_parserflag == FPDFTEXT_RLTB || bR2L) {
1114 int count = order.GetSize();
1115 for (int i = count - 1; i > 0; i -= 3) {
1116 int ret = order.GetAt(i);
1117 int start = order.GetAt(i - 2);
1118 int count1 = order.GetAt(i - 1);
1119 if (ret == 2 || ret == 0) {
1120 for (int j = start + count1 - 1; j >= start; j--) {
1121 AddCharInfoByRLDirection(str, j);
1122 }
1123 } else {
1124 int j = i;
1125 FX_BOOL bSymbol = FALSE;
1126 while (j > 0 && order.GetAt(j) != 2) {
1127 bSymbol = !order.GetAt(j);
1128 j -= 3;
1129 }
1130 int end = start + count1;
1131 int n = 0;
1132 if (bSymbol) {
1133 n = j + 6;
1134 } else {
1135 n = j + 3;
1136 }
1137 if (n >= i) {
1138 for (int m = start; m < end; m++) {
1139 AddCharInfoByLRDirection(str, m);
1140 }
1141 } else {
1142 j = i;
1143 i = n;
1144 for (; n <= j; n += 3) {
1145 int start = order.GetAt(n - 2);
1146 int count1 = order.GetAt(n - 1);
1147 int end = start + count1;
1148 for (int m = start; m < end; m++) {
1149 AddCharInfoByLRDirection(str, m);
1150 }
1151 }
1152 }
1153 }
1154 }
1155 } else {
1156 int count = order.GetSize();
1157 FX_BOOL bL2R = FALSE;
1158 for (int i = 0; i < count; i += 3) {
1159 int ret = order.GetAt(i + 2);
1160 int start = order.GetAt(i);
1161 int count1 = order.GetAt(i + 1);
1162 if (ret == 2 || (i == 0 && ret == 0 && !bL2R)) {
1163 int j = i + 3;
1164 while (bR2L && j < count) {
1165 if (order.GetAt(j + 2) == 1) {
1166 break;
1167 } else {
1168 j += 3;
1169 }
1170 }
1171 if (j == 3) {
1172 i = -3;
1173 bL2R = TRUE;
1174 continue;
1175 }
1176 int end = m_TempCharList.GetSize() - 1;
1177 if (j < count) {
1178 end = order.GetAt(j) - 1;
1179 }
1180 i = j - 3;
1181 for (int n = end; n >= start; n--) {
1182 AddCharInfoByRLDirection(str, n);
1183 }
1184 } else {
1185 int end = start + count1;
1186 for (int n = start; n < end; n++) {
1187 AddCharInfoByLRDirection(str, n);
1188 }
1189 }
1190 }
1191 }
1192 order.RemoveAll();
1193 m_TempCharList.RemoveAll();
1194 m_TempTextBuf.Delete(0, m_TempTextBuf.GetLength());
1195 }
ProcessTextObject(CPDF_TextObject * pTextObj,const CFX_Matrix & formMatrix,FX_POSITION ObjPos)1196 void CPDF_TextPage::ProcessTextObject(CPDF_TextObject* pTextObj,
1197 const CFX_Matrix& formMatrix,
1198 FX_POSITION ObjPos) {
1199 CFX_FloatRect re(pTextObj->m_Left, pTextObj->m_Bottom, pTextObj->m_Right,
1200 pTextObj->m_Top);
1201 if (FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f) {
1202 return;
1203 }
1204 int count = m_LineObj.GetSize();
1205 PDFTEXT_Obj Obj;
1206 Obj.m_pTextObj = pTextObj;
1207 Obj.m_formMatrix = formMatrix;
1208 if (count == 0) {
1209 m_LineObj.Add(Obj);
1210 return;
1211 }
1212 if (IsSameAsPreTextObject(pTextObj, ObjPos)) {
1213 return;
1214 }
1215 PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(count - 1);
1216 CPDF_TextObjectItem item;
1217 int nItem = prev_Obj.m_pTextObj->CountItems();
1218 prev_Obj.m_pTextObj->GetItemInfo(nItem - 1, &item);
1219 FX_FLOAT prev_width =
1220 GetCharWidth(item.m_CharCode, prev_Obj.m_pTextObj->GetFont()) *
1221 prev_Obj.m_pTextObj->GetFontSize() / 1000;
1222 CFX_Matrix prev_matrix;
1223 prev_Obj.m_pTextObj->GetTextMatrix(&prev_matrix);
1224 prev_width = FXSYS_fabs(prev_width);
1225 prev_matrix.Concat(prev_Obj.m_formMatrix);
1226 prev_width = prev_matrix.TransformDistance(prev_width);
1227 pTextObj->GetItemInfo(0, &item);
1228 FX_FLOAT this_width = GetCharWidth(item.m_CharCode, pTextObj->GetFont()) *
1229 pTextObj->GetFontSize() / 1000;
1230 this_width = FXSYS_fabs(this_width);
1231 CFX_Matrix this_matrix;
1232 pTextObj->GetTextMatrix(&this_matrix);
1233 this_width = FXSYS_fabs(this_width);
1234 this_matrix.Concat(formMatrix);
1235 this_width = this_matrix.TransformDistance(this_width);
1236 FX_FLOAT threshold =
1237 prev_width > this_width ? prev_width / 4 : this_width / 4;
1238 FX_FLOAT prev_x = prev_Obj.m_pTextObj->GetPosX(),
1239 prev_y = prev_Obj.m_pTextObj->GetPosY();
1240 prev_Obj.m_formMatrix.Transform(prev_x, prev_y);
1241 m_DisplayMatrix.Transform(prev_x, prev_y);
1242 FX_FLOAT this_x = pTextObj->GetPosX(), this_y = pTextObj->GetPosY();
1243 formMatrix.Transform(this_x, this_y);
1244 m_DisplayMatrix.Transform(this_x, this_y);
1245 if (FXSYS_fabs(this_y - prev_y) > threshold * 2) {
1246 for (int i = 0; i < count; i++) {
1247 ProcessTextObject(m_LineObj.GetAt(i));
1248 }
1249 m_LineObj.RemoveAll();
1250 m_LineObj.Add(Obj);
1251 return;
1252 }
1253 int i = 0;
1254 if (m_ParseOptions.m_bNormalizeObjs) {
1255 for (i = count - 1; i >= 0; i--) {
1256 PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(i);
1257 CFX_Matrix prev_matrix;
1258 prev_Obj.m_pTextObj->GetTextMatrix(&prev_matrix);
1259 FX_FLOAT Prev_x = prev_Obj.m_pTextObj->GetPosX(),
1260 Prev_y = prev_Obj.m_pTextObj->GetPosY();
1261 prev_Obj.m_formMatrix.Transform(Prev_x, Prev_y);
1262 m_DisplayMatrix.Transform(Prev_x, Prev_y);
1263 if (this_x >= Prev_x) {
1264 if (i == count - 1) {
1265 m_LineObj.Add(Obj);
1266 } else {
1267 m_LineObj.InsertAt(i + 1, Obj);
1268 }
1269 break;
1270 }
1271 }
1272 if (i < 0) {
1273 m_LineObj.InsertAt(0, Obj);
1274 }
1275 } else {
1276 m_LineObj.Add(Obj);
1277 }
1278 }
PreMarkedContent(PDFTEXT_Obj Obj)1279 int32_t CPDF_TextPage::PreMarkedContent(PDFTEXT_Obj Obj) {
1280 CPDF_TextObject* pTextObj = Obj.m_pTextObj;
1281 CPDF_ContentMarkData* pMarkData =
1282 (CPDF_ContentMarkData*)pTextObj->m_ContentMark.GetObject();
1283 if (!pMarkData) {
1284 return FPDFTEXT_MC_PASS;
1285 }
1286 int nContentMark = pMarkData->CountItems();
1287 if (nContentMark < 1) {
1288 return FPDFTEXT_MC_PASS;
1289 }
1290 CFX_WideString actText;
1291 FX_BOOL bExist = FALSE;
1292 CPDF_Dictionary* pDict = NULL;
1293 int n = 0;
1294 for (n = 0; n < nContentMark; n++) {
1295 CPDF_ContentMarkItem& item = pMarkData->GetItem(n);
1296 CFX_ByteString tagStr = (CFX_ByteString)item.GetName();
1297 pDict = ToDictionary(static_cast<CPDF_Object*>(item.GetParam()));
1298 CPDF_String* temp =
1299 ToString(pDict ? pDict->GetElement("ActualText") : nullptr);
1300 if (temp) {
1301 bExist = TRUE;
1302 actText = temp->GetUnicodeText();
1303 }
1304 }
1305 if (!bExist) {
1306 return FPDFTEXT_MC_PASS;
1307 }
1308 if (m_pPreTextObj) {
1309 if (CPDF_ContentMarkData* pPreMarkData =
1310 (CPDF_ContentMarkData*)m_pPreTextObj->m_ContentMark.GetObject()) {
1311 if (pPreMarkData->CountItems() == n) {
1312 CPDF_ContentMarkItem& item = pPreMarkData->GetItem(n - 1);
1313 if (pDict == item.GetParam()) {
1314 return FPDFTEXT_MC_DONE;
1315 }
1316 }
1317 }
1318 }
1319 CPDF_Font* pFont = pTextObj->GetFont();
1320 FX_STRSIZE nItems = actText.GetLength();
1321 if (nItems < 1) {
1322 return FPDFTEXT_MC_PASS;
1323 }
1324 bExist = FALSE;
1325 for (FX_STRSIZE i = 0; i < nItems; i++) {
1326 FX_WCHAR wChar = actText.GetAt(i);
1327 if (-1 == pFont->CharCodeFromUnicode(wChar)) {
1328 continue;
1329 } else {
1330 bExist = TRUE;
1331 break;
1332 }
1333 }
1334 if (!bExist) {
1335 return FPDFTEXT_MC_PASS;
1336 }
1337 bExist = FALSE;
1338 for (FX_STRSIZE i = 0; i < nItems; i++) {
1339 FX_WCHAR wChar = actText.GetAt(i);
1340 if ((wChar > 0x80 && wChar < 0xFFFD) || (wChar <= 0x80 && isprint(wChar))) {
1341 bExist = TRUE;
1342 break;
1343 }
1344 }
1345 if (!bExist) {
1346 return FPDFTEXT_MC_DONE;
1347 }
1348 return FPDFTEXT_MC_DELAY;
1349 }
ProcessMarkedContent(PDFTEXT_Obj Obj)1350 void CPDF_TextPage::ProcessMarkedContent(PDFTEXT_Obj Obj) {
1351 CPDF_TextObject* pTextObj = Obj.m_pTextObj;
1352 CPDF_ContentMarkData* pMarkData =
1353 (CPDF_ContentMarkData*)pTextObj->m_ContentMark.GetObject();
1354 if (!pMarkData) {
1355 return;
1356 }
1357 int nContentMark = pMarkData->CountItems();
1358 if (nContentMark < 1) {
1359 return;
1360 }
1361 CFX_WideString actText;
1362 CPDF_Dictionary* pDict = NULL;
1363 int n = 0;
1364 for (n = 0; n < nContentMark; n++) {
1365 CPDF_ContentMarkItem& item = pMarkData->GetItem(n);
1366 CFX_ByteString tagStr = (CFX_ByteString)item.GetName();
1367 pDict = ToDictionary(static_cast<CPDF_Object*>(item.GetParam()));
1368 CPDF_String* temp =
1369 ToString(pDict ? pDict->GetElement("ActualText") : nullptr);
1370 if (temp) {
1371 actText = temp->GetUnicodeText();
1372 }
1373 }
1374 FX_STRSIZE nItems = actText.GetLength();
1375 if (nItems < 1) {
1376 return;
1377 }
1378 CPDF_Font* pFont = pTextObj->GetFont();
1379 CFX_Matrix formMatrix = Obj.m_formMatrix;
1380 CFX_Matrix matrix;
1381 pTextObj->GetTextMatrix(&matrix);
1382 matrix.Concat(formMatrix);
1383 FX_FLOAT fPosX = pTextObj->GetPosX();
1384 FX_FLOAT fPosY = pTextObj->GetPosY();
1385 int nCharInfoIndex = m_TextBuf.GetLength();
1386 CFX_FloatRect charBox;
1387 charBox.top = pTextObj->m_Top;
1388 charBox.left = pTextObj->m_Left;
1389 charBox.right = pTextObj->m_Right;
1390 charBox.bottom = pTextObj->m_Bottom;
1391 for (FX_STRSIZE k = 0; k < nItems; k++) {
1392 FX_WCHAR wChar = actText.GetAt(k);
1393 if (wChar <= 0x80 && !isprint(wChar)) {
1394 wChar = 0x20;
1395 }
1396 if (wChar >= 0xFFFD) {
1397 continue;
1398 }
1399 PAGECHAR_INFO charinfo;
1400 charinfo.m_OriginX = fPosX;
1401 charinfo.m_OriginY = fPosY;
1402 charinfo.m_Index = nCharInfoIndex;
1403 charinfo.m_Unicode = wChar;
1404 charinfo.m_CharCode = pFont->CharCodeFromUnicode(wChar);
1405 charinfo.m_Flag = FPDFTEXT_CHAR_PIECE;
1406 charinfo.m_pTextObj = pTextObj;
1407 charinfo.m_CharBox.top = charBox.top;
1408 charinfo.m_CharBox.left = charBox.left;
1409 charinfo.m_CharBox.right = charBox.right;
1410 charinfo.m_CharBox.bottom = charBox.bottom;
1411 charinfo.m_Matrix.Copy(matrix);
1412 m_TempTextBuf.AppendChar(wChar);
1413 m_TempCharList.Add(charinfo);
1414 }
1415 }
FindPreviousTextObject(void)1416 void CPDF_TextPage::FindPreviousTextObject(void) {
1417 if (m_TempCharList.GetSize() < 1 && m_charList.GetSize() < 1) {
1418 return;
1419 }
1420 PAGECHAR_INFO preChar;
1421 if (m_TempCharList.GetSize() >= 1) {
1422 preChar =
1423 *(PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - 1);
1424 } else {
1425 preChar = *(PAGECHAR_INFO*)m_charList.GetAt(m_charList.GetSize() - 1);
1426 }
1427 if (preChar.m_pTextObj) {
1428 m_pPreTextObj = preChar.m_pTextObj;
1429 }
1430 }
SwapTempTextBuf(int32_t iCharListStartAppend,int32_t iBufStartAppend)1431 void CPDF_TextPage::SwapTempTextBuf(int32_t iCharListStartAppend,
1432 int32_t iBufStartAppend) {
1433 int32_t i, j;
1434 i = iCharListStartAppend;
1435 j = m_TempCharList.GetSize() - 1;
1436 for (; i < j; i++, j--) {
1437 std::swap(m_TempCharList[i], m_TempCharList[j]);
1438 std::swap(m_TempCharList[i].m_Index, m_TempCharList[j].m_Index);
1439 }
1440 FX_WCHAR* pTempBuffer = m_TempTextBuf.GetBuffer();
1441 i = iBufStartAppend;
1442 j = m_TempTextBuf.GetLength() - 1;
1443 for (; i < j; i++, j--) {
1444 std::swap(pTempBuffer[i], pTempBuffer[j]);
1445 }
1446 }
IsRightToLeft(const CPDF_TextObject * pTextObj,const CPDF_Font * pFont,int nItems) const1447 FX_BOOL CPDF_TextPage::IsRightToLeft(const CPDF_TextObject* pTextObj,
1448 const CPDF_Font* pFont,
1449 int nItems) const {
1450 std::unique_ptr<CFX_BidiChar> pBidiChar(new CFX_BidiChar);
1451 int32_t nR2L = 0;
1452 int32_t nL2R = 0;
1453 int32_t start = 0, count = 0;
1454 CPDF_TextObjectItem item;
1455 for (int32_t i = 0; i < nItems; i++) {
1456 pTextObj->GetItemInfo(i, &item);
1457 if (item.m_CharCode == (FX_DWORD)-1) {
1458 continue;
1459 }
1460 CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
1461 FX_WCHAR wChar = wstrItem.GetAt(0);
1462 if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode) {
1463 wChar = (FX_WCHAR)item.m_CharCode;
1464 }
1465 if (!wChar) {
1466 continue;
1467 }
1468 if (pBidiChar->AppendChar(wChar)) {
1469 CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
1470 if (ret == CFX_BidiChar::RIGHT) {
1471 nR2L++;
1472 } else if (ret == CFX_BidiChar::LEFT) {
1473 nL2R++;
1474 }
1475 }
1476 }
1477 if (pBidiChar->EndChar()) {
1478 CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
1479 if (ret == CFX_BidiChar::RIGHT) {
1480 nR2L++;
1481 } else if (ret == CFX_BidiChar::LEFT) {
1482 nL2R++;
1483 }
1484 }
1485 return (nR2L > 0 && nR2L >= nL2R);
1486 }
ProcessTextObject(PDFTEXT_Obj Obj)1487 void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) {
1488 CPDF_TextObject* pTextObj = Obj.m_pTextObj;
1489 if (FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f) {
1490 return;
1491 }
1492 CFX_Matrix formMatrix = Obj.m_formMatrix;
1493 CPDF_Font* pFont = pTextObj->GetFont();
1494 CFX_Matrix matrix;
1495 pTextObj->GetTextMatrix(&matrix);
1496 matrix.Concat(formMatrix);
1497 int32_t bPreMKC = PreMarkedContent(Obj);
1498 if (FPDFTEXT_MC_DONE == bPreMKC) {
1499 m_pPreTextObj = pTextObj;
1500 m_perMatrix.Copy(formMatrix);
1501 return;
1502 }
1503 int result = 0;
1504 if (m_pPreTextObj) {
1505 result = ProcessInsertObject(pTextObj, formMatrix);
1506 if (2 == result) {
1507 m_CurlineRect =
1508 CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom,
1509 Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top);
1510 } else {
1511 m_CurlineRect.Union(
1512 CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom,
1513 Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top));
1514 }
1515 PAGECHAR_INFO generateChar;
1516 if (result == 1) {
1517 if (GenerateCharInfo(TEXT_BLANK_CHAR, generateChar)) {
1518 if (!formMatrix.IsIdentity()) {
1519 generateChar.m_Matrix.Copy(formMatrix);
1520 }
1521 m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR);
1522 m_TempCharList.Add(generateChar);
1523 }
1524 } else if (result == 2) {
1525 CloseTempLine();
1526 if (m_TextBuf.GetSize()) {
1527 if (m_ParseOptions.m_bGetCharCodeOnly) {
1528 m_TextBuf.AppendChar(TEXT_RETURN_CHAR);
1529 m_TextBuf.AppendChar(TEXT_LINEFEED_CHAR);
1530 } else {
1531 if (GenerateCharInfo(TEXT_RETURN_CHAR, generateChar)) {
1532 m_TextBuf.AppendChar(TEXT_RETURN_CHAR);
1533 if (!formMatrix.IsIdentity()) {
1534 generateChar.m_Matrix.Copy(formMatrix);
1535 }
1536 m_charList.Add(generateChar);
1537 }
1538 if (GenerateCharInfo(TEXT_LINEFEED_CHAR, generateChar)) {
1539 m_TextBuf.AppendChar(TEXT_LINEFEED_CHAR);
1540 if (!formMatrix.IsIdentity()) {
1541 generateChar.m_Matrix.Copy(formMatrix);
1542 }
1543 m_charList.Add(generateChar);
1544 }
1545 }
1546 }
1547 } else if (result == 3 && !m_ParseOptions.m_bOutputHyphen) {
1548 int32_t nChars = pTextObj->CountChars();
1549 if (nChars == 1) {
1550 CPDF_TextObjectItem item;
1551 pTextObj->GetCharInfo(0, &item);
1552 CFX_WideString wstrItem =
1553 pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
1554 if (wstrItem.IsEmpty()) {
1555 wstrItem += (FX_WCHAR)item.m_CharCode;
1556 }
1557 FX_WCHAR curChar = wstrItem.GetAt(0);
1558 if (0x2D == curChar || 0xAD == curChar) {
1559 return;
1560 }
1561 }
1562 while (m_TempTextBuf.GetSize() > 0 &&
1563 m_TempTextBuf.GetWideString().GetAt(m_TempTextBuf.GetLength() -
1564 1) == 0x20) {
1565 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1566 m_TempCharList.Delete(m_TempCharList.GetSize() - 1);
1567 }
1568 PAGECHAR_INFO* cha =
1569 (PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - 1);
1570 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1571 cha->m_Unicode = 0x2;
1572 cha->m_Flag = FPDFTEXT_CHAR_HYPHEN;
1573 m_TempTextBuf.AppendChar(0xfffe);
1574 }
1575 } else {
1576 m_CurlineRect =
1577 CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom,
1578 Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top);
1579 }
1580 if (FPDFTEXT_MC_DELAY == bPreMKC) {
1581 ProcessMarkedContent(Obj);
1582 m_pPreTextObj = pTextObj;
1583 m_perMatrix.Copy(formMatrix);
1584 return;
1585 }
1586 m_pPreTextObj = pTextObj;
1587 m_perMatrix.Copy(formMatrix);
1588 int nItems = pTextObj->CountItems();
1589 FX_FLOAT baseSpace = _CalculateBaseSpace(pTextObj, matrix);
1590
1591 const FX_BOOL bR2L = IsRightToLeft(pTextObj, pFont, nItems);
1592 const FX_BOOL bIsBidiAndMirrorInverse =
1593 bR2L && (matrix.a * matrix.d - matrix.b * matrix.c) < 0;
1594 int32_t iBufStartAppend = m_TempTextBuf.GetLength();
1595 int32_t iCharListStartAppend = m_TempCharList.GetSize();
1596
1597 FX_FLOAT spacing = 0;
1598 for (int i = 0; i < nItems; i++) {
1599 CPDF_TextObjectItem item;
1600 PAGECHAR_INFO charinfo;
1601 charinfo.m_OriginX = 0;
1602 charinfo.m_OriginY = 0;
1603 pTextObj->GetItemInfo(i, &item);
1604 if (item.m_CharCode == (FX_DWORD)-1) {
1605 CFX_WideString str = m_TempTextBuf.GetWideString();
1606 if (str.IsEmpty()) {
1607 str = m_TextBuf.GetWideString();
1608 }
1609 if (str.IsEmpty() || str.GetAt(str.GetLength() - 1) == TEXT_BLANK_CHAR) {
1610 continue;
1611 }
1612 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1613 spacing = -fontsize_h * item.m_OriginX / 1000;
1614 continue;
1615 }
1616 FX_FLOAT charSpace = pTextObj->m_TextState.GetObject()->m_CharSpace;
1617 if (charSpace > 0.001) {
1618 spacing += matrix.TransformDistance(charSpace);
1619 } else if (charSpace < -0.001) {
1620 spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
1621 }
1622 spacing -= baseSpace;
1623 if (spacing && i > 0) {
1624 int last_width = 0;
1625 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1626 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
1627 FX_FLOAT threshold = 0;
1628 if (space_charcode != -1) {
1629 threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
1630 }
1631 if (threshold > fontsize_h / 3) {
1632 threshold = 0;
1633 } else {
1634 threshold /= 2;
1635 }
1636 if (threshold == 0) {
1637 threshold = fontsize_h;
1638 int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
1639 threshold = this_width > last_width ? (FX_FLOAT)this_width
1640 : (FX_FLOAT)last_width;
1641 threshold = _NormalizeThreshold(threshold);
1642 threshold = fontsize_h * threshold / 1000;
1643 }
1644 if (threshold && (spacing && spacing >= threshold)) {
1645 charinfo.m_Unicode = TEXT_BLANK_CHAR;
1646 charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED;
1647 charinfo.m_pTextObj = pTextObj;
1648 charinfo.m_Index = m_TextBuf.GetLength();
1649 m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR);
1650 charinfo.m_CharCode = -1;
1651 charinfo.m_Matrix.Copy(formMatrix);
1652 matrix.Transform(item.m_OriginX, item.m_OriginY, charinfo.m_OriginX,
1653 charinfo.m_OriginY);
1654 charinfo.m_CharBox =
1655 CFX_FloatRect(charinfo.m_OriginX, charinfo.m_OriginY,
1656 charinfo.m_OriginX, charinfo.m_OriginY);
1657 m_TempCharList.Add(charinfo);
1658 }
1659 if (item.m_CharCode == (FX_DWORD)-1) {
1660 continue;
1661 }
1662 }
1663 spacing = 0;
1664 CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
1665 FX_BOOL bNoUnicode = FALSE;
1666 FX_WCHAR wChar = wstrItem.GetAt(0);
1667 if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode) {
1668 if (wstrItem.IsEmpty()) {
1669 wstrItem += (FX_WCHAR)item.m_CharCode;
1670 } else {
1671 wstrItem.SetAt(0, (FX_WCHAR)item.m_CharCode);
1672 }
1673 bNoUnicode = TRUE;
1674 }
1675 charinfo.m_Index = -1;
1676 charinfo.m_CharCode = item.m_CharCode;
1677 if (bNoUnicode) {
1678 charinfo.m_Flag = FPDFTEXT_CHAR_UNUNICODE;
1679 } else {
1680 charinfo.m_Flag = FPDFTEXT_CHAR_NORMAL;
1681 }
1682 charinfo.m_pTextObj = pTextObj;
1683 charinfo.m_OriginX = 0, charinfo.m_OriginY = 0;
1684 matrix.Transform(item.m_OriginX, item.m_OriginY, charinfo.m_OriginX,
1685 charinfo.m_OriginY);
1686 FX_RECT rect(0, 0, 0, 0);
1687 rect.Intersect(0, 0, 0, 0);
1688 charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode, rect);
1689 charinfo.m_CharBox.top =
1690 rect.top * pTextObj->GetFontSize() / 1000 + item.m_OriginY;
1691 charinfo.m_CharBox.left =
1692 rect.left * pTextObj->GetFontSize() / 1000 + item.m_OriginX;
1693 charinfo.m_CharBox.right =
1694 rect.right * pTextObj->GetFontSize() / 1000 + item.m_OriginX;
1695 charinfo.m_CharBox.bottom =
1696 rect.bottom * pTextObj->GetFontSize() / 1000 + item.m_OriginY;
1697 if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) < 0.01f) {
1698 charinfo.m_CharBox.top =
1699 charinfo.m_CharBox.bottom + pTextObj->GetFontSize();
1700 }
1701 if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) < 0.01f) {
1702 charinfo.m_CharBox.right =
1703 charinfo.m_CharBox.left + pTextObj->GetCharWidth(charinfo.m_CharCode);
1704 }
1705 matrix.TransformRect(charinfo.m_CharBox);
1706 charinfo.m_Matrix.Copy(matrix);
1707 if (wstrItem.IsEmpty()) {
1708 charinfo.m_Unicode = 0;
1709 m_TempCharList.Add(charinfo);
1710 m_TempTextBuf.AppendChar(0xfffe);
1711 continue;
1712 } else {
1713 int nTotal = wstrItem.GetLength();
1714 FX_BOOL bDel = FALSE;
1715 const int count = std::min(m_TempCharList.GetSize(), 7);
1716 FX_FLOAT threshold = charinfo.m_Matrix.TransformXDistance(
1717 (FX_FLOAT)TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize());
1718 for (int n = m_TempCharList.GetSize();
1719 n > m_TempCharList.GetSize() - count; n--) {
1720 PAGECHAR_INFO* charinfo1 = (PAGECHAR_INFO*)m_TempCharList.GetAt(n - 1);
1721 if (charinfo1->m_CharCode == charinfo.m_CharCode &&
1722 charinfo1->m_pTextObj->GetFont() ==
1723 charinfo.m_pTextObj->GetFont() &&
1724 FXSYS_fabs(charinfo1->m_OriginX - charinfo.m_OriginX) < threshold &&
1725 FXSYS_fabs(charinfo1->m_OriginY - charinfo.m_OriginY) < threshold) {
1726 bDel = TRUE;
1727 break;
1728 }
1729 }
1730 if (!bDel) {
1731 for (int nIndex = 0; nIndex < nTotal; nIndex++) {
1732 charinfo.m_Unicode = wstrItem.GetAt(nIndex);
1733 if (charinfo.m_Unicode) {
1734 charinfo.m_Index = m_TextBuf.GetLength();
1735 m_TempTextBuf.AppendChar(charinfo.m_Unicode);
1736 } else {
1737 m_TempTextBuf.AppendChar(0xfffe);
1738 }
1739 m_TempCharList.Add(charinfo);
1740 }
1741 } else if (i == 0) {
1742 CFX_WideString str = m_TempTextBuf.GetWideString();
1743 if (!str.IsEmpty() &&
1744 str.GetAt(str.GetLength() - 1) == TEXT_BLANK_CHAR) {
1745 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1746 m_TempCharList.Delete(m_TempCharList.GetSize() - 1);
1747 }
1748 }
1749 }
1750 }
1751 if (bIsBidiAndMirrorInverse) {
1752 SwapTempTextBuf(iCharListStartAppend, iBufStartAppend);
1753 }
1754 }
GetTextObjectWritingMode(const CPDF_TextObject * pTextObj)1755 int32_t CPDF_TextPage::GetTextObjectWritingMode(
1756 const CPDF_TextObject* pTextObj) {
1757 int32_t nChars = pTextObj->CountChars();
1758 if (nChars == 1) {
1759 return m_TextlineDir;
1760 }
1761 CPDF_TextObjectItem first, last;
1762 pTextObj->GetCharInfo(0, &first);
1763 pTextObj->GetCharInfo(nChars - 1, &last);
1764 CFX_Matrix textMatrix;
1765 pTextObj->GetTextMatrix(&textMatrix);
1766 textMatrix.TransformPoint(first.m_OriginX, first.m_OriginY);
1767 textMatrix.TransformPoint(last.m_OriginX, last.m_OriginY);
1768 FX_FLOAT dX = FXSYS_fabs(last.m_OriginX - first.m_OriginX);
1769 FX_FLOAT dY = FXSYS_fabs(last.m_OriginY - first.m_OriginY);
1770 if (dX <= 0.0001f && dY <= 0.0001f) {
1771 return -1;
1772 }
1773 CFX_VectorF v;
1774 v.Set(dX, dY);
1775 v.Normalize();
1776 if (v.y <= 0.0872f) {
1777 return v.x <= 0.0872f ? m_TextlineDir : 0;
1778 }
1779 if (v.x <= 0.0872f) {
1780 return 1;
1781 }
1782 return m_TextlineDir;
1783 }
IsHyphen(FX_WCHAR curChar)1784 FX_BOOL CPDF_TextPage::IsHyphen(FX_WCHAR curChar) {
1785 CFX_WideString strCurText = m_TempTextBuf.GetWideString();
1786 if (strCurText.GetLength() == 0) {
1787 strCurText = m_TextBuf.GetWideString();
1788 }
1789 FX_STRSIZE nCount = strCurText.GetLength();
1790 int nIndex = nCount - 1;
1791 FX_WCHAR wcTmp = strCurText.GetAt(nIndex);
1792 while (wcTmp == 0x20 && nIndex <= nCount - 1 && nIndex >= 0) {
1793 wcTmp = strCurText.GetAt(--nIndex);
1794 }
1795 if (0x2D == wcTmp || 0xAD == wcTmp) {
1796 if (--nIndex > 0) {
1797 FX_WCHAR preChar = strCurText.GetAt((nIndex));
1798 if (((preChar >= L'A' && preChar <= L'Z') ||
1799 (preChar >= L'a' && preChar <= L'z')) &&
1800 ((curChar >= L'A' && curChar <= L'Z') ||
1801 (curChar >= L'a' && curChar <= L'z'))) {
1802 return TRUE;
1803 }
1804 }
1805 int size = m_TempCharList.GetSize();
1806 PAGECHAR_INFO preChar;
1807 if (size) {
1808 preChar = (PAGECHAR_INFO)m_TempCharList[size - 1];
1809 } else {
1810 size = m_charList.GetSize();
1811 if (size == 0) {
1812 return FALSE;
1813 }
1814 preChar = (PAGECHAR_INFO)m_charList[size - 1];
1815 }
1816 if (FPDFTEXT_CHAR_PIECE == preChar.m_Flag &&
1817 (0xAD == preChar.m_Unicode || 0x2D == preChar.m_Unicode)) {
1818 return TRUE;
1819 }
1820 }
1821 return FALSE;
1822 }
ProcessInsertObject(const CPDF_TextObject * pObj,const CFX_Matrix & formMatrix)1823 int CPDF_TextPage::ProcessInsertObject(const CPDF_TextObject* pObj,
1824 const CFX_Matrix& formMatrix) {
1825 FindPreviousTextObject();
1826 FX_BOOL bNewline = FALSE;
1827 int WritingMode = GetTextObjectWritingMode(pObj);
1828 if (WritingMode == -1) {
1829 WritingMode = GetTextObjectWritingMode(m_pPreTextObj);
1830 }
1831 CFX_FloatRect this_rect(pObj->m_Left, pObj->m_Bottom, pObj->m_Right,
1832 pObj->m_Top);
1833 CFX_FloatRect prev_rect(m_pPreTextObj->m_Left, m_pPreTextObj->m_Bottom,
1834 m_pPreTextObj->m_Right, m_pPreTextObj->m_Top);
1835 CPDF_TextObjectItem PrevItem, item;
1836 int nItem = m_pPreTextObj->CountItems();
1837 m_pPreTextObj->GetItemInfo(nItem - 1, &PrevItem);
1838 pObj->GetItemInfo(0, &item);
1839 CFX_WideString wstrItem =
1840 pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
1841 if (wstrItem.IsEmpty()) {
1842 wstrItem += (FX_WCHAR)item.m_CharCode;
1843 }
1844 FX_WCHAR curChar = wstrItem.GetAt(0);
1845 if (WritingMode == 0) {
1846 if (this_rect.Height() > 4.5 && prev_rect.Height() > 4.5) {
1847 FX_FLOAT top =
1848 this_rect.top < prev_rect.top ? this_rect.top : prev_rect.top;
1849 FX_FLOAT bottom = this_rect.bottom > prev_rect.bottom ? this_rect.bottom
1850 : prev_rect.bottom;
1851 if (bottom >= top) {
1852 if (IsHyphen(curChar)) {
1853 return 3;
1854 }
1855 return 2;
1856 }
1857 }
1858 } else if (WritingMode == 1) {
1859 if (this_rect.Width() > pObj->GetFontSize() * 0.1f &&
1860 prev_rect.Width() > m_pPreTextObj->GetFontSize() * 0.1f) {
1861 FX_FLOAT left = this_rect.left > m_CurlineRect.left ? this_rect.left
1862 : m_CurlineRect.left;
1863 FX_FLOAT right = this_rect.right < m_CurlineRect.right
1864 ? this_rect.right
1865 : m_CurlineRect.right;
1866 if (right <= left) {
1867 if (IsHyphen(curChar)) {
1868 return 3;
1869 }
1870 return 2;
1871 }
1872 }
1873 }
1874 FX_FLOAT last_pos = PrevItem.m_OriginX;
1875 int nLastWidth = GetCharWidth(PrevItem.m_CharCode, m_pPreTextObj->GetFont());
1876 FX_FLOAT last_width = nLastWidth * m_pPreTextObj->GetFontSize() / 1000;
1877 last_width = FXSYS_fabs(last_width);
1878 int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont());
1879 FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000;
1880 this_width = FXSYS_fabs(this_width);
1881 FX_FLOAT threshold =
1882 last_width > this_width ? last_width / 4 : this_width / 4;
1883 CFX_Matrix prev_matrix, prev_reverse;
1884 m_pPreTextObj->GetTextMatrix(&prev_matrix);
1885 prev_matrix.Concat(m_perMatrix);
1886 prev_reverse.SetReverse(prev_matrix);
1887 FX_FLOAT x = pObj->GetPosX();
1888 FX_FLOAT y = pObj->GetPosY();
1889 formMatrix.Transform(x, y);
1890 prev_reverse.Transform(x, y);
1891 if (last_width < this_width) {
1892 threshold = prev_reverse.TransformDistance(threshold);
1893 }
1894 CFX_FloatRect rect1(m_pPreTextObj->m_Left, pObj->m_Bottom,
1895 m_pPreTextObj->m_Right, pObj->m_Top);
1896 CFX_FloatRect rect2(m_pPreTextObj->m_Left, m_pPreTextObj->m_Bottom,
1897 m_pPreTextObj->m_Right, m_pPreTextObj->m_Top);
1898 CFX_FloatRect rect3 = rect1;
1899 rect1.Intersect(rect2);
1900 if (WritingMode == 0) {
1901 if ((rect1.IsEmpty() && rect2.Height() > 5 && rect3.Height() > 5) ||
1902 ((y > threshold * 2 || y < threshold * -3) &&
1903 (FXSYS_fabs(y) < 1 ? FXSYS_fabs(x) < FXSYS_fabs(y) : TRUE))) {
1904 bNewline = TRUE;
1905 if (nItem > 1) {
1906 CPDF_TextObjectItem tempItem;
1907 m_pPreTextObj->GetItemInfo(0, &tempItem);
1908 CFX_Matrix m;
1909 m_pPreTextObj->GetTextMatrix(&m);
1910 if (PrevItem.m_OriginX > tempItem.m_OriginX &&
1911 m_DisplayMatrix.a > 0.9 && m_DisplayMatrix.b < 0.1 &&
1912 m_DisplayMatrix.c < 0.1 && m_DisplayMatrix.d < -0.9 && m.b < 0.1 &&
1913 m.c < 0.1) {
1914 CFX_FloatRect re(0, m_pPreTextObj->m_Bottom, 1000,
1915 m_pPreTextObj->m_Top);
1916 if (re.Contains(pObj->GetPosX(), pObj->GetPosY())) {
1917 bNewline = FALSE;
1918 } else {
1919 CFX_FloatRect re(0, pObj->m_Bottom, 1000, pObj->m_Top);
1920 if (re.Contains(m_pPreTextObj->GetPosX(),
1921 m_pPreTextObj->GetPosY())) {
1922 bNewline = FALSE;
1923 }
1924 }
1925 }
1926 }
1927 }
1928 }
1929 if (bNewline)
1930 return IsHyphen(curChar) ? 3 : 2;
1931
1932 int32_t nChars = pObj->CountChars();
1933 if (nChars == 1 && (0x2D == curChar || 0xAD == curChar) &&
1934 IsHyphen(curChar)) {
1935 return 3;
1936 }
1937 CFX_WideString PrevStr =
1938 m_pPreTextObj->GetFont()->UnicodeFromCharCode(PrevItem.m_CharCode);
1939 FX_WCHAR preChar = PrevStr.GetAt(PrevStr.GetLength() - 1);
1940 CFX_Matrix matrix;
1941 pObj->GetTextMatrix(&matrix);
1942 matrix.Concat(formMatrix);
1943 threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth);
1944 threshold = threshold > 400
1945 ? (threshold < 700
1946 ? threshold / 4
1947 : (threshold > 800 ? threshold / 6 : threshold / 5))
1948 : (threshold / 2);
1949 if (nLastWidth >= nThisWidth) {
1950 threshold *= FXSYS_fabs(m_pPreTextObj->GetFontSize());
1951 } else {
1952 threshold *= FXSYS_fabs(pObj->GetFontSize());
1953 threshold = matrix.TransformDistance(threshold);
1954 threshold = prev_reverse.TransformDistance(threshold);
1955 }
1956 threshold /= 1000;
1957 if ((threshold < 1.4881 && threshold > 1.4879) ||
1958 (threshold < 1.39001 && threshold > 1.38999)) {
1959 threshold *= 1.5;
1960 }
1961 if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' &&
1962 preChar != L' ') {
1963 if (curChar != L' ' && preChar != L' ') {
1964 if ((x - last_pos - last_width) > threshold ||
1965 (last_pos - x - last_width) > threshold) {
1966 return 1;
1967 }
1968 if (x < 0 && (last_pos - x - last_width) > threshold) {
1969 return 1;
1970 }
1971 if ((x - last_pos - last_width) > this_width ||
1972 (x - last_pos - this_width) > last_width) {
1973 return 1;
1974 }
1975 }
1976 }
1977 return 0;
1978 }
IsSameTextObject(CPDF_TextObject * pTextObj1,CPDF_TextObject * pTextObj2)1979 FX_BOOL CPDF_TextPage::IsSameTextObject(CPDF_TextObject* pTextObj1,
1980 CPDF_TextObject* pTextObj2) {
1981 if (!pTextObj1 || !pTextObj2) {
1982 return FALSE;
1983 }
1984 CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom,
1985 pTextObj2->m_Right, pTextObj2->m_Top);
1986 CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom,
1987 pTextObj1->m_Right, pTextObj1->m_Top);
1988 if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty() &&
1989 !m_ParseOptions.m_bGetCharCodeOnly) {
1990 FX_FLOAT dbXdif = FXSYS_fabs(rcPreObj.left - rcCurObj.left);
1991 int nCount = m_charList.GetSize();
1992 if (nCount >= 2) {
1993 PAGECHAR_INFO perCharTemp = (PAGECHAR_INFO)m_charList[nCount - 2];
1994 FX_FLOAT dbSpace = perCharTemp.m_CharBox.Width();
1995 if (dbXdif > dbSpace) {
1996 return FALSE;
1997 }
1998 }
1999 }
2000 if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) {
2001 rcPreObj.Intersect(rcCurObj);
2002 if (rcPreObj.IsEmpty()) {
2003 return FALSE;
2004 }
2005 if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) >
2006 rcCurObj.Width() / 2) {
2007 return FALSE;
2008 }
2009 if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) {
2010 return FALSE;
2011 }
2012 }
2013 int nPreCount = pTextObj2->CountItems();
2014 int nCurCount = pTextObj1->CountItems();
2015 if (nPreCount != nCurCount) {
2016 return FALSE;
2017 }
2018 CPDF_TextObjectItem itemPer, itemCur;
2019 for (int i = 0; i < nPreCount; i++) {
2020 pTextObj2->GetItemInfo(i, &itemPer);
2021 pTextObj1->GetItemInfo(i, &itemCur);
2022 if (itemCur.m_CharCode != itemPer.m_CharCode) {
2023 return FALSE;
2024 }
2025 }
2026 if (FXSYS_fabs(pTextObj1->GetPosX() - pTextObj2->GetPosX()) >
2027 GetCharWidth(itemPer.m_CharCode, pTextObj2->GetFont()) *
2028 pTextObj2->GetFontSize() / 1000 * 0.9 ||
2029 FXSYS_fabs(pTextObj1->GetPosY() - pTextObj2->GetPosY()) >
2030 std::max(std::max(rcPreObj.Height(), rcPreObj.Width()),
2031 pTextObj2->GetFontSize()) /
2032 8) {
2033 return FALSE;
2034 }
2035 return TRUE;
2036 }
IsSameAsPreTextObject(CPDF_TextObject * pTextObj,FX_POSITION ObjPos)2037 FX_BOOL CPDF_TextPage::IsSameAsPreTextObject(CPDF_TextObject* pTextObj,
2038 FX_POSITION ObjPos) {
2039 if (!pTextObj) {
2040 return FALSE;
2041 }
2042 int i = 0;
2043 if (!ObjPos) {
2044 ObjPos = m_pPage->GetLastObjectPosition();
2045 }
2046 CPDF_PageObject* pObj = m_pPage->GetPrevObject(ObjPos);
2047 while (i < 5 && ObjPos) {
2048 pObj = m_pPage->GetPrevObject(ObjPos);
2049 if (pObj == pTextObj) {
2050 continue;
2051 }
2052 if (pObj->m_Type != PDFPAGE_TEXT) {
2053 continue;
2054 }
2055 if (IsSameTextObject((CPDF_TextObject*)pObj, pTextObj)) {
2056 return TRUE;
2057 }
2058 i++;
2059 }
2060 return FALSE;
2061 }
2062
GenerateCharInfo(FX_WCHAR unicode,PAGECHAR_INFO & info)2063 FX_BOOL CPDF_TextPage::GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info) {
2064 int size = m_TempCharList.GetSize();
2065 PAGECHAR_INFO preChar;
2066 if (size) {
2067 preChar = (PAGECHAR_INFO)m_TempCharList[size - 1];
2068 } else {
2069 size = m_charList.GetSize();
2070 if (size == 0) {
2071 return FALSE;
2072 }
2073 preChar = (PAGECHAR_INFO)m_charList[size - 1];
2074 }
2075 info.m_Index = m_TextBuf.GetLength();
2076 info.m_Unicode = unicode;
2077 info.m_pTextObj = NULL;
2078 info.m_CharCode = -1;
2079 info.m_Flag = FPDFTEXT_CHAR_GENERATED;
2080 int preWidth = 0;
2081 if (preChar.m_pTextObj && preChar.m_CharCode != (FX_DWORD)-1)
2082 preWidth = GetCharWidth(preChar.m_CharCode, preChar.m_pTextObj->GetFont());
2083
2084 FX_FLOAT fFontSize = preChar.m_pTextObj ? preChar.m_pTextObj->GetFontSize()
2085 : preChar.m_CharBox.Height();
2086 if (!fFontSize)
2087 fFontSize = kDefaultFontSize;
2088
2089 info.m_OriginX = preChar.m_OriginX + preWidth * (fFontSize) / 1000;
2090 info.m_OriginY = preChar.m_OriginY;
2091 info.m_CharBox = CFX_FloatRect(info.m_OriginX, info.m_OriginY, info.m_OriginX,
2092 info.m_OriginY);
2093 return TRUE;
2094 }
2095
IsRectIntersect(const CFX_FloatRect & rect1,const CFX_FloatRect & rect2)2096 FX_BOOL CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1,
2097 const CFX_FloatRect& rect2) {
2098 CFX_FloatRect rect = rect1;
2099 rect.Intersect(rect2);
2100 return !rect.IsEmpty();
2101 }
IsLetter(FX_WCHAR unicode)2102 FX_BOOL CPDF_TextPage::IsLetter(FX_WCHAR unicode) {
2103 if (unicode < L'A') {
2104 return FALSE;
2105 }
2106 if (unicode > L'Z' && unicode < L'a') {
2107 return FALSE;
2108 }
2109 if (unicode > L'z') {
2110 return FALSE;
2111 }
2112 return TRUE;
2113 }
CPDF_TextPageFind(const IPDF_TextPage * pTextPage)2114 CPDF_TextPageFind::CPDF_TextPageFind(const IPDF_TextPage* pTextPage)
2115 : m_pTextPage(pTextPage),
2116 m_flags(0),
2117 m_findNextStart(-1),
2118 m_findPreStart(-1),
2119 m_bMatchCase(FALSE),
2120 m_bMatchWholeWord(FALSE),
2121 m_resStart(0),
2122 m_resEnd(-1),
2123 m_IsFind(FALSE) {
2124 m_strText = m_pTextPage->GetPageText();
2125 int nCount = pTextPage->CountChars();
2126 if (nCount) {
2127 m_CharIndex.Add(0);
2128 }
2129 for (int i = 0; i < nCount; i++) {
2130 FPDF_CHAR_INFO info;
2131 pTextPage->GetCharInfo(i, &info);
2132 int indexSize = m_CharIndex.GetSize();
2133 if (info.m_Flag == CHAR_NORMAL || info.m_Flag == CHAR_GENERATED) {
2134 if (indexSize % 2) {
2135 m_CharIndex.Add(1);
2136 } else {
2137 if (indexSize <= 0) {
2138 continue;
2139 }
2140 m_CharIndex.SetAt(indexSize - 1, m_CharIndex.GetAt(indexSize - 1) + 1);
2141 }
2142 } else {
2143 if (indexSize % 2) {
2144 if (indexSize <= 0) {
2145 continue;
2146 }
2147 m_CharIndex.SetAt(indexSize - 1, i + 1);
2148 } else {
2149 m_CharIndex.Add(i + 1);
2150 }
2151 }
2152 }
2153 int indexSize = m_CharIndex.GetSize();
2154 if (indexSize % 2) {
2155 m_CharIndex.RemoveAt(indexSize - 1);
2156 }
2157 }
GetCharIndex(int index) const2158 int CPDF_TextPageFind::GetCharIndex(int index) const {
2159 return m_pTextPage->CharIndexFromTextIndex(index);
2160 int indexSize = m_CharIndex.GetSize();
2161 int count = 0;
2162 for (int i = 0; i < indexSize; i += 2) {
2163 count += m_CharIndex.GetAt(i + 1);
2164 if (count > index) {
2165 return index - count + m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i);
2166 }
2167 }
2168 return -1;
2169 }
FindFirst(const CFX_WideString & findwhat,int flags,int startPos)2170 FX_BOOL CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat,
2171 int flags,
2172 int startPos) {
2173 if (!m_pTextPage) {
2174 return FALSE;
2175 }
2176 if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) {
2177 m_strText = m_pTextPage->GetPageText();
2178 }
2179 CFX_WideString findwhatStr = findwhat;
2180 m_findWhat = findwhatStr;
2181 m_flags = flags;
2182 m_bMatchCase = flags & FPDFTEXT_MATCHCASE;
2183 if (m_strText.IsEmpty()) {
2184 m_IsFind = FALSE;
2185 return TRUE;
2186 }
2187 FX_STRSIZE len = findwhatStr.GetLength();
2188 if (!m_bMatchCase) {
2189 findwhatStr.MakeLower();
2190 m_strText.MakeLower();
2191 }
2192 m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD;
2193 m_findNextStart = startPos;
2194 if (startPos == -1) {
2195 m_findPreStart = m_strText.GetLength() - 1;
2196 } else {
2197 m_findPreStart = startPos;
2198 }
2199 m_csFindWhatArray.RemoveAll();
2200 int i = 0;
2201 while (i < len) {
2202 if (findwhatStr.GetAt(i) != ' ') {
2203 break;
2204 }
2205 i++;
2206 }
2207 if (i < len) {
2208 ExtractFindWhat(findwhatStr);
2209 } else {
2210 m_csFindWhatArray.Add(findwhatStr);
2211 }
2212 if (m_csFindWhatArray.GetSize() <= 0) {
2213 return FALSE;
2214 }
2215 m_IsFind = TRUE;
2216 m_resStart = 0;
2217 m_resEnd = -1;
2218 return TRUE;
2219 }
FindNext()2220 FX_BOOL CPDF_TextPageFind::FindNext() {
2221 if (!m_pTextPage) {
2222 return FALSE;
2223 }
2224 m_resArray.RemoveAll();
2225 if (m_findNextStart == -1) {
2226 return FALSE;
2227 }
2228 if (m_strText.IsEmpty()) {
2229 m_IsFind = FALSE;
2230 return m_IsFind;
2231 }
2232 int strLen = m_strText.GetLength();
2233 if (m_findNextStart > strLen - 1) {
2234 m_IsFind = FALSE;
2235 return m_IsFind;
2236 }
2237 int nCount = m_csFindWhatArray.GetSize();
2238 int nResultPos = 0;
2239 int nStartPos = 0;
2240 nStartPos = m_findNextStart;
2241 FX_BOOL bSpaceStart = FALSE;
2242 for (int iWord = 0; iWord < nCount; iWord++) {
2243 CFX_WideString csWord = m_csFindWhatArray[iWord];
2244 if (csWord.IsEmpty()) {
2245 if (iWord == nCount - 1) {
2246 FX_WCHAR strInsert = m_strText.GetAt(nStartPos);
2247 if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_BLANK_CHAR ||
2248 strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
2249 nResultPos = nStartPos + 1;
2250 break;
2251 }
2252 iWord = -1;
2253 } else if (iWord == 0) {
2254 bSpaceStart = TRUE;
2255 }
2256 continue;
2257 }
2258 int endIndex;
2259 nResultPos = m_strText.Find(csWord.c_str(), nStartPos);
2260 if (nResultPos == -1) {
2261 m_IsFind = FALSE;
2262 return m_IsFind;
2263 }
2264 endIndex = nResultPos + csWord.GetLength() - 1;
2265 if (iWord == 0) {
2266 m_resStart = nResultPos;
2267 }
2268 FX_BOOL bMatch = TRUE;
2269 if (iWord != 0 && !bSpaceStart) {
2270 int PreResEndPos = nStartPos;
2271 int curChar = csWord.GetAt(0);
2272 CFX_WideString lastWord = m_csFindWhatArray[iWord - 1];
2273 int lastChar = lastWord.GetAt(lastWord.GetLength() - 1);
2274 if (nStartPos == nResultPos &&
2275 !(_IsIgnoreSpaceCharacter(lastChar) ||
2276 _IsIgnoreSpaceCharacter(curChar))) {
2277 bMatch = FALSE;
2278 }
2279 for (int d = PreResEndPos; d < nResultPos; d++) {
2280 FX_WCHAR strInsert = m_strText.GetAt(d);
2281 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_BLANK_CHAR &&
2282 strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
2283 bMatch = FALSE;
2284 break;
2285 }
2286 }
2287 } else if (bSpaceStart) {
2288 if (nResultPos > 0) {
2289 FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1);
2290 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_BLANK_CHAR &&
2291 strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
2292 bMatch = FALSE;
2293 m_resStart = nResultPos;
2294 } else {
2295 m_resStart = nResultPos - 1;
2296 }
2297 }
2298 }
2299 if (m_bMatchWholeWord && bMatch) {
2300 bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex);
2301 }
2302 nStartPos = endIndex + 1;
2303 if (!bMatch) {
2304 iWord = -1;
2305 if (bSpaceStart) {
2306 nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
2307 } else {
2308 nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
2309 }
2310 }
2311 }
2312 m_resEnd = nResultPos +
2313 m_csFindWhatArray[m_csFindWhatArray.GetSize() - 1].GetLength() - 1;
2314 m_IsFind = TRUE;
2315 int resStart = GetCharIndex(m_resStart);
2316 int resEnd = GetCharIndex(m_resEnd);
2317 m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1, m_resArray);
2318 if (m_flags & FPDFTEXT_CONSECUTIVE) {
2319 m_findNextStart = m_resStart + 1;
2320 m_findPreStart = m_resEnd - 1;
2321 } else {
2322 m_findNextStart = m_resEnd + 1;
2323 m_findPreStart = m_resStart - 1;
2324 }
2325 return m_IsFind;
2326 }
FindPrev()2327 FX_BOOL CPDF_TextPageFind::FindPrev() {
2328 if (!m_pTextPage) {
2329 return FALSE;
2330 }
2331 m_resArray.RemoveAll();
2332 if (m_strText.IsEmpty() || m_findPreStart < 0) {
2333 m_IsFind = FALSE;
2334 return m_IsFind;
2335 }
2336 CPDF_TextPageFind findEngine(m_pTextPage);
2337 FX_BOOL ret = findEngine.FindFirst(m_findWhat, m_flags);
2338 if (!ret) {
2339 m_IsFind = FALSE;
2340 return m_IsFind;
2341 }
2342 int order = -1, MatchedCount = 0;
2343 while (ret) {
2344 ret = findEngine.FindNext();
2345 if (ret) {
2346 int order1 = findEngine.GetCurOrder();
2347 int MatchedCount1 = findEngine.GetMatchedCount();
2348 if (((order1 + MatchedCount1) - 1) > m_findPreStart) {
2349 break;
2350 }
2351 order = order1;
2352 MatchedCount = MatchedCount1;
2353 }
2354 }
2355 if (order == -1) {
2356 m_IsFind = FALSE;
2357 return m_IsFind;
2358 }
2359 m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
2360 m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1);
2361 m_IsFind = TRUE;
2362 m_pTextPage->GetRectArray(order, MatchedCount, m_resArray);
2363 if (m_flags & FPDFTEXT_CONSECUTIVE) {
2364 m_findNextStart = m_resStart + 1;
2365 m_findPreStart = m_resEnd - 1;
2366 } else {
2367 m_findNextStart = m_resEnd + 1;
2368 m_findPreStart = m_resStart - 1;
2369 }
2370 return m_IsFind;
2371 }
ExtractFindWhat(const CFX_WideString & findwhat)2372 void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) {
2373 if (findwhat.IsEmpty()) {
2374 return;
2375 }
2376 int index = 0;
2377 while (1) {
2378 CFX_WideString csWord = TEXT_EMPTY;
2379 int ret =
2380 ExtractSubString(csWord, findwhat.c_str(), index, TEXT_BLANK_CHAR);
2381 if (csWord.IsEmpty()) {
2382 if (ret) {
2383 m_csFindWhatArray.Add(CFX_WideString(L""));
2384 index++;
2385 continue;
2386 } else {
2387 break;
2388 }
2389 }
2390 int pos = 0;
2391 while (pos < csWord.GetLength()) {
2392 CFX_WideString curStr = csWord.Mid(pos, 1);
2393 FX_WCHAR curChar = csWord.GetAt(pos);
2394 if (_IsIgnoreSpaceCharacter(curChar)) {
2395 if (pos > 0 && curChar == 0x2019) {
2396 pos++;
2397 continue;
2398 }
2399 if (pos > 0) {
2400 CFX_WideString preStr = csWord.Mid(0, pos);
2401 m_csFindWhatArray.Add(preStr);
2402 }
2403 m_csFindWhatArray.Add(curStr);
2404 if (pos == csWord.GetLength() - 1) {
2405 csWord.Empty();
2406 break;
2407 }
2408 csWord = csWord.Right(csWord.GetLength() - pos - 1);
2409 pos = 0;
2410 continue;
2411 }
2412 pos++;
2413 }
2414 if (!csWord.IsEmpty()) {
2415 m_csFindWhatArray.Add(csWord);
2416 }
2417 index++;
2418 }
2419 }
IsMatchWholeWord(const CFX_WideString & csPageText,int startPos,int endPos)2420 FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText,
2421 int startPos,
2422 int endPos) {
2423 FX_WCHAR char_left = 0;
2424 FX_WCHAR char_right = 0;
2425 int char_count = endPos - startPos + 1;
2426 if (char_count < 1) {
2427 return FALSE;
2428 }
2429 if (char_count == 1 && csPageText.GetAt(startPos) > 255) {
2430 return TRUE;
2431 }
2432 if (startPos - 1 >= 0) {
2433 char_left = csPageText.GetAt(startPos - 1);
2434 }
2435 if (startPos + char_count < csPageText.GetLength()) {
2436 char_right = csPageText.GetAt(startPos + char_count);
2437 }
2438 if ((char_left > 'A' && char_left < 'a') ||
2439 (char_left > 'a' && char_left < 'z') ||
2440 (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) ||
2441 (char_right > 'A' && char_right < 'a') ||
2442 (char_right > 'a' && char_right < 'z') ||
2443 (char_right > 0xfb00 && char_right < 0xfb06) ||
2444 std::iswdigit(char_right)) {
2445 return FALSE;
2446 }
2447 if (!(('A' > char_left || char_left > 'Z') &&
2448 ('a' > char_left || char_left > 'z') &&
2449 ('A' > char_right || char_right > 'Z') &&
2450 ('a' > char_right || char_right > 'z'))) {
2451 return FALSE;
2452 }
2453 if (char_count > 0) {
2454 if (csPageText.GetAt(startPos) >= L'0' &&
2455 csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' &&
2456 char_left <= L'9') {
2457 return FALSE;
2458 }
2459 if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' &&
2460 char_right >= L'0' && char_right <= L'9') {
2461 return FALSE;
2462 }
2463 }
2464 return TRUE;
2465 }
ExtractSubString(CFX_WideString & rString,const FX_WCHAR * lpszFullString,int iSubString,FX_WCHAR chSep)2466 FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString,
2467 const FX_WCHAR* lpszFullString,
2468 int iSubString,
2469 FX_WCHAR chSep) {
2470 if (!lpszFullString) {
2471 return FALSE;
2472 }
2473 while (iSubString--) {
2474 lpszFullString = FXSYS_wcschr(lpszFullString, chSep);
2475 if (!lpszFullString) {
2476 rString.Empty();
2477 return FALSE;
2478 }
2479 lpszFullString++;
2480 while (*lpszFullString == chSep) {
2481 lpszFullString++;
2482 }
2483 }
2484 const FX_WCHAR* lpchEnd = FXSYS_wcschr(lpszFullString, chSep);
2485 int nLen = lpchEnd ? (int)(lpchEnd - lpszFullString)
2486 : (int)FXSYS_wcslen(lpszFullString);
2487 ASSERT(nLen >= 0);
2488 FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString,
2489 nLen * sizeof(FX_WCHAR));
2490 rString.ReleaseBuffer();
2491 return TRUE;
2492 }
// Returns a copy of |str| with its characters in reverse order.
CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) {
  CFX_WideString csReversed;
  for (int nRemaining = str.GetLength(); nRemaining > 0; --nRemaining) {
    csReversed += str.GetAt(nRemaining - 1);
  }
  return csReversed;
}
GetRectArray(CFX_RectArray & rects) const2502 void CPDF_TextPageFind::GetRectArray(CFX_RectArray& rects) const {
2503 rects.Copy(m_resArray);
2504 }
GetCurOrder() const2505 int CPDF_TextPageFind::GetCurOrder() const {
2506 return GetCharIndex(m_resStart);
2507 }
GetMatchedCount() const2508 int CPDF_TextPageFind::GetMatchedCount() const {
2509 int resStart = GetCharIndex(m_resStart);
2510 int resEnd = GetCharIndex(m_resEnd);
2511 return resEnd - resStart + 1;
2512 }
2513
CPDF_LinkExtract()2514 CPDF_LinkExtract::CPDF_LinkExtract()
2515 : m_pTextPage(nullptr), m_bIsParsed(false) {
2516 }
2517
~CPDF_LinkExtract()2518 CPDF_LinkExtract::~CPDF_LinkExtract() {
2519 DeleteLinkList();
2520 }
2521
ExtractLinks(const IPDF_TextPage * pTextPage)2522 FX_BOOL CPDF_LinkExtract::ExtractLinks(const IPDF_TextPage* pTextPage) {
2523 if (!pTextPage || !pTextPage->IsParsed())
2524 return FALSE;
2525
2526 m_pTextPage = (const CPDF_TextPage*)pTextPage;
2527 m_strPageText = m_pTextPage->GetPageText(0, -1);
2528 DeleteLinkList();
2529 if (m_strPageText.IsEmpty()) {
2530 return FALSE;
2531 }
2532 ParseLink();
2533 m_bIsParsed = true;
2534 return TRUE;
2535 }
2536
DeleteLinkList()2537 void CPDF_LinkExtract::DeleteLinkList() {
2538 while (m_LinkList.GetSize()) {
2539 CPDF_LinkExt* linkinfo = NULL;
2540 linkinfo = m_LinkList.GetAt(0);
2541 m_LinkList.RemoveAt(0);
2542 delete linkinfo;
2543 }
2544 m_LinkList.RemoveAll();
2545 }
CountLinks() const2546 int CPDF_LinkExtract::CountLinks() const {
2547 if (!m_bIsParsed) {
2548 return -1;
2549 }
2550 return m_LinkList.GetSize();
2551 }
ParseLink()2552 void CPDF_LinkExtract::ParseLink() {
2553 int start = 0, pos = 0;
2554 int TotalChar = m_pTextPage->CountChars();
2555 while (pos < TotalChar) {
2556 FPDF_CHAR_INFO pageChar;
2557 m_pTextPage->GetCharInfo(pos, &pageChar);
2558 if (pageChar.m_Flag == CHAR_GENERATED || pageChar.m_Unicode == 0x20 ||
2559 pos == TotalChar - 1) {
2560 int nCount = pos - start;
2561 if (pos == TotalChar - 1) {
2562 nCount++;
2563 }
2564 CFX_WideString strBeCheck;
2565 strBeCheck = m_pTextPage->GetPageText(start, nCount);
2566 if (strBeCheck.GetLength() > 5) {
2567 while (strBeCheck.GetLength() > 0) {
2568 FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);
2569 if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {
2570 strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1);
2571 nCount--;
2572 } else {
2573 break;
2574 }
2575 }
2576 if (nCount > 5 &&
2577 (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) {
2578 AppendToLinkList(start, nCount, strBeCheck);
2579 }
2580 }
2581 start = ++pos;
2582 } else {
2583 pos++;
2584 }
2585 }
2586 }
CheckWebLink(CFX_WideString & strBeCheck)2587 FX_BOOL CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
2588 CFX_WideString str = strBeCheck;
2589 str.MakeLower();
2590 if (str.Find(L"http://www.") != -1) {
2591 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www."));
2592 return TRUE;
2593 }
2594 if (str.Find(L"http://") != -1) {
2595 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://"));
2596 return TRUE;
2597 }
2598 if (str.Find(L"https://www.") != -1) {
2599 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www."));
2600 return TRUE;
2601 }
2602 if (str.Find(L"https://") != -1) {
2603 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));
2604 return TRUE;
2605 }
2606 if (str.Find(L"www.") != -1) {
2607 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));
2608 strBeCheck = L"http://" + strBeCheck;
2609 return TRUE;
2610 }
2611 return FALSE;
2612 }
CheckMailLink(CFX_WideString & str)2613 bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
2614 int aPos = str.Find(L'@');
2615 // Invalid when no '@'.
2616 if (aPos < 1) {
2617 return FALSE;
2618 }
2619
2620 // Check the local part.
2621 int pPos = aPos; // Used to track the position of '@' or '.'.
2622 for (int i = aPos - 1; i >= 0; i--) {
2623 FX_WCHAR ch = str.GetAt(i);
2624 if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) {
2625 continue;
2626 }
2627 if (ch != L'.' || i == pPos - 1 || i == 0) {
2628 if (i == aPos - 1) {
2629 // There is '.' or invalid char before '@'.
2630 return FALSE;
2631 }
2632 // End extracting for other invalid chars, '.' at the beginning, or
2633 // consecutive '.'.
2634 int removed_len = i == pPos - 1 ? i + 2 : i + 1;
2635 str = str.Right(str.GetLength() - removed_len);
2636 break;
2637 }
2638 // Found a valid '.'.
2639 pPos = i;
2640 }
2641
2642 // Check the domain name part.
2643 aPos = str.Find(L'@');
2644 if (aPos < 1) {
2645 return FALSE;
2646 }
2647 str.TrimRight(L'.');
2648 // At least one '.' in domain name, but not at the beginning.
2649 // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
2650 // Check whether we should remove this check.
2651 int ePos = str.Find(L'.', aPos + 1);
2652 if (ePos == -1 || ePos == aPos + 1) {
2653 return FALSE;
2654 }
2655 // Validate all other chars in domain name.
2656 int nLen = str.GetLength();
2657 pPos = 0; // Used to track the position of '.'.
2658 for (int i = aPos + 1; i < nLen; i++) {
2659 FX_WCHAR wch = str.GetAt(i);
2660 if (wch == L'-' || FXSYS_iswalnum(wch)) {
2661 continue;
2662 }
2663 if (wch != L'.' || i == pPos + 1) {
2664 // Domain name should end before invalid char.
2665 int host_end = i == pPos + 1 ? i - 2 : i - 1;
2666 if (pPos > 0 && host_end - aPos >= 3) {
2667 // Trim the ending invalid chars if there is at least one '.' and name.
2668 str = str.Left(host_end + 1);
2669 break;
2670 }
2671 return FALSE;
2672 }
2673 pPos = i;
2674 }
2675
2676 if (str.Find(L"mailto:") == -1) {
2677 str = L"mailto:" + str;
2678 }
2679 return TRUE;
2680 }
2681
AppendToLinkList(int start,int count,const CFX_WideString & strUrl)2682 void CPDF_LinkExtract::AppendToLinkList(int start,
2683 int count,
2684 const CFX_WideString& strUrl) {
2685 CPDF_LinkExt* linkInfo = new CPDF_LinkExt;
2686 linkInfo->m_strUrl = strUrl;
2687 linkInfo->m_Start = start;
2688 linkInfo->m_Count = count;
2689 m_LinkList.Add(linkInfo);
2690 }
2691
GetURL(int index) const2692 CFX_WideString CPDF_LinkExtract::GetURL(int index) const {
2693 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {
2694 return L"";
2695 }
2696 CPDF_LinkExt* link = NULL;
2697 link = m_LinkList.GetAt(index);
2698 if (!link) {
2699 return L"";
2700 }
2701 return link->m_strUrl;
2702 }
GetBoundedSegment(int index,int & start,int & count) const2703 void CPDF_LinkExtract::GetBoundedSegment(int index,
2704 int& start,
2705 int& count) const {
2706 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {
2707 return;
2708 }
2709 CPDF_LinkExt* link = NULL;
2710 link = m_LinkList.GetAt(index);
2711 if (!link) {
2712 return;
2713 }
2714 start = link->m_Start;
2715 count = link->m_Count;
2716 }
GetRects(int index,CFX_RectArray & rects) const2717 void CPDF_LinkExtract::GetRects(int index, CFX_RectArray& rects) const {
2718 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {
2719 return;
2720 }
2721 CPDF_LinkExt* link = NULL;
2722 link = m_LinkList.GetAt(index);
2723 if (!link) {
2724 return;
2725 }
2726 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects);
2727 }
2728