1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdftext/cpdf_textpage.h"
8 
9 #include <algorithm>
10 #include <utility>
11 #include <vector>
12 
13 #include "core/fpdfapi/font/cpdf_font.h"
14 #include "core/fpdfapi/page/cpdf_form.h"
15 #include "core/fpdfapi/page/cpdf_formobject.h"
16 #include "core/fpdfapi/page/cpdf_page.h"
17 #include "core/fpdfapi/page/cpdf_pageobject.h"
18 #include "core/fpdfapi/page/cpdf_textobject.h"
19 #include "core/fpdfapi/parser/cpdf_dictionary.h"
20 #include "core/fpdfapi/parser/cpdf_string.h"
21 #include "core/fpdftext/unicodenormalizationdata.h"
22 #include "core/fxcrt/fx_bidi.h"
23 #include "core/fxcrt/fx_ext.h"
24 #include "core/fxcrt/fx_ucd.h"
25 #include "third_party/base/stl_util.h"
26 
27 namespace {
28 
// Font size reported by CPDF_TextPage::GetCharInfo() when a character has no
// text object (or the text object has no font) to take the size from.
const FX_FLOAT kDefaultFontSize = 1.0f;
// Expansion tables consulted by Unicode_GetNormalization(). The slot is
// selected by a small integer derived from the g_UnicodeData_Normalization
// entry of the character being normalized; slot 0 holds nullptr.
const uint16_t* const g_UnicodeData_Normalization_Maps[5] = {
    nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2,
    g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4};
33 
NormalizeThreshold(FX_FLOAT threshold)34 FX_FLOAT NormalizeThreshold(FX_FLOAT threshold) {
35   if (threshold < 300)
36     return threshold / 2.0f;
37   if (threshold < 500)
38     return threshold / 4.0f;
39   if (threshold < 700)
40     return threshold / 5.0f;
41   return threshold / 6.0f;
42 }
43 
CalculateBaseSpace(const CPDF_TextObject * pTextObj,const CFX_Matrix & matrix)44 FX_FLOAT CalculateBaseSpace(const CPDF_TextObject* pTextObj,
45                             const CFX_Matrix& matrix) {
46   FX_FLOAT baseSpace = 0.0;
47   const int nItems = pTextObj->CountItems();
48   if (pTextObj->m_TextState.GetCharSpace() && nItems >= 3) {
49     bool bAllChar = true;
50     FX_FLOAT spacing =
51         matrix.TransformDistance(pTextObj->m_TextState.GetCharSpace());
52     baseSpace = spacing;
53     for (int i = 0; i < nItems; i++) {
54       CPDF_TextObjectItem item;
55       pTextObj->GetItemInfo(i, &item);
56       if (item.m_CharCode == static_cast<uint32_t>(-1)) {
57         FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
58         FX_FLOAT kerning = -fontsize_h * item.m_Origin.x / 1000;
59         baseSpace = std::min(baseSpace, kerning + spacing);
60         bAllChar = false;
61       }
62     }
63     if (baseSpace < 0.0 || (nItems == 3 && !bAllChar))
64       baseSpace = 0.0;
65   }
66   return baseSpace;
67 }
68 
Unicode_GetNormalization(FX_WCHAR wch,FX_WCHAR * pDst)69 FX_STRSIZE Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst) {
70   wch = wch & 0xFFFF;
71   FX_WCHAR wFind = g_UnicodeData_Normalization[wch];
72   if (!wFind) {
73     if (pDst)
74       *pDst = wch;
75     return 1;
76   }
77   if (wFind >= 0x8000) {
78     wch = wFind - 0x8000;
79     wFind = 1;
80   } else {
81     wch = wFind & 0x0FFF;
82     wFind >>= 12;
83   }
84   const uint16_t* pMap = g_UnicodeData_Normalization_Maps[wFind];
85   if (pMap == g_UnicodeData_Normalization_Map4) {
86     pMap = g_UnicodeData_Normalization_Map4 + wch;
87     wFind = (FX_WCHAR)(*pMap++);
88   } else {
89     pMap += wch;
90   }
91   if (pDst) {
92     FX_WCHAR n = wFind;
93     while (n--)
94       *pDst++ = *pMap++;
95   }
96   return (FX_STRSIZE)wFind;
97 }
98 
// Returns the fraction (0..1) of entries in |mask| that are set within the
// half-open index range [start, end). An empty or inverted range yields 0.
// The caller is responsible for keeping the range inside |mask|.
float MaskPercentFilled(const std::vector<bool>& mask,
                        int32_t start,
                        int32_t end) {
  if (start >= end)
    return 0;

  int32_t nFilled = 0;
  for (int32_t i = start; i < end; ++i) {
    if (mask[i])
      ++nFilled;
  }
  return static_cast<float>(nFilled) / (end - start);
}
108 
109 }  // namespace
110 
FPDF_CHAR_INFO()111 FPDF_CHAR_INFO::FPDF_CHAR_INFO()
112     : m_Unicode(0),
113       m_Charcode(0),
114       m_Flag(0),
115       m_FontSize(0),
116       m_pTextObj(nullptr) {}
117 
~FPDF_CHAR_INFO()118 FPDF_CHAR_INFO::~FPDF_CHAR_INFO() {}
119 
PAGECHAR_INFO()120 PAGECHAR_INFO::PAGECHAR_INFO()
121     : m_Index(0), m_CharCode(0), m_Unicode(0), m_Flag(0), m_pTextObj(nullptr) {}
122 
123 PAGECHAR_INFO::PAGECHAR_INFO(const PAGECHAR_INFO&) = default;
124 
~PAGECHAR_INFO()125 PAGECHAR_INFO::~PAGECHAR_INFO() {}
126 
CPDF_TextPage(const CPDF_Page * pPage,FPDFText_Direction flags)127 CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, FPDFText_Direction flags)
128     : m_pPage(pPage),
129       m_parserflag(flags),
130       m_pPreTextObj(nullptr),
131       m_bIsParsed(false),
132       m_TextlineDir(TextOrientation::Unknown) {
133   m_TextBuf.EstimateSize(0, 10240);
134   m_DisplayMatrix =
135       pPage->GetDisplayMatrix(0, 0, static_cast<int>(pPage->GetPageWidth()),
136                               static_cast<int>(pPage->GetPageHeight()), 0);
137 }
138 
~CPDF_TextPage()139 CPDF_TextPage::~CPDF_TextPage() {}
140 
IsControlChar(const PAGECHAR_INFO & charInfo)141 bool CPDF_TextPage::IsControlChar(const PAGECHAR_INFO& charInfo) {
142   switch (charInfo.m_Unicode) {
143     case 0x2:
144     case 0x3:
145     case 0x93:
146     case 0x94:
147     case 0x96:
148     case 0x97:
149     case 0x98:
150     case 0xfffe:
151       return charInfo.m_Flag != FPDFTEXT_CHAR_HYPHEN;
152     default:
153       return false;
154   }
155 }
156 
ParseTextPage()157 void CPDF_TextPage::ParseTextPage() {
158   m_bIsParsed = false;
159   m_TextBuf.Clear();
160   m_CharList.clear();
161   m_pPreTextObj = nullptr;
162   ProcessObject();
163 
164   m_bIsParsed = true;
165   m_CharIndex.clear();
166   int nCount = pdfium::CollectionSize<int>(m_CharList);
167   if (nCount)
168     m_CharIndex.push_back(0);
169 
170   for (int i = 0; i < nCount; i++) {
171     int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
172     const PAGECHAR_INFO& charinfo = m_CharList[i];
173     if (charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED ||
174         (charinfo.m_Unicode != 0 && !IsControlChar(charinfo))) {
175       if (indexSize % 2) {
176         m_CharIndex.push_back(1);
177       } else {
178         if (indexSize <= 0)
179           continue;
180         m_CharIndex[indexSize - 1] += 1;
181       }
182     } else {
183       if (indexSize % 2) {
184         if (indexSize <= 0)
185           continue;
186         m_CharIndex[indexSize - 1] = i + 1;
187       } else {
188         m_CharIndex.push_back(i + 1);
189       }
190     }
191   }
192   int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
193   if (indexSize % 2)
194     m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1);
195 }
196 
CountChars() const197 int CPDF_TextPage::CountChars() const {
198   return pdfium::CollectionSize<int>(m_CharList);
199 }
200 
CharIndexFromTextIndex(int TextIndex) const201 int CPDF_TextPage::CharIndexFromTextIndex(int TextIndex) const {
202   int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
203   int count = 0;
204   for (int i = 0; i < indexSize; i += 2) {
205     count += m_CharIndex[i + 1];
206     if (count > TextIndex)
207       return TextIndex - count + m_CharIndex[i + 1] + m_CharIndex[i];
208   }
209   return -1;
210 }
211 
TextIndexFromCharIndex(int CharIndex) const212 int CPDF_TextPage::TextIndexFromCharIndex(int CharIndex) const {
213   int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
214   int count = 0;
215   for (int i = 0; i < indexSize; i += 2) {
216     count += m_CharIndex[i + 1];
217     if (m_CharIndex[i + 1] + m_CharIndex[i] > CharIndex) {
218       if (CharIndex - m_CharIndex[i] < 0)
219         return -1;
220 
221       return CharIndex - m_CharIndex[i] + count - m_CharIndex[i + 1];
222     }
223   }
224   return -1;
225 }
226 
GetRectArray(int start,int nCount) const227 std::vector<CFX_FloatRect> CPDF_TextPage::GetRectArray(int start,
228                                                        int nCount) const {
229   if (start < 0 || nCount == 0 || !m_bIsParsed)
230     return std::vector<CFX_FloatRect>();
231 
232   if (nCount + start > pdfium::CollectionSize<int>(m_CharList) ||
233       nCount == -1) {
234     nCount = pdfium::CollectionSize<int>(m_CharList) - start;
235   }
236 
237   std::vector<CFX_FloatRect> rectArray;
238   CPDF_TextObject* pCurObj = nullptr;
239   CFX_FloatRect rect;
240   int curPos = start;
241   bool bFlagNewRect = true;
242   while (nCount--) {
243     PAGECHAR_INFO info_curchar = m_CharList[curPos++];
244     if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED)
245       continue;
246     if (info_curchar.m_CharBox.Width() < 0.01 ||
247         info_curchar.m_CharBox.Height() < 0.01) {
248       continue;
249     }
250     if (!pCurObj)
251       pCurObj = info_curchar.m_pTextObj;
252     if (pCurObj != info_curchar.m_pTextObj) {
253       rectArray.push_back(rect);
254       pCurObj = info_curchar.m_pTextObj;
255       bFlagNewRect = true;
256     }
257     if (bFlagNewRect) {
258       CFX_Matrix matrix = info_curchar.m_pTextObj->GetTextMatrix();
259       matrix.Concat(info_curchar.m_Matrix);
260 
261       CFX_Matrix matrix_reverse;
262       matrix_reverse.SetReverse(matrix);
263 
264       CFX_PointF origin = matrix_reverse.Transform(info_curchar.m_Origin);
265       rect.left = info_curchar.m_CharBox.left;
266       rect.right = info_curchar.m_CharBox.right;
267       if (pCurObj->GetFont()->GetTypeDescent()) {
268         rect.bottom = origin.y +
269                       pCurObj->GetFont()->GetTypeDescent() *
270                           pCurObj->GetFontSize() / 1000;
271 
272         rect.bottom = matrix.Transform(CFX_PointF(origin.x, rect.bottom)).y;
273       } else {
274         rect.bottom = info_curchar.m_CharBox.bottom;
275       }
276       if (pCurObj->GetFont()->GetTypeAscent()) {
277         rect.top =
278             origin.y +
279             pCurObj->GetFont()->GetTypeAscent() * pCurObj->GetFontSize() / 1000;
280         FX_FLOAT xPosTemp =
281             origin.x +
282             GetCharWidth(info_curchar.m_CharCode, pCurObj->GetFont()) *
283                 pCurObj->GetFontSize() / 1000;
284         rect.top = matrix.Transform(CFX_PointF(xPosTemp, rect.top)).y;
285       } else {
286         rect.top = info_curchar.m_CharBox.top;
287       }
288       bFlagNewRect = false;
289       rect = info_curchar.m_CharBox;
290       rect.Normalize();
291     } else {
292       info_curchar.m_CharBox.Normalize();
293       rect.left = std::min(rect.left, info_curchar.m_CharBox.left);
294       rect.right = std::max(rect.right, info_curchar.m_CharBox.right);
295       rect.top = std::max(rect.top, info_curchar.m_CharBox.top);
296       rect.bottom = std::min(rect.bottom, info_curchar.m_CharBox.bottom);
297     }
298   }
299   rectArray.push_back(rect);
300   return rectArray;
301 }
302 
GetIndexAtPos(const CFX_PointF & point,const CFX_SizeF & tolerance) const303 int CPDF_TextPage::GetIndexAtPos(const CFX_PointF& point,
304                                  const CFX_SizeF& tolerance) const {
305   if (!m_bIsParsed)
306     return -3;
307 
308   int pos = 0;
309   int NearPos = -1;
310   double xdif = 5000;
311   double ydif = 5000;
312   while (pos < pdfium::CollectionSize<int>(m_CharList)) {
313     PAGECHAR_INFO charinfo = m_CharList[pos];
314     CFX_FloatRect charrect = charinfo.m_CharBox;
315     if (charrect.Contains(point))
316       break;
317     if (tolerance.width > 0 || tolerance.height > 0) {
318       CFX_FloatRect charRectExt;
319       charrect.Normalize();
320       charRectExt.left = charrect.left - tolerance.width / 2;
321       charRectExt.right = charrect.right + tolerance.width / 2;
322       charRectExt.top = charrect.top + tolerance.height / 2;
323       charRectExt.bottom = charrect.bottom - tolerance.height / 2;
324       if (charRectExt.Contains(point)) {
325         double curXdif, curYdif;
326         curXdif = FXSYS_fabs(point.x - charrect.left) <
327                           FXSYS_fabs(point.x - charrect.right)
328                       ? FXSYS_fabs(point.x - charrect.left)
329                       : FXSYS_fabs(point.x - charrect.right);
330         curYdif = FXSYS_fabs(point.y - charrect.bottom) <
331                           FXSYS_fabs(point.y - charrect.top)
332                       ? FXSYS_fabs(point.y - charrect.bottom)
333                       : FXSYS_fabs(point.y - charrect.top);
334         if (curYdif + curXdif < xdif + ydif) {
335           ydif = curYdif;
336           xdif = curXdif;
337           NearPos = pos;
338         }
339       }
340     }
341     ++pos;
342   }
343   return pos < pdfium::CollectionSize<int>(m_CharList) ? pos : NearPos;
344 }
345 
GetTextByRect(const CFX_FloatRect & rect) const346 CFX_WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
347   if (!m_bIsParsed)
348     return CFX_WideString();
349 
350   FX_FLOAT posy = 0;
351   bool IsContainPreChar = false;
352   bool IsAddLineFeed = false;
353   CFX_WideString strText;
354   for (const auto& charinfo : m_CharList) {
355     if (IsRectIntersect(rect, charinfo.m_CharBox)) {
356       if (FXSYS_fabs(posy - charinfo.m_Origin.y) > 0 && !IsContainPreChar &&
357           IsAddLineFeed) {
358         posy = charinfo.m_Origin.y;
359         if (!strText.IsEmpty())
360           strText += L"\r\n";
361       }
362       IsContainPreChar = true;
363       IsAddLineFeed = false;
364       if (charinfo.m_Unicode)
365         strText += charinfo.m_Unicode;
366     } else if (charinfo.m_Unicode == 32) {
367       if (IsContainPreChar && charinfo.m_Unicode) {
368         strText += charinfo.m_Unicode;
369         IsContainPreChar = false;
370         IsAddLineFeed = false;
371       }
372     } else {
373       IsContainPreChar = false;
374       IsAddLineFeed = true;
375     }
376   }
377   return strText;
378 }
379 
GetCharInfo(int index,FPDF_CHAR_INFO * info) const380 void CPDF_TextPage::GetCharInfo(int index, FPDF_CHAR_INFO* info) const {
381   if (!m_bIsParsed)
382     return;
383 
384   if (index < 0 || index >= pdfium::CollectionSize<int>(m_CharList))
385     return;
386 
387   const PAGECHAR_INFO& charinfo = m_CharList[index];
388   info->m_Charcode = charinfo.m_CharCode;
389   info->m_Origin = charinfo.m_Origin;
390   info->m_Unicode = charinfo.m_Unicode;
391   info->m_Flag = charinfo.m_Flag;
392   info->m_CharBox = charinfo.m_CharBox;
393   info->m_pTextObj = charinfo.m_pTextObj;
394   if (charinfo.m_pTextObj && charinfo.m_pTextObj->GetFont())
395     info->m_FontSize = charinfo.m_pTextObj->GetFontSize();
396   else
397     info->m_FontSize = kDefaultFontSize;
398   info->m_Matrix = charinfo.m_Matrix;
399 }
400 
CheckMarkedContentObject(int32_t & start,int32_t & nCount) const401 void CPDF_TextPage::CheckMarkedContentObject(int32_t& start,
402                                              int32_t& nCount) const {
403   PAGECHAR_INFO charinfo = m_CharList[start];
404   PAGECHAR_INFO charinfo2 = m_CharList[start + nCount - 1];
405   if (FPDFTEXT_CHAR_PIECE != charinfo.m_Flag &&
406       FPDFTEXT_CHAR_PIECE != charinfo2.m_Flag) {
407     return;
408   }
409   if (FPDFTEXT_CHAR_PIECE == charinfo.m_Flag) {
410     PAGECHAR_INFO charinfo1 = charinfo;
411     int startIndex = start;
412     while (FPDFTEXT_CHAR_PIECE == charinfo1.m_Flag &&
413            charinfo1.m_Index == charinfo.m_Index) {
414       startIndex--;
415       if (startIndex < 0)
416         break;
417       charinfo1 = m_CharList[startIndex];
418     }
419     startIndex++;
420     start = startIndex;
421   }
422   if (FPDFTEXT_CHAR_PIECE == charinfo2.m_Flag) {
423     PAGECHAR_INFO charinfo3 = charinfo2;
424     int endIndex = start + nCount - 1;
425     while (FPDFTEXT_CHAR_PIECE == charinfo3.m_Flag &&
426            charinfo3.m_Index == charinfo2.m_Index) {
427       endIndex++;
428       if (endIndex >= pdfium::CollectionSize<int>(m_CharList))
429         break;
430       charinfo3 = m_CharList[endIndex];
431     }
432     endIndex--;
433     nCount = endIndex - start + 1;
434   }
435 }
436 
GetPageText(int start,int nCount) const437 CFX_WideString CPDF_TextPage::GetPageText(int start, int nCount) const {
438   if (!m_bIsParsed || nCount == 0)
439     return L"";
440 
441   if (start < 0)
442     start = 0;
443 
444   if (nCount == -1) {
445     nCount = pdfium::CollectionSize<int>(m_CharList) - start;
446     return CFX_WideString(
447         m_TextBuf.AsStringC().Mid(start, m_TextBuf.AsStringC().GetLength()));
448   }
449   if (nCount <= 0 || m_CharList.empty())
450     return L"";
451   if (nCount + start > pdfium::CollectionSize<int>(m_CharList) - 1)
452     nCount = pdfium::CollectionSize<int>(m_CharList) - start;
453   if (nCount <= 0)
454     return L"";
455   CheckMarkedContentObject(start, nCount);
456   int startindex = 0;
457   PAGECHAR_INFO charinfo = m_CharList[start];
458   int startOffset = 0;
459   while (charinfo.m_Index == -1) {
460     startOffset++;
461     if (startOffset > nCount ||
462         start + startOffset >= pdfium::CollectionSize<int>(m_CharList)) {
463       return L"";
464     }
465     charinfo = m_CharList[start + startOffset];
466   }
467   startindex = charinfo.m_Index;
468   charinfo = m_CharList[start + nCount - 1];
469   int nCountOffset = 0;
470   while (charinfo.m_Index == -1) {
471     nCountOffset++;
472     if (nCountOffset >= nCount)
473       return L"";
474     charinfo = m_CharList[start + nCount - nCountOffset - 1];
475   }
476   nCount = start + nCount - nCountOffset - startindex;
477   if (nCount <= 0)
478     return L"";
479   return CFX_WideString(m_TextBuf.AsStringC().Mid(startindex, nCount));
480 }
481 
CountRects(int start,int nCount)482 int CPDF_TextPage::CountRects(int start, int nCount) {
483   if (!m_bIsParsed || start < 0)
484     return -1;
485 
486   if (nCount == -1 ||
487       nCount + start > pdfium::CollectionSize<int>(m_CharList)) {
488     nCount = pdfium::CollectionSize<int>(m_CharList) - start;
489   }
490   m_SelRects = GetRectArray(start, nCount);
491   return pdfium::CollectionSize<int>(m_SelRects);
492 }
493 
GetRect(int rectIndex,FX_FLOAT & left,FX_FLOAT & top,FX_FLOAT & right,FX_FLOAT & bottom) const494 void CPDF_TextPage::GetRect(int rectIndex,
495                             FX_FLOAT& left,
496                             FX_FLOAT& top,
497                             FX_FLOAT& right,
498                             FX_FLOAT& bottom) const {
499   if (!m_bIsParsed)
500     return;
501 
502   if (rectIndex < 0 || rectIndex >= pdfium::CollectionSize<int>(m_SelRects))
503     return;
504 
505   left = m_SelRects[rectIndex].left;
506   top = m_SelRects[rectIndex].top;
507   right = m_SelRects[rectIndex].right;
508   bottom = m_SelRects[rectIndex].bottom;
509 }
510 
FindTextlineFlowOrientation() const511 CPDF_TextPage::TextOrientation CPDF_TextPage::FindTextlineFlowOrientation()
512     const {
513   if (m_pPage->GetPageObjectList()->empty())
514     return TextOrientation::Unknown;
515 
516   const int32_t nPageWidth = static_cast<int32_t>(m_pPage->GetPageWidth());
517   const int32_t nPageHeight = static_cast<int32_t>(m_pPage->GetPageHeight());
518   if (nPageWidth <= 0 || nPageHeight <= 0)
519     return TextOrientation::Unknown;
520 
521   std::vector<bool> nHorizontalMask(nPageWidth);
522   std::vector<bool> nVerticalMask(nPageHeight);
523   FX_FLOAT fLineHeight = 0.0f;
524   int32_t nStartH = nPageWidth;
525   int32_t nEndH = 0;
526   int32_t nStartV = nPageHeight;
527   int32_t nEndV = 0;
528   for (const auto& pPageObj : *m_pPage->GetPageObjectList()) {
529     if (!pPageObj->IsText())
530       continue;
531 
532     int32_t minH = std::max(static_cast<int32_t>(pPageObj->m_Left), 0);
533     int32_t maxH =
534         std::min(static_cast<int32_t>(pPageObj->m_Right), nPageWidth);
535     int32_t minV = std::max(static_cast<int32_t>(pPageObj->m_Bottom), 0);
536     int32_t maxV = std::min(static_cast<int32_t>(pPageObj->m_Top), nPageHeight);
537     if (minH >= maxH || minV >= maxV)
538       continue;
539 
540     for (int32_t i = minH; i < maxH; ++i)
541       nHorizontalMask[i] = true;
542     for (int32_t i = minV; i < maxV; ++i)
543       nVerticalMask[i] = true;
544 
545     nStartH = std::min(nStartH, minH);
546     nEndH = std::max(nEndH, maxH);
547     nStartV = std::min(nStartV, minV);
548     nEndV = std::max(nEndV, maxV);
549 
550     if (fLineHeight <= 0.0f)
551       fLineHeight = pPageObj->m_Top - pPageObj->m_Bottom;
552   }
553   const int32_t nDoubleLineHeight = 2 * fLineHeight;
554   if ((nEndV - nStartV) < nDoubleLineHeight)
555     return TextOrientation::Horizontal;
556   if ((nEndH - nStartH) < nDoubleLineHeight)
557     return TextOrientation::Vertical;
558 
559   const FX_FLOAT nSumH = MaskPercentFilled(nHorizontalMask, nStartH, nEndH);
560   if (nSumH > 0.8f)
561     return TextOrientation::Horizontal;
562 
563   const FX_FLOAT nSumV = MaskPercentFilled(nVerticalMask, nStartV, nEndV);
564   if (nSumH > nSumV)
565     return TextOrientation::Horizontal;
566   if (nSumH < nSumV)
567     return TextOrientation::Vertical;
568   return TextOrientation::Unknown;
569 }
570 
AppendGeneratedCharacter(FX_WCHAR unicode,const CFX_Matrix & formMatrix)571 void CPDF_TextPage::AppendGeneratedCharacter(FX_WCHAR unicode,
572                                              const CFX_Matrix& formMatrix) {
573   PAGECHAR_INFO generateChar;
574   if (!GenerateCharInfo(unicode, generateChar))
575     return;
576 
577   m_TextBuf.AppendChar(unicode);
578   if (!formMatrix.IsIdentity())
579     generateChar.m_Matrix = formMatrix;
580   m_CharList.push_back(generateChar);
581 }
582 
ProcessObject()583 void CPDF_TextPage::ProcessObject() {
584   if (m_pPage->GetPageObjectList()->empty())
585     return;
586 
587   m_TextlineDir = FindTextlineFlowOrientation();
588   const CPDF_PageObjectList* pObjList = m_pPage->GetPageObjectList();
589   for (auto it = pObjList->begin(); it != pObjList->end(); ++it) {
590     if (CPDF_PageObject* pObj = it->get()) {
591       if (pObj->IsText()) {
592         CFX_Matrix matrix;
593         ProcessTextObject(pObj->AsText(), matrix, pObjList, it);
594       } else if (pObj->IsForm()) {
595         CFX_Matrix formMatrix(1, 0, 0, 1, 0, 0);
596         ProcessFormObject(pObj->AsForm(), formMatrix);
597       }
598     }
599   }
600   for (const auto& obj : m_LineObj)
601     ProcessTextObject(obj);
602 
603   m_LineObj.clear();
604   CloseTempLine();
605 }
606 
ProcessFormObject(CPDF_FormObject * pFormObj,const CFX_Matrix & formMatrix)607 void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj,
608                                       const CFX_Matrix& formMatrix) {
609   CPDF_PageObjectList* pObjectList = pFormObj->m_pForm->GetPageObjectList();
610   if (pObjectList->empty())
611     return;
612 
613   CFX_Matrix curFormMatrix;
614   curFormMatrix = pFormObj->m_FormMatrix;
615   curFormMatrix.Concat(formMatrix);
616 
617   for (auto it = pObjectList->begin(); it != pObjectList->end(); ++it) {
618     if (CPDF_PageObject* pPageObj = it->get()) {
619       if (pPageObj->IsText())
620         ProcessTextObject(pPageObj->AsText(), curFormMatrix, pObjectList, it);
621       else if (pPageObj->IsForm())
622         ProcessFormObject(pPageObj->AsForm(), curFormMatrix);
623     }
624   }
625 }
626 
GetCharWidth(uint32_t charCode,CPDF_Font * pFont) const627 int CPDF_TextPage::GetCharWidth(uint32_t charCode, CPDF_Font* pFont) const {
628   if (charCode == CPDF_Font::kInvalidCharCode)
629     return 0;
630 
631   if (int w = pFont->GetCharWidthF(charCode))
632     return w;
633 
634   CFX_ByteString str;
635   pFont->AppendChar(str, charCode);
636   if (int w = pFont->GetStringWidth(str.c_str(), 1))
637     return w;
638 
639   return pFont->GetCharBBox(charCode).Width();
640 }
641 
AddCharInfoByLRDirection(FX_WCHAR wChar,PAGECHAR_INFO info)642 void CPDF_TextPage::AddCharInfoByLRDirection(FX_WCHAR wChar,
643                                              PAGECHAR_INFO info) {
644   if (IsControlChar(info)) {
645     info.m_Index = -1;
646     m_CharList.push_back(info);
647     return;
648   }
649 
650   info.m_Index = m_TextBuf.GetLength();
651   if (wChar >= 0xFB00 && wChar <= 0xFB06) {
652     FX_WCHAR* pDst = nullptr;
653     FX_STRSIZE nCount = Unicode_GetNormalization(wChar, pDst);
654     if (nCount >= 1) {
655       pDst = FX_Alloc(FX_WCHAR, nCount);
656       Unicode_GetNormalization(wChar, pDst);
657       for (int nIndex = 0; nIndex < nCount; nIndex++) {
658         PAGECHAR_INFO info2 = info;
659         info2.m_Unicode = pDst[nIndex];
660         info2.m_Flag = FPDFTEXT_CHAR_PIECE;
661         m_TextBuf.AppendChar(info2.m_Unicode);
662         m_CharList.push_back(info2);
663       }
664       FX_Free(pDst);
665       return;
666     }
667   }
668   m_TextBuf.AppendChar(wChar);
669   m_CharList.push_back(info);
670 }
671 
AddCharInfoByRLDirection(FX_WCHAR wChar,PAGECHAR_INFO info)672 void CPDF_TextPage::AddCharInfoByRLDirection(FX_WCHAR wChar,
673                                              PAGECHAR_INFO info) {
674   if (IsControlChar(info)) {
675     info.m_Index = -1;
676     m_CharList.push_back(info);
677     return;
678   }
679 
680   info.m_Index = m_TextBuf.GetLength();
681   wChar = FX_GetMirrorChar(wChar, true, false);
682   FX_WCHAR* pDst = nullptr;
683   FX_STRSIZE nCount = Unicode_GetNormalization(wChar, pDst);
684   if (nCount >= 1) {
685     pDst = FX_Alloc(FX_WCHAR, nCount);
686     Unicode_GetNormalization(wChar, pDst);
687     for (int nIndex = 0; nIndex < nCount; nIndex++) {
688       PAGECHAR_INFO info2 = info;
689       info2.m_Unicode = pDst[nIndex];
690       info2.m_Flag = FPDFTEXT_CHAR_PIECE;
691       m_TextBuf.AppendChar(info2.m_Unicode);
692       m_CharList.push_back(info2);
693     }
694     FX_Free(pDst);
695     return;
696   }
697   info.m_Unicode = wChar;
698   m_TextBuf.AppendChar(info.m_Unicode);
699   m_CharList.push_back(info);
700 }
701 
CloseTempLine()702 void CPDF_TextPage::CloseTempLine() {
703   if (m_TempCharList.empty())
704     return;
705 
706   CFX_WideString str = m_TempTextBuf.MakeString();
707   bool bPrevSpace = false;
708   for (int i = 0; i < str.GetLength(); i++) {
709     if (str.GetAt(i) != ' ') {
710       bPrevSpace = false;
711       continue;
712     }
713     if (bPrevSpace) {
714       m_TempTextBuf.Delete(i, 1);
715       m_TempCharList.erase(m_TempCharList.begin() + i);
716       str.Delete(i);
717       i--;
718     }
719     bPrevSpace = true;
720   }
721   CFX_BidiString bidi(str);
722   if (m_parserflag == FPDFText_Direction::Right)
723     bidi.SetOverallDirectionRight();
724   CFX_BidiChar::Direction eCurrentDirection = bidi.OverallDirection();
725   for (const auto& segment : bidi) {
726     if (segment.direction == CFX_BidiChar::RIGHT ||
727         (segment.direction == CFX_BidiChar::NEUTRAL &&
728          eCurrentDirection == CFX_BidiChar::RIGHT)) {
729       eCurrentDirection = CFX_BidiChar::RIGHT;
730       for (int m = segment.start + segment.count; m > segment.start; --m)
731         AddCharInfoByRLDirection(bidi.CharAt(m - 1), m_TempCharList[m - 1]);
732     } else {
733       eCurrentDirection = CFX_BidiChar::LEFT;
734       for (int m = segment.start; m < segment.start + segment.count; m++)
735         AddCharInfoByLRDirection(bidi.CharAt(m), m_TempCharList[m]);
736     }
737   }
738   m_TempCharList.clear();
739   m_TempTextBuf.Delete(0, m_TempTextBuf.GetLength());
740 }
741 
ProcessTextObject(CPDF_TextObject * pTextObj,const CFX_Matrix & formMatrix,const CPDF_PageObjectList * pObjList,CPDF_PageObjectList::const_iterator ObjPos)742 void CPDF_TextPage::ProcessTextObject(
743     CPDF_TextObject* pTextObj,
744     const CFX_Matrix& formMatrix,
745     const CPDF_PageObjectList* pObjList,
746     CPDF_PageObjectList::const_iterator ObjPos) {
747   if (FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f)
748     return;
749 
750   size_t count = m_LineObj.size();
751   PDFTEXT_Obj Obj;
752   Obj.m_pTextObj = pTextObj;
753   Obj.m_formMatrix = formMatrix;
754   if (count == 0) {
755     m_LineObj.push_back(Obj);
756     return;
757   }
758   if (IsSameAsPreTextObject(pTextObj, pObjList, ObjPos))
759     return;
760 
761   PDFTEXT_Obj prev_Obj = m_LineObj[count - 1];
762   CPDF_TextObjectItem item;
763   int nItem = prev_Obj.m_pTextObj->CountItems();
764   prev_Obj.m_pTextObj->GetItemInfo(nItem - 1, &item);
765   FX_FLOAT prev_width =
766       GetCharWidth(item.m_CharCode, prev_Obj.m_pTextObj->GetFont()) *
767       prev_Obj.m_pTextObj->GetFontSize() / 1000;
768 
769   CFX_Matrix prev_matrix = prev_Obj.m_pTextObj->GetTextMatrix();
770   prev_width = FXSYS_fabs(prev_width);
771   prev_matrix.Concat(prev_Obj.m_formMatrix);
772   prev_width = prev_matrix.TransformDistance(prev_width);
773   pTextObj->GetItemInfo(0, &item);
774   FX_FLOAT this_width = GetCharWidth(item.m_CharCode, pTextObj->GetFont()) *
775                         pTextObj->GetFontSize() / 1000;
776   this_width = FXSYS_fabs(this_width);
777 
778   CFX_Matrix this_matrix = pTextObj->GetTextMatrix();
779   this_width = FXSYS_fabs(this_width);
780   this_matrix.Concat(formMatrix);
781   this_width = this_matrix.TransformDistance(this_width);
782 
783   FX_FLOAT threshold =
784       prev_width > this_width ? prev_width / 4 : this_width / 4;
785   CFX_PointF prev_pos = m_DisplayMatrix.Transform(
786       prev_Obj.m_formMatrix.Transform(prev_Obj.m_pTextObj->GetPos()));
787   CFX_PointF this_pos =
788       m_DisplayMatrix.Transform(formMatrix.Transform(pTextObj->GetPos()));
789   if (FXSYS_fabs(this_pos.y - prev_pos.y) > threshold * 2) {
790     for (size_t i = 0; i < count; i++)
791       ProcessTextObject(m_LineObj[i]);
792     m_LineObj.clear();
793     m_LineObj.push_back(Obj);
794     return;
795   }
796 
797   for (size_t i = count; i > 0; --i) {
798     PDFTEXT_Obj prev_text_obj = m_LineObj[i - 1];
799     CFX_PointF new_prev_pos =
800         m_DisplayMatrix.Transform(prev_text_obj.m_formMatrix.Transform(
801             prev_text_obj.m_pTextObj->GetPos()));
802     if (this_pos.x >= new_prev_pos.x) {
803       m_LineObj.insert(m_LineObj.begin() + i, Obj);
804       return;
805     }
806   }
807   m_LineObj.insert(m_LineObj.begin(), Obj);
808 }
809 
PreMarkedContent(PDFTEXT_Obj Obj)810 FPDFText_MarkedContent CPDF_TextPage::PreMarkedContent(PDFTEXT_Obj Obj) {
811   CPDF_TextObject* pTextObj = Obj.m_pTextObj;
812   if (!pTextObj->m_ContentMark)
813     return FPDFText_MarkedContent::Pass;
814 
815   int nContentMark = pTextObj->m_ContentMark.CountItems();
816   if (nContentMark < 1)
817     return FPDFText_MarkedContent::Pass;
818 
819   CFX_WideString actText;
820   bool bExist = false;
821   CPDF_Dictionary* pDict = nullptr;
822   int n = 0;
823   for (n = 0; n < nContentMark; n++) {
824     const CPDF_ContentMarkItem& item = pTextObj->m_ContentMark.GetItem(n);
825     pDict = item.GetParam();
826     if (!pDict)
827       continue;
828     CPDF_String* temp = ToString(pDict->GetObjectFor("ActualText"));
829     if (temp) {
830       bExist = true;
831       actText = temp->GetUnicodeText();
832     }
833   }
834   if (!bExist)
835     return FPDFText_MarkedContent::Pass;
836 
837   if (m_pPreTextObj && m_pPreTextObj->m_ContentMark &&
838       m_pPreTextObj->m_ContentMark.CountItems() == n &&
839       pDict == m_pPreTextObj->m_ContentMark.GetItem(n - 1).GetParam()) {
840     return FPDFText_MarkedContent::Done;
841   }
842 
843   FX_STRSIZE nItems = actText.GetLength();
844   if (nItems < 1)
845     return FPDFText_MarkedContent::Pass;
846 
847   CPDF_Font* pFont = pTextObj->GetFont();
848   bExist = false;
849   for (FX_STRSIZE i = 0; i < nItems; i++) {
850     if (pFont->CharCodeFromUnicode(actText.GetAt(i)) !=
851         CPDF_Font::kInvalidCharCode) {
852       bExist = true;
853       break;
854     }
855   }
856   if (!bExist)
857     return FPDFText_MarkedContent::Pass;
858 
859   bExist = false;
860   for (FX_STRSIZE i = 0; i < nItems; i++) {
861     FX_WCHAR wChar = actText.GetAt(i);
862     if ((wChar > 0x80 && wChar < 0xFFFD) || (wChar <= 0x80 && isprint(wChar))) {
863       bExist = true;
864       break;
865     }
866   }
867   if (!bExist)
868     return FPDFText_MarkedContent::Done;
869 
870   return FPDFText_MarkedContent::Delay;
871 }
872 
ProcessMarkedContent(PDFTEXT_Obj Obj)873 void CPDF_TextPage::ProcessMarkedContent(PDFTEXT_Obj Obj) {
874   CPDF_TextObject* pTextObj = Obj.m_pTextObj;
875   if (!pTextObj->m_ContentMark)
876     return;
877 
878   int nContentMark = pTextObj->m_ContentMark.CountItems();
879   if (nContentMark < 1)
880     return;
881 
882   CFX_WideString actText;
883   for (int n = 0; n < nContentMark; n++) {
884     const CPDF_ContentMarkItem& item = pTextObj->m_ContentMark.GetItem(n);
885     CPDF_Dictionary* pDict = item.GetParam();
886     if (pDict)
887       actText = pDict->GetUnicodeTextFor("ActualText");
888   }
889   FX_STRSIZE nItems = actText.GetLength();
890   if (nItems < 1)
891     return;
892 
893   CPDF_Font* pFont = pTextObj->GetFont();
894   CFX_Matrix matrix = pTextObj->GetTextMatrix();
895   matrix.Concat(Obj.m_formMatrix);
896 
897   for (FX_STRSIZE k = 0; k < nItems; k++) {
898     FX_WCHAR wChar = actText.GetAt(k);
899     if (wChar <= 0x80 && !isprint(wChar))
900       wChar = 0x20;
901     if (wChar >= 0xFFFD)
902       continue;
903 
904     PAGECHAR_INFO charinfo;
905     charinfo.m_Origin = pTextObj->GetPos();
906     charinfo.m_Index = m_TextBuf.GetLength();
907     charinfo.m_Unicode = wChar;
908     charinfo.m_CharCode = pFont->CharCodeFromUnicode(wChar);
909     charinfo.m_Flag = FPDFTEXT_CHAR_PIECE;
910     charinfo.m_pTextObj = pTextObj;
911     charinfo.m_CharBox = pTextObj->GetRect();
912     charinfo.m_Matrix = matrix;
913     m_TempTextBuf.AppendChar(wChar);
914     m_TempCharList.push_back(charinfo);
915   }
916 }
917 
FindPreviousTextObject()918 void CPDF_TextPage::FindPreviousTextObject() {
919   if (m_TempCharList.empty() && m_CharList.empty())
920     return;
921 
922   PAGECHAR_INFO preChar =
923       m_TempCharList.empty() ? m_CharList.back() : m_TempCharList.back();
924 
925   if (preChar.m_pTextObj)
926     m_pPreTextObj = preChar.m_pTextObj;
927 }
928 
SwapTempTextBuf(int32_t iCharListStartAppend,int32_t iBufStartAppend)929 void CPDF_TextPage::SwapTempTextBuf(int32_t iCharListStartAppend,
930                                     int32_t iBufStartAppend) {
931   int32_t i = iCharListStartAppend;
932   int32_t j = pdfium::CollectionSize<int32_t>(m_TempCharList) - 1;
933   for (; i < j; i++, j--) {
934     std::swap(m_TempCharList[i], m_TempCharList[j]);
935     std::swap(m_TempCharList[i].m_Index, m_TempCharList[j].m_Index);
936   }
937   FX_WCHAR* pTempBuffer = m_TempTextBuf.GetBuffer();
938   i = iBufStartAppend;
939   j = m_TempTextBuf.GetLength() - 1;
940   for (; i < j; i++, j--)
941     std::swap(pTempBuffer[i], pTempBuffer[j]);
942 }
943 
IsRightToLeft(const CPDF_TextObject * pTextObj,const CPDF_Font * pFont,int nItems) const944 bool CPDF_TextPage::IsRightToLeft(const CPDF_TextObject* pTextObj,
945                                   const CPDF_Font* pFont,
946                                   int nItems) const {
947   CFX_WideString str;
948   for (int32_t i = 0; i < nItems; i++) {
949     CPDF_TextObjectItem item;
950     pTextObj->GetItemInfo(i, &item);
951     if (item.m_CharCode == static_cast<uint32_t>(-1))
952       continue;
953     CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
954     FX_WCHAR wChar = wstrItem.GetAt(0);
955     if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode)
956       wChar = (FX_WCHAR)item.m_CharCode;
957     if (wChar)
958       str += wChar;
959   }
960   return CFX_BidiString(str).OverallDirection() == CFX_BidiChar::RIGHT;
961 }
962 
ProcessTextObject(PDFTEXT_Obj Obj)963 void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) {
964   CPDF_TextObject* pTextObj = Obj.m_pTextObj;
965   if (FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f)
966     return;
967   CFX_Matrix formMatrix = Obj.m_formMatrix;
968   CPDF_Font* pFont = pTextObj->GetFont();
969   CFX_Matrix matrix = pTextObj->GetTextMatrix();
970   matrix.Concat(formMatrix);
971 
972   FPDFText_MarkedContent ePreMKC = PreMarkedContent(Obj);
973   if (ePreMKC == FPDFText_MarkedContent::Done) {
974     m_pPreTextObj = pTextObj;
975     m_perMatrix = formMatrix;
976     return;
977   }
978   GenerateCharacter result = GenerateCharacter::None;
979   if (m_pPreTextObj) {
980     result = ProcessInsertObject(pTextObj, formMatrix);
981     if (result == GenerateCharacter::LineBreak)
982       m_CurlineRect = Obj.m_pTextObj->GetRect();
983     else
984       m_CurlineRect.Union(Obj.m_pTextObj->GetRect());
985 
986     switch (result) {
987       case GenerateCharacter::None:
988         break;
989       case GenerateCharacter::Space: {
990         PAGECHAR_INFO generateChar;
991         if (GenerateCharInfo(TEXT_SPACE_CHAR, generateChar)) {
992           if (!formMatrix.IsIdentity())
993             generateChar.m_Matrix = formMatrix;
994           m_TempTextBuf.AppendChar(TEXT_SPACE_CHAR);
995           m_TempCharList.push_back(generateChar);
996         }
997         break;
998       }
999       case GenerateCharacter::LineBreak:
1000         CloseTempLine();
1001         if (m_TextBuf.GetSize()) {
1002           AppendGeneratedCharacter(TEXT_RETURN_CHAR, formMatrix);
1003           AppendGeneratedCharacter(TEXT_LINEFEED_CHAR, formMatrix);
1004         }
1005         break;
1006       case GenerateCharacter::Hyphen:
1007         if (pTextObj->CountChars() == 1) {
1008           CPDF_TextObjectItem item;
1009           pTextObj->GetCharInfo(0, &item);
1010           CFX_WideString wstrItem =
1011               pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
1012           if (wstrItem.IsEmpty())
1013             wstrItem += (FX_WCHAR)item.m_CharCode;
1014           FX_WCHAR curChar = wstrItem.GetAt(0);
1015           if (curChar == 0x2D || curChar == 0xAD)
1016             return;
1017         }
1018         while (m_TempTextBuf.GetSize() > 0 &&
1019                m_TempTextBuf.AsStringC().GetAt(m_TempTextBuf.GetLength() - 1) ==
1020                    0x20) {
1021           m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1022           m_TempCharList.pop_back();
1023         }
1024         PAGECHAR_INFO* charinfo = &m_TempCharList.back();
1025         m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1026         charinfo->m_Unicode = 0x2;
1027         charinfo->m_Flag = FPDFTEXT_CHAR_HYPHEN;
1028         m_TempTextBuf.AppendChar(0xfffe);
1029         break;
1030     }
1031   } else {
1032     m_CurlineRect = Obj.m_pTextObj->GetRect();
1033   }
1034 
1035   if (ePreMKC == FPDFText_MarkedContent::Delay) {
1036     ProcessMarkedContent(Obj);
1037     m_pPreTextObj = pTextObj;
1038     m_perMatrix = formMatrix;
1039     return;
1040   }
1041   m_pPreTextObj = pTextObj;
1042   m_perMatrix = formMatrix;
1043   int nItems = pTextObj->CountItems();
1044   FX_FLOAT baseSpace = CalculateBaseSpace(pTextObj, matrix);
1045 
1046   const bool bR2L = IsRightToLeft(pTextObj, pFont, nItems);
1047   const bool bIsBidiAndMirrorInverse =
1048       bR2L && (matrix.a * matrix.d - matrix.b * matrix.c) < 0;
1049   int32_t iBufStartAppend = m_TempTextBuf.GetLength();
1050   int32_t iCharListStartAppend =
1051       pdfium::CollectionSize<int32_t>(m_TempCharList);
1052 
1053   FX_FLOAT spacing = 0;
1054   for (int i = 0; i < nItems; i++) {
1055     CPDF_TextObjectItem item;
1056     PAGECHAR_INFO charinfo;
1057     pTextObj->GetItemInfo(i, &item);
1058     if (item.m_CharCode == static_cast<uint32_t>(-1)) {
1059       CFX_WideString str = m_TempTextBuf.MakeString();
1060       if (str.IsEmpty())
1061         str = m_TextBuf.AsStringC();
1062       if (str.IsEmpty() || str.GetAt(str.GetLength() - 1) == TEXT_SPACE_CHAR)
1063         continue;
1064 
1065       FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1066       spacing = -fontsize_h * item.m_Origin.x / 1000;
1067       continue;
1068     }
1069     FX_FLOAT charSpace = pTextObj->m_TextState.GetCharSpace();
1070     if (charSpace > 0.001)
1071       spacing += matrix.TransformDistance(charSpace);
1072     else if (charSpace < -0.001)
1073       spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
1074     spacing -= baseSpace;
1075     if (spacing && i > 0) {
1076       int last_width = 0;
1077       FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1078       uint32_t space_charcode = pFont->CharCodeFromUnicode(' ');
1079       FX_FLOAT threshold = 0;
1080       if (space_charcode != CPDF_Font::kInvalidCharCode)
1081         threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
1082       if (threshold > fontsize_h / 3)
1083         threshold = 0;
1084       else
1085         threshold /= 2;
1086       if (threshold == 0) {
1087         threshold = fontsize_h;
1088         int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
1089         threshold = this_width > last_width ? (FX_FLOAT)this_width
1090                                             : (FX_FLOAT)last_width;
1091         threshold = NormalizeThreshold(threshold);
1092         threshold = fontsize_h * threshold / 1000;
1093       }
1094       if (threshold && (spacing && spacing >= threshold)) {
1095         charinfo.m_Unicode = TEXT_SPACE_CHAR;
1096         charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED;
1097         charinfo.m_pTextObj = pTextObj;
1098         charinfo.m_Index = m_TextBuf.GetLength();
1099         m_TempTextBuf.AppendChar(TEXT_SPACE_CHAR);
1100         charinfo.m_CharCode = CPDF_Font::kInvalidCharCode;
1101         charinfo.m_Matrix = formMatrix;
1102         charinfo.m_Origin = matrix.Transform(item.m_Origin);
1103         charinfo.m_CharBox =
1104             CFX_FloatRect(charinfo.m_Origin.x, charinfo.m_Origin.y,
1105                           charinfo.m_Origin.x, charinfo.m_Origin.y);
1106         m_TempCharList.push_back(charinfo);
1107       }
1108       if (item.m_CharCode == CPDF_Font::kInvalidCharCode)
1109         continue;
1110     }
1111     spacing = 0;
1112     CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
1113     bool bNoUnicode = false;
1114     if (wstrItem.IsEmpty() && item.m_CharCode) {
1115       wstrItem += static_cast<FX_WCHAR>(item.m_CharCode);
1116       bNoUnicode = true;
1117     }
1118     charinfo.m_Index = -1;
1119     charinfo.m_CharCode = item.m_CharCode;
1120     if (bNoUnicode)
1121       charinfo.m_Flag = FPDFTEXT_CHAR_UNUNICODE;
1122     else
1123       charinfo.m_Flag = FPDFTEXT_CHAR_NORMAL;
1124 
1125     charinfo.m_pTextObj = pTextObj;
1126     charinfo.m_Origin = matrix.Transform(item.m_Origin);
1127 
1128     FX_RECT rect =
1129         charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode);
1130     charinfo.m_CharBox.top =
1131         rect.top * pTextObj->GetFontSize() / 1000 + item.m_Origin.y;
1132     charinfo.m_CharBox.left =
1133         rect.left * pTextObj->GetFontSize() / 1000 + item.m_Origin.x;
1134     charinfo.m_CharBox.right =
1135         rect.right * pTextObj->GetFontSize() / 1000 + item.m_Origin.x;
1136     charinfo.m_CharBox.bottom =
1137         rect.bottom * pTextObj->GetFontSize() / 1000 + item.m_Origin.y;
1138     if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) < 0.01f) {
1139       charinfo.m_CharBox.top =
1140           charinfo.m_CharBox.bottom + pTextObj->GetFontSize();
1141     }
1142     if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) < 0.01f) {
1143       charinfo.m_CharBox.right =
1144           charinfo.m_CharBox.left + pTextObj->GetCharWidth(charinfo.m_CharCode);
1145     }
1146     matrix.TransformRect(charinfo.m_CharBox);
1147     charinfo.m_Matrix = matrix;
1148     if (wstrItem.IsEmpty()) {
1149       charinfo.m_Unicode = 0;
1150       m_TempCharList.push_back(charinfo);
1151       m_TempTextBuf.AppendChar(0xfffe);
1152       continue;
1153     } else {
1154       int nTotal = wstrItem.GetLength();
1155       bool bDel = false;
1156       const int count =
1157           std::min(pdfium::CollectionSize<int>(m_TempCharList), 7);
1158       FX_FLOAT threshold = charinfo.m_Matrix.TransformXDistance(
1159           (FX_FLOAT)TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize());
1160       for (int n = pdfium::CollectionSize<int>(m_TempCharList);
1161            n > pdfium::CollectionSize<int>(m_TempCharList) - count; n--) {
1162         const PAGECHAR_INFO& charinfo1 = m_TempCharList[n - 1];
1163         CFX_PointF diff = charinfo1.m_Origin - charinfo.m_Origin;
1164         if (charinfo1.m_CharCode == charinfo.m_CharCode &&
1165             charinfo1.m_pTextObj->GetFont() == charinfo.m_pTextObj->GetFont() &&
1166             FXSYS_fabs(diff.x) < threshold && FXSYS_fabs(diff.y) < threshold) {
1167           bDel = true;
1168           break;
1169         }
1170       }
1171       if (!bDel) {
1172         for (int nIndex = 0; nIndex < nTotal; nIndex++) {
1173           charinfo.m_Unicode = wstrItem.GetAt(nIndex);
1174           if (charinfo.m_Unicode) {
1175             charinfo.m_Index = m_TextBuf.GetLength();
1176             m_TempTextBuf.AppendChar(charinfo.m_Unicode);
1177           } else {
1178             m_TempTextBuf.AppendChar(0xfffe);
1179           }
1180           m_TempCharList.push_back(charinfo);
1181         }
1182       } else if (i == 0) {
1183         CFX_WideString str = m_TempTextBuf.MakeString();
1184         if (!str.IsEmpty() &&
1185             str.GetAt(str.GetLength() - 1) == TEXT_SPACE_CHAR) {
1186           m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1187           m_TempCharList.pop_back();
1188         }
1189       }
1190     }
1191   }
1192   if (bIsBidiAndMirrorInverse)
1193     SwapTempTextBuf(iCharListStartAppend, iBufStartAppend);
1194 }
1195 
GetTextObjectWritingMode(const CPDF_TextObject * pTextObj) const1196 CPDF_TextPage::TextOrientation CPDF_TextPage::GetTextObjectWritingMode(
1197     const CPDF_TextObject* pTextObj) const {
1198   int32_t nChars = pTextObj->CountChars();
1199   if (nChars == 1)
1200     return m_TextlineDir;
1201 
1202   CPDF_TextObjectItem first, last;
1203   pTextObj->GetCharInfo(0, &first);
1204   pTextObj->GetCharInfo(nChars - 1, &last);
1205 
1206   CFX_Matrix textMatrix = pTextObj->GetTextMatrix();
1207   first.m_Origin = textMatrix.Transform(first.m_Origin);
1208   last.m_Origin = textMatrix.Transform(last.m_Origin);
1209 
1210   FX_FLOAT dX = FXSYS_fabs(last.m_Origin.x - first.m_Origin.x);
1211   FX_FLOAT dY = FXSYS_fabs(last.m_Origin.y - first.m_Origin.y);
1212   if (dX <= 0.0001f && dY <= 0.0001f)
1213     return TextOrientation::Unknown;
1214 
1215   CFX_VectorF v(dX, dY);
1216   v.Normalize();
1217   if (v.y <= 0.0872f)
1218     return v.x <= 0.0872f ? m_TextlineDir : TextOrientation::Horizontal;
1219 
1220   if (v.x <= 0.0872f)
1221     return TextOrientation::Vertical;
1222 
1223   return m_TextlineDir;
1224 }
1225 
IsHyphen(FX_WCHAR curChar)1226 bool CPDF_TextPage::IsHyphen(FX_WCHAR curChar) {
1227   CFX_WideString strCurText = m_TempTextBuf.MakeString();
1228   if (strCurText.IsEmpty())
1229     strCurText = m_TextBuf.AsStringC();
1230   FX_STRSIZE nCount = strCurText.GetLength();
1231   int nIndex = nCount - 1;
1232   FX_WCHAR wcTmp = strCurText.GetAt(nIndex);
1233   while (wcTmp == 0x20 && nIndex <= nCount - 1 && nIndex >= 0)
1234     wcTmp = strCurText.GetAt(--nIndex);
1235   if (0x2D == wcTmp || 0xAD == wcTmp) {
1236     if (--nIndex > 0) {
1237       FX_WCHAR preChar = strCurText.GetAt((nIndex));
1238       if (((preChar >= L'A' && preChar <= L'Z') ||
1239            (preChar >= L'a' && preChar <= L'z')) &&
1240           ((curChar >= L'A' && curChar <= L'Z') ||
1241            (curChar >= L'a' && curChar <= L'z'))) {
1242         return true;
1243       }
1244     }
1245     const PAGECHAR_INFO* preInfo;
1246     if (!m_TempCharList.empty())
1247       preInfo = &m_TempCharList.back();
1248     else if (!m_CharList.empty())
1249       preInfo = &m_CharList.back();
1250     else
1251       return false;
1252     if (FPDFTEXT_CHAR_PIECE == preInfo->m_Flag &&
1253         (0xAD == preInfo->m_Unicode || 0x2D == preInfo->m_Unicode)) {
1254       return true;
1255     }
1256   }
1257   return false;
1258 }
1259 
ProcessInsertObject(const CPDF_TextObject * pObj,const CFX_Matrix & formMatrix)1260 CPDF_TextPage::GenerateCharacter CPDF_TextPage::ProcessInsertObject(
1261     const CPDF_TextObject* pObj,
1262     const CFX_Matrix& formMatrix) {
1263   FindPreviousTextObject();
1264   TextOrientation WritingMode = GetTextObjectWritingMode(pObj);
1265   if (WritingMode == TextOrientation::Unknown)
1266     WritingMode = GetTextObjectWritingMode(m_pPreTextObj);
1267 
1268   CFX_FloatRect this_rect = pObj->GetRect();
1269   CFX_FloatRect prev_rect = m_pPreTextObj->GetRect();
1270   CPDF_TextObjectItem PrevItem;
1271   CPDF_TextObjectItem item;
1272   int nItem = m_pPreTextObj->CountItems();
1273   m_pPreTextObj->GetItemInfo(nItem - 1, &PrevItem);
1274   pObj->GetItemInfo(0, &item);
1275   CFX_WideString wstrItem =
1276       pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
1277   if (wstrItem.IsEmpty())
1278     wstrItem += static_cast<FX_WCHAR>(item.m_CharCode);
1279   FX_WCHAR curChar = wstrItem.GetAt(0);
1280   if (WritingMode == TextOrientation::Horizontal) {
1281     if (this_rect.Height() > 4.5 && prev_rect.Height() > 4.5) {
1282       FX_FLOAT top =
1283           this_rect.top < prev_rect.top ? this_rect.top : prev_rect.top;
1284       FX_FLOAT bottom = this_rect.bottom > prev_rect.bottom ? this_rect.bottom
1285                                                             : prev_rect.bottom;
1286       if (bottom >= top) {
1287         return IsHyphen(curChar) ? GenerateCharacter::Hyphen
1288                                  : GenerateCharacter::LineBreak;
1289       }
1290     }
1291   } else if (WritingMode == TextOrientation::Vertical) {
1292     if (this_rect.Width() > pObj->GetFontSize() * 0.1f &&
1293         prev_rect.Width() > m_pPreTextObj->GetFontSize() * 0.1f) {
1294       FX_FLOAT left = this_rect.left > m_CurlineRect.left ? this_rect.left
1295                                                           : m_CurlineRect.left;
1296       FX_FLOAT right = this_rect.right < m_CurlineRect.right
1297                            ? this_rect.right
1298                            : m_CurlineRect.right;
1299       if (right <= left) {
1300         return IsHyphen(curChar) ? GenerateCharacter::Hyphen
1301                                  : GenerateCharacter::LineBreak;
1302       }
1303     }
1304   }
1305 
1306   FX_FLOAT last_pos = PrevItem.m_Origin.x;
1307   int nLastWidth = GetCharWidth(PrevItem.m_CharCode, m_pPreTextObj->GetFont());
1308   FX_FLOAT last_width = nLastWidth * m_pPreTextObj->GetFontSize() / 1000;
1309   last_width = FXSYS_fabs(last_width);
1310   int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont());
1311   FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000;
1312   this_width = FXSYS_fabs(this_width);
1313   FX_FLOAT threshold =
1314       last_width > this_width ? last_width / 4 : this_width / 4;
1315 
1316   CFX_Matrix prev_matrix = m_pPreTextObj->GetTextMatrix();
1317   prev_matrix.Concat(m_perMatrix);
1318 
1319   CFX_Matrix prev_reverse;
1320   prev_reverse.SetReverse(prev_matrix);
1321 
1322   CFX_PointF pos = prev_reverse.Transform(formMatrix.Transform(pObj->GetPos()));
1323   if (last_width < this_width)
1324     threshold = prev_reverse.TransformDistance(threshold);
1325 
1326   bool bNewline = false;
1327   if (WritingMode == TextOrientation::Horizontal) {
1328     CFX_FloatRect rect1(m_pPreTextObj->m_Left, pObj->m_Bottom,
1329                         m_pPreTextObj->m_Right, pObj->m_Top);
1330     CFX_FloatRect rect2 = m_pPreTextObj->GetRect();
1331     CFX_FloatRect rect3 = rect1;
1332     rect1.Intersect(rect2);
1333     if ((rect1.IsEmpty() && rect2.Height() > 5 && rect3.Height() > 5) ||
1334         ((pos.y > threshold * 2 || pos.y < threshold * -3) &&
1335          (FXSYS_fabs(pos.y) < 1 ? FXSYS_fabs(pos.x) < FXSYS_fabs(pos.y)
1336                                 : true))) {
1337       bNewline = true;
1338       if (nItem > 1) {
1339         CPDF_TextObjectItem tempItem;
1340         m_pPreTextObj->GetItemInfo(0, &tempItem);
1341         CFX_Matrix m = m_pPreTextObj->GetTextMatrix();
1342         if (PrevItem.m_Origin.x > tempItem.m_Origin.x &&
1343             m_DisplayMatrix.a > 0.9 && m_DisplayMatrix.b < 0.1 &&
1344             m_DisplayMatrix.c < 0.1 && m_DisplayMatrix.d < -0.9 && m.b < 0.1 &&
1345             m.c < 0.1) {
1346           CFX_FloatRect re(0, m_pPreTextObj->m_Bottom, 1000,
1347                            m_pPreTextObj->m_Top);
1348           if (re.Contains(pObj->GetPos())) {
1349             bNewline = false;
1350           } else {
1351             CFX_FloatRect rect(0, pObj->m_Bottom, 1000, pObj->m_Top);
1352             if (rect.Contains(m_pPreTextObj->GetPos()))
1353               bNewline = false;
1354           }
1355         }
1356       }
1357     }
1358   }
1359   if (bNewline) {
1360     return IsHyphen(curChar) ? GenerateCharacter::Hyphen
1361                              : GenerateCharacter::LineBreak;
1362   }
1363 
1364   int32_t nChars = pObj->CountChars();
1365   if (nChars == 1 && (0x2D == curChar || 0xAD == curChar) &&
1366       IsHyphen(curChar)) {
1367     return GenerateCharacter::Hyphen;
1368   }
1369   CFX_WideString PrevStr =
1370       m_pPreTextObj->GetFont()->UnicodeFromCharCode(PrevItem.m_CharCode);
1371   FX_WCHAR preChar = PrevStr.GetAt(PrevStr.GetLength() - 1);
1372   CFX_Matrix matrix = pObj->GetTextMatrix();
1373   matrix.Concat(formMatrix);
1374 
1375   threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth);
1376   threshold = threshold > 400
1377                   ? (threshold < 700
1378                          ? threshold / 4
1379                          : (threshold > 800 ? threshold / 6 : threshold / 5))
1380                   : (threshold / 2);
1381   if (nLastWidth >= nThisWidth) {
1382     threshold *= FXSYS_fabs(m_pPreTextObj->GetFontSize());
1383   } else {
1384     threshold *= FXSYS_fabs(pObj->GetFontSize());
1385     threshold = matrix.TransformDistance(threshold);
1386     threshold = prev_reverse.TransformDistance(threshold);
1387   }
1388   threshold /= 1000;
1389   if ((threshold < 1.4881 && threshold > 1.4879) ||
1390       (threshold < 1.39001 && threshold > 1.38999)) {
1391     threshold *= 1.5;
1392   }
1393   if (FXSYS_fabs(last_pos + last_width - pos.x) > threshold &&
1394       curChar != L' ' && preChar != L' ') {
1395     if (curChar != L' ' && preChar != L' ') {
1396       if ((pos.x - last_pos - last_width) > threshold ||
1397           (last_pos - pos.x - last_width) > threshold) {
1398         return GenerateCharacter::Space;
1399       }
1400       if (pos.x < 0 && (last_pos - pos.x - last_width) > threshold)
1401         return GenerateCharacter::Space;
1402       if ((pos.x - last_pos - last_width) > this_width ||
1403           (pos.x - last_pos - this_width) > last_width) {
1404         return GenerateCharacter::Space;
1405       }
1406     }
1407   }
1408   return GenerateCharacter::None;
1409 }
1410 
IsSameTextObject(CPDF_TextObject * pTextObj1,CPDF_TextObject * pTextObj2)1411 bool CPDF_TextPage::IsSameTextObject(CPDF_TextObject* pTextObj1,
1412                                      CPDF_TextObject* pTextObj2) {
1413   if (!pTextObj1 || !pTextObj2)
1414     return false;
1415 
1416   CFX_FloatRect rcPreObj = pTextObj2->GetRect();
1417   CFX_FloatRect rcCurObj = pTextObj1->GetRect();
1418   if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty()) {
1419     FX_FLOAT dbXdif = FXSYS_fabs(rcPreObj.left - rcCurObj.left);
1420     size_t nCount = m_CharList.size();
1421     if (nCount >= 2) {
1422       PAGECHAR_INFO perCharTemp = m_CharList[nCount - 2];
1423       FX_FLOAT dbSpace = perCharTemp.m_CharBox.Width();
1424       if (dbXdif > dbSpace)
1425         return false;
1426     }
1427   }
1428   if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) {
1429     rcPreObj.Intersect(rcCurObj);
1430     if (rcPreObj.IsEmpty())
1431       return false;
1432     if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) >
1433         rcCurObj.Width() / 2) {
1434       return false;
1435     }
1436     if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize())
1437       return false;
1438   }
1439   int nPreCount = pTextObj2->CountItems();
1440   int nCurCount = pTextObj1->CountItems();
1441   if (nPreCount != nCurCount)
1442     return false;
1443   // If both objects have no items, consider them same.
1444   if (!nPreCount)
1445     return true;
1446 
1447   CPDF_TextObjectItem itemPer;
1448   CPDF_TextObjectItem itemCur;
1449   for (int i = 0; i < nPreCount; i++) {
1450     pTextObj2->GetItemInfo(i, &itemPer);
1451     pTextObj1->GetItemInfo(i, &itemCur);
1452     if (itemCur.m_CharCode != itemPer.m_CharCode)
1453       return false;
1454   }
1455 
1456   CFX_PointF diff = pTextObj1->GetPos() - pTextObj2->GetPos();
1457   FX_FLOAT font_size = pTextObj2->GetFontSize();
1458   FX_FLOAT char_size = GetCharWidth(itemPer.m_CharCode, pTextObj2->GetFont());
1459   FX_FLOAT max_pre_size =
1460       std::max(std::max(rcPreObj.Height(), rcPreObj.Width()), font_size);
1461   if (FXSYS_fabs(diff.x) > char_size * font_size / 1000 * 0.9 ||
1462       FXSYS_fabs(diff.y) > max_pre_size / 8) {
1463     return false;
1464   }
1465   return true;
1466 }
1467 
IsSameAsPreTextObject(CPDF_TextObject * pTextObj,const CPDF_PageObjectList * pObjList,CPDF_PageObjectList::const_iterator iter)1468 bool CPDF_TextPage::IsSameAsPreTextObject(
1469     CPDF_TextObject* pTextObj,
1470     const CPDF_PageObjectList* pObjList,
1471     CPDF_PageObjectList::const_iterator iter) {
1472   int i = 0;
1473   while (i < 5 && iter != pObjList->begin()) {
1474     --iter;
1475     CPDF_PageObject* pOtherObj = iter->get();
1476     if (pOtherObj == pTextObj || !pOtherObj->IsText())
1477       continue;
1478     if (IsSameTextObject(pOtherObj->AsText(), pTextObj))
1479       return true;
1480     ++i;
1481   }
1482   return false;
1483 }
1484 
GenerateCharInfo(FX_WCHAR unicode,PAGECHAR_INFO & info)1485 bool CPDF_TextPage::GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info) {
1486   const PAGECHAR_INFO* preChar;
1487   if (!m_TempCharList.empty())
1488     preChar = &m_TempCharList.back();
1489   else if (!m_CharList.empty())
1490     preChar = &m_CharList.back();
1491   else
1492     return false;
1493 
1494   info.m_Index = m_TextBuf.GetLength();
1495   info.m_Unicode = unicode;
1496   info.m_pTextObj = nullptr;
1497   info.m_CharCode = CPDF_Font::kInvalidCharCode;
1498   info.m_Flag = FPDFTEXT_CHAR_GENERATED;
1499 
1500   int preWidth = 0;
1501   if (preChar->m_pTextObj && preChar->m_CharCode != -1) {
1502     preWidth =
1503         GetCharWidth(preChar->m_CharCode, preChar->m_pTextObj->GetFont());
1504   }
1505 
1506   FX_FLOAT fFontSize = preChar->m_pTextObj ? preChar->m_pTextObj->GetFontSize()
1507                                            : preChar->m_CharBox.Height();
1508   if (!fFontSize)
1509     fFontSize = kDefaultFontSize;
1510 
1511   info.m_Origin = CFX_PointF(
1512       preChar->m_Origin.x + preWidth * (fFontSize) / 1000, preChar->m_Origin.y);
1513   info.m_CharBox = CFX_FloatRect(info.m_Origin.x, info.m_Origin.y,
1514                                  info.m_Origin.x, info.m_Origin.y);
1515   return true;
1516 }
1517 
IsRectIntersect(const CFX_FloatRect & rect1,const CFX_FloatRect & rect2)1518 bool CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1,
1519                                     const CFX_FloatRect& rect2) {
1520   CFX_FloatRect rect = rect1;
1521   rect.Intersect(rect2);
1522   return !rect.IsEmpty();
1523 }
1524