1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #ifndef CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
8 #define CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
9 
10 #include <deque>
11 #include <vector>
12 
13 #include "core/fpdfapi/page/cpdf_pageobjectlist.h"
14 #include "core/fxcrt/cfx_widetextbuf.h"
15 #include "core/fxcrt/fx_coordinates.h"
16 #include "core/fxcrt/fx_string.h"
17 #include "core/fxcrt/unowned_ptr.h"
18 
19 class CPDF_Font;
20 class CPDF_FormObject;
21 class CPDF_Page;
22 class CPDF_TextObject;
23 
// Option bits; each value occupies a distinct bit so they can be OR-ed
// together. NOTE(review): presumably text-search options -- no consumer of
// these flags is visible in this header, confirm against the callers.
#define FPDFTEXT_MATCHCASE      0x00000001
#define FPDFTEXT_MATCHWHOLEWORD 0x00000002
#define FPDFTEXT_CONSECUTIVE    0x00000004
27 
// Character kinds, numbered consecutively from 0 (mutually exclusive values,
// not bit flags). NOTE(review): presumably the values stored in the m_Flag
// members of FPDF_CHAR_INFO / PAGECHAR_INFO -- confirm in the .cpp.
#define FPDFTEXT_CHAR_NORMAL    0
#define FPDFTEXT_CHAR_GENERATED 1
#define FPDFTEXT_CHAR_UNUNICODE 2
#define FPDFTEXT_CHAR_HYPHEN    3
#define FPDFTEXT_CHAR_PIECE     4
33 
// Wide-character constants.
#define TEXT_SPACE_CHAR    L' '
#define TEXT_LINEFEED_CHAR L'\n'
#define TEXT_RETURN_CHAR   L'\r'
#define TEXT_HYPHEN_CHAR   L'-'

// Wide-string constants.
#define TEXT_EMPTY  L""
#define TEXT_HYPHEN L"-"

// Floating-point (double) ratio. NOTE(review): presumably a gap threshold
// relative to character size -- its use is not visible in this header.
#define TEXT_CHARRATIO_GAPDELTA 0.070
41 
// Three-way result type; it is the return type of
// CPDF_TextPage::PreMarkedContent().
enum class FPDFText_MarkedContent {
  Pass = 0,
  Done,
  Delay,
};
43 
// Direction selector; a value of this type is passed to the CPDF_TextPage
// constructor. The two enumerators are the integers -1 and +1.
enum class FPDFText_Direction {
  Left = -1,
  Right = 1,
};
45 
46 class FPDF_CHAR_INFO {
47  public:
48   FPDF_CHAR_INFO();
49   ~FPDF_CHAR_INFO();
50 
51   wchar_t m_Unicode;
52   wchar_t m_Charcode;
53   int32_t m_Flag;
54   float m_FontSize;
55   CFX_PointF m_Origin;
56   CFX_FloatRect m_CharBox;
57   UnownedPtr<CPDF_TextObject> m_pTextObj;
58   CFX_Matrix m_Matrix;
59 };
60 
// A (start, count) pair. Both fields default to 0 so that a
// default-initialized segment never carries indeterminate values; the struct
// is still an aggregate, so `FPDF_SEGMENT{start, count}` keeps working.
// NOTE(review): no use of this type is visible in this header.
struct FPDF_SEGMENT {
  int m_Start = 0;
  int m_nCount = 0;
};
65 
66 class PAGECHAR_INFO {
67  public:
68   PAGECHAR_INFO();
69   PAGECHAR_INFO(const PAGECHAR_INFO&);
70   ~PAGECHAR_INFO();
71 
72   int m_Index;
73   int m_CharCode;
74   wchar_t m_Unicode;
75   int32_t m_Flag;
76   CFX_PointF m_Origin;
77   CFX_FloatRect m_CharBox;
78   UnownedPtr<CPDF_TextObject> m_pTextObj;
79   CFX_Matrix m_Matrix;
80 };
81 
82 struct PDFTEXT_Obj {
83   PDFTEXT_Obj();
84   PDFTEXT_Obj(const PDFTEXT_Obj& that);
85   ~PDFTEXT_Obj();
86 
87   UnownedPtr<CPDF_TextObject> m_pTextObj;
88   CFX_Matrix m_formMatrix;
89 };
90 
91 class CPDF_TextPage {
92  public:
93   CPDF_TextPage(const CPDF_Page* pPage, FPDFText_Direction flags);
94   ~CPDF_TextPage();
95 
96   // IPDF_TextPage:
97   void ParseTextPage();
IsParsed()98   bool IsParsed() const { return m_bIsParsed; }
99   int CharIndexFromTextIndex(int TextIndex) const;
100   int TextIndexFromCharIndex(int CharIndex) const;
101   int CountChars() const;
102   void GetCharInfo(int index, FPDF_CHAR_INFO* info) const;
103   std::vector<CFX_FloatRect> GetRectArray(int start, int nCount) const;
104   int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const;
105   WideString GetTextByRect(const CFX_FloatRect& rect) const;
106 
107   // Returns string with the text from |m_TextBuf| that are covered by the input
108   // range. |start| and |count| are in terms of the m_CharIndex, so the range
109   // will be converted into appropriate indices.
110   WideString GetPageText(int start, int count) const;
GetAllPageText()111   WideString GetAllPageText() const { return GetPageText(0, CountChars()); }
112 
113   int CountRects(int start, int nCount);
114   bool GetRect(int rectIndex, CFX_FloatRect* pRect) const;
115 
116   static bool IsRectIntersect(const CFX_FloatRect& rect1,
117                               const CFX_FloatRect& rect2);
118 
119  private:
120   enum class TextOrientation {
121     Unknown,
122     Horizontal,
123     Vertical,
124   };
125 
126   enum class GenerateCharacter {
127     None,
128     Space,
129     LineBreak,
130     Hyphen,
131   };
132 
133   bool IsHyphen(wchar_t curChar) const;
134   bool IsControlChar(const PAGECHAR_INFO& charInfo);
135   void ProcessObject();
136   void ProcessFormObject(CPDF_FormObject* pFormObj,
137                          const CFX_Matrix& formMatrix);
138   void ProcessTextObject(PDFTEXT_Obj pObj);
139   void ProcessTextObject(CPDF_TextObject* pTextObj,
140                          const CFX_Matrix& formMatrix,
141                          const CPDF_PageObjectList* pObjList,
142                          CPDF_PageObjectList::const_iterator ObjPos);
143   GenerateCharacter ProcessInsertObject(const CPDF_TextObject* pObj,
144                                         const CFX_Matrix& formMatrix);
145   bool GenerateCharInfo(wchar_t unicode, PAGECHAR_INFO& info);
146   bool IsSameAsPreTextObject(CPDF_TextObject* pTextObj,
147                              const CPDF_PageObjectList* pObjList,
148                              CPDF_PageObjectList::const_iterator ObjPos);
149   bool IsSameTextObject(CPDF_TextObject* pTextObj1, CPDF_TextObject* pTextObj2);
150   int GetCharWidth(uint32_t charCode, CPDF_Font* pFont) const;
151   void CloseTempLine();
152   FPDFText_MarkedContent PreMarkedContent(PDFTEXT_Obj pObj);
153   void ProcessMarkedContent(PDFTEXT_Obj pObj);
154   void CheckMarkedContentObject(int32_t& start, int32_t& nCount) const;
155   void FindPreviousTextObject();
156   void AddCharInfoByLRDirection(wchar_t wChar, PAGECHAR_INFO info);
157   void AddCharInfoByRLDirection(wchar_t wChar, PAGECHAR_INFO info);
158   TextOrientation GetTextObjectWritingMode(
159       const CPDF_TextObject* pTextObj) const;
160   TextOrientation FindTextlineFlowOrientation() const;
161   void AppendGeneratedCharacter(wchar_t unicode, const CFX_Matrix& formMatrix);
162 
163   void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend);
164   bool IsRightToLeft(const CPDF_TextObject* pTextObj,
165                      const CPDF_Font* pFont,
166                      size_t nItems) const;
167 
168   UnownedPtr<const CPDF_Page> const m_pPage;
169   std::vector<uint16_t> m_CharIndex;
170   std::deque<PAGECHAR_INFO> m_CharList;
171   std::deque<PAGECHAR_INFO> m_TempCharList;
172   CFX_WideTextBuf m_TextBuf;
173   CFX_WideTextBuf m_TempTextBuf;
174   const FPDFText_Direction m_parserflag;
175   UnownedPtr<CPDF_TextObject> m_pPreTextObj;
176   CFX_Matrix m_perMatrix;
177   bool m_bIsParsed;
178   CFX_Matrix m_DisplayMatrix;
179   std::vector<CFX_FloatRect> m_SelRects;
180   std::vector<PDFTEXT_Obj> m_LineObj;
181   TextOrientation m_TextlineDir;
182   CFX_FloatRect m_CurlineRect;
183 };
184 
185 #endif  // CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
186