// Copyright 2016 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 
// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #ifndef CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
8 #define CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
9 
10 #include <deque>
11 #include <vector>
12 
13 #include "core/fpdfapi/page/cpdf_pageobjectlist.h"
14 #include "core/fxcrt/fx_basic.h"
15 #include "core/fxcrt/fx_coordinates.h"
16 #include "core/fxcrt/fx_string.h"
17 
// Forward declarations. This header only refers to these types through
// pointers, so their full definitions are not required here.
class CPDF_Font;
class CPDF_FormObject;
class CPDF_Page;
class CPDF_TextObject;
22 
// Single-bit option flags; each value occupies a distinct bit, so they can be
// OR-ed together.
// NOTE(review): the names suggest text-search options, but the code that
// consumes them is not part of this header -- confirm there.
#define FPDFTEXT_MATCHCASE 0x00000001
#define FPDFTEXT_MATCHWHOLEWORD 0x00000002
#define FPDFTEXT_CONSECUTIVE 0x00000004

// Character classification codes.
// NOTE(review): presumably the values carried by the m_Flag members of
// FPDF_CHAR_INFO / PAGECHAR_INFO -- confirm against the implementation.
// The negative value is parenthesized so the macro always expands to a single
// operand regardless of the surrounding expression.
#define FPDFTEXT_CHAR_ERROR (-1)
#define FPDFTEXT_CHAR_NORMAL 0
#define FPDFTEXT_CHAR_GENERATED 1
#define FPDFTEXT_CHAR_UNUNICODE 2
#define FPDFTEXT_CHAR_HYPHEN 3
#define FPDFTEXT_CHAR_PIECE 4

// Wide-character and wide-string literals for whitespace and line breaks.
#define TEXT_SPACE_CHAR L' '
#define TEXT_LINEFEED_CHAR L'\n'
#define TEXT_RETURN_CHAR L'\r'
#define TEXT_EMPTY L""
#define TEXT_SPACE L" "
#define TEXT_RETURN_LINEFEED L"\r\n"
#define TEXT_LINEFEED L"\n"

// Floating-point constant (a double literal).
// NOTE(review): the name suggests a character-gap ratio threshold; its use is
// not visible in this header.
#define TEXT_CHARRATIO_GAPDELTA 0.070
42 
// Tri-state result type; CPDF_TextPage::PreMarkedContent() returns it.
// NOTE(review): the meaning of each state is defined by the implementation
// file, which is not visible from this header.
enum class FPDFText_MarkedContent { Pass = 0, Done, Delay };
44 
// Direction selector accepted by the CPDF_TextPage constructor.
// NOTE(review): presumably the reading direction applied while parsing --
// confirm against the implementation.
enum class FPDFText_Direction { Left = -1, Right = 1 };
46 
47 class FPDF_CHAR_INFO {
48  public:
49   FPDF_CHAR_INFO();
50   ~FPDF_CHAR_INFO();
51 
52   FX_WCHAR m_Unicode;
53   FX_WCHAR m_Charcode;
54   int32_t m_Flag;
55   FX_FLOAT m_FontSize;
56   CFX_PointF m_Origin;
57   CFX_FloatRect m_CharBox;
58   CPDF_TextObject* m_pTextObj;
59   CFX_Matrix m_Matrix;
60 };
61 
// A contiguous run described by a start position and a length.
// NOTE(review): presumably expressed in character indices -- users of this
// struct are not visible in this header.
struct FPDF_SEGMENT {
  int m_Start;
  int m_nCount;
};
66 
67 class PAGECHAR_INFO {
68  public:
69   PAGECHAR_INFO();
70   PAGECHAR_INFO(const PAGECHAR_INFO&);
71   ~PAGECHAR_INFO();
72 
73   int m_Index;
74   int m_CharCode;
75   FX_WCHAR m_Unicode;
76   int32_t m_Flag;
77   CFX_PointF m_Origin;
78   CFX_FloatRect m_CharBox;
79   CPDF_TextObject* m_pTextObj;
80   CFX_Matrix m_Matrix;
81 };
82 
83 struct PDFTEXT_Obj {
84   CPDF_TextObject* m_pTextObj;
85   CFX_Matrix m_formMatrix;
86 };
87 
88 class CPDF_TextPage {
89  public:
90   CPDF_TextPage(const CPDF_Page* pPage, FPDFText_Direction flags);
91   ~CPDF_TextPage();
92 
93   // IPDF_TextPage:
94   void ParseTextPage();
IsParsed()95   bool IsParsed() const { return m_bIsParsed; }
96   int CharIndexFromTextIndex(int TextIndex) const;
97   int TextIndexFromCharIndex(int CharIndex) const;
98   int CountChars() const;
99   void GetCharInfo(int index, FPDF_CHAR_INFO* info) const;
100   std::vector<CFX_FloatRect> GetRectArray(int start, int nCount) const;
101   int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const;
102   CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const;
103   CFX_WideString GetPageText(int start = 0, int nCount = -1) const;
104   int CountRects(int start, int nCount);
105   void GetRect(int rectIndex,
106                FX_FLOAT& left,
107                FX_FLOAT& top,
108                FX_FLOAT& right,
109                FX_FLOAT& bottom) const;
110 
111   static bool IsRectIntersect(const CFX_FloatRect& rect1,
112                               const CFX_FloatRect& rect2);
113 
114  private:
115   enum class TextOrientation {
116     Unknown,
117     Horizontal,
118     Vertical,
119   };
120 
121   enum class GenerateCharacter {
122     None,
123     Space,
124     LineBreak,
125     Hyphen,
126   };
127 
128   bool IsHyphen(FX_WCHAR curChar);
129   bool IsControlChar(const PAGECHAR_INFO& charInfo);
130   void ProcessObject();
131   void ProcessFormObject(CPDF_FormObject* pFormObj,
132                          const CFX_Matrix& formMatrix);
133   void ProcessTextObject(PDFTEXT_Obj pObj);
134   void ProcessTextObject(CPDF_TextObject* pTextObj,
135                          const CFX_Matrix& formMatrix,
136                          const CPDF_PageObjectList* pObjList,
137                          CPDF_PageObjectList::const_iterator ObjPos);
138   GenerateCharacter ProcessInsertObject(const CPDF_TextObject* pObj,
139                                         const CFX_Matrix& formMatrix);
140   bool GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info);
141   bool IsSameAsPreTextObject(CPDF_TextObject* pTextObj,
142                              const CPDF_PageObjectList* pObjList,
143                              CPDF_PageObjectList::const_iterator ObjPos);
144   bool IsSameTextObject(CPDF_TextObject* pTextObj1, CPDF_TextObject* pTextObj2);
145   int GetCharWidth(uint32_t charCode, CPDF_Font* pFont) const;
146   void CloseTempLine();
147   FPDFText_MarkedContent PreMarkedContent(PDFTEXT_Obj pObj);
148   void ProcessMarkedContent(PDFTEXT_Obj pObj);
149   void CheckMarkedContentObject(int32_t& start, int32_t& nCount) const;
150   void FindPreviousTextObject();
151   void AddCharInfoByLRDirection(FX_WCHAR wChar, PAGECHAR_INFO info);
152   void AddCharInfoByRLDirection(FX_WCHAR wChar, PAGECHAR_INFO info);
153   TextOrientation GetTextObjectWritingMode(
154       const CPDF_TextObject* pTextObj) const;
155   TextOrientation FindTextlineFlowOrientation() const;
156   void AppendGeneratedCharacter(FX_WCHAR unicode, const CFX_Matrix& formMatrix);
157 
158   void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend);
159   bool IsRightToLeft(const CPDF_TextObject* pTextObj,
160                      const CPDF_Font* pFont,
161                      int nItems) const;
162 
163   const CPDF_Page* const m_pPage;
164   std::vector<uint16_t> m_CharIndex;
165   std::deque<PAGECHAR_INFO> m_CharList;
166   std::deque<PAGECHAR_INFO> m_TempCharList;
167   CFX_WideTextBuf m_TextBuf;
168   CFX_WideTextBuf m_TempTextBuf;
169   const FPDFText_Direction m_parserflag;
170   CPDF_TextObject* m_pPreTextObj;
171   CFX_Matrix m_perMatrix;
172   bool m_bIsParsed;
173   CFX_Matrix m_DisplayMatrix;
174   std::vector<CFX_FloatRect> m_SelRects;
175   std::vector<PDFTEXT_Obj> m_LineObj;
176   TextOrientation m_TextlineDir;
177   CFX_FloatRect m_CurlineRect;
178 };
179 
180 #endif  // CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
181