1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #ifndef CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
8 #define CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
9 
10 #include <deque>
11 #include <functional>
12 #include <vector>
13 
14 #include "core/fpdfapi/page/cpdf_pageobjectholder.h"
15 #include "core/fxcrt/cfx_widetextbuf.h"
16 #include "core/fxcrt/fx_coordinates.h"
17 #include "core/fxcrt/fx_string.h"
18 #include "core/fxcrt/unowned_ptr.h"
19 #include "third_party/base/optional.h"
20 
21 class CPDF_Font;
22 class CPDF_FormObject;
23 class CPDF_Page;
24 class CPDF_TextObject;
25 
26 struct PDFTEXT_Obj {
27   PDFTEXT_Obj();
28   PDFTEXT_Obj(const PDFTEXT_Obj& that);
29   ~PDFTEXT_Obj();
30 
31   UnownedPtr<CPDF_TextObject> m_pTextObj;
32   CFX_Matrix m_formMatrix;
33 };
34 
35 class CPDF_TextPage {
36  public:
37   enum class CharType : uint8_t {
38     kNormal,
39     kGenerated,
40     kNotUnicode,
41     kHyphen,
42     kPiece,
43   };
44 
45   class CharInfo {
46    public:
47     CharInfo();
48     CharInfo(const CharInfo&);
49     ~CharInfo();
50 
51     int m_Index = 0;
52     uint32_t m_CharCode = 0;
53     wchar_t m_Unicode = 0;
54     CharType m_CharType = CharType::kNormal;
55     CFX_PointF m_Origin;
56     CFX_FloatRect m_CharBox;
57     UnownedPtr<CPDF_TextObject> m_pTextObj;
58     CFX_Matrix m_Matrix;
59   };
60 
61   CPDF_TextPage(const CPDF_Page* pPage, bool rtl);
62   ~CPDF_TextPage();
63 
64   int CharIndexFromTextIndex(int text_index) const;
65   int TextIndexFromCharIndex(int char_index) const;
size()66   size_t size() const { return m_CharList.size(); }
67   int CountChars() const;
68 
69   // These methods CHECK() to make sure |index| is within bounds.
70   const CharInfo& GetCharInfo(size_t index) const;
71   float GetCharFontSize(size_t index) const;
72 
73   std::vector<CFX_FloatRect> GetRectArray(int start, int nCount) const;
74   int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const;
75   WideString GetTextByRect(const CFX_FloatRect& rect) const;
76   WideString GetTextByObject(const CPDF_TextObject* pTextObj) const;
77 
78   // Returns string with the text from |m_TextBuf| that are covered by the input
79   // range. |start| and |count| are in terms of the |m_CharIndices|, so the
80   // range will be converted into appropriate indices.
81   WideString GetPageText(int start, int count) const;
GetAllPageText()82   WideString GetAllPageText() const { return GetPageText(0, CountChars()); }
83 
84   int CountRects(int start, int nCount);
85   bool GetRect(int rectIndex, CFX_FloatRect* pRect) const;
86 
87  private:
88   enum class TextOrientation {
89     kUnknown,
90     kHorizontal,
91     kVertical,
92   };
93 
94   enum class GenerateCharacter {
95     kNone,
96     kSpace,
97     kLineBreak,
98     kHyphen,
99   };
100 
101   enum class MarkedContentState { kPass = 0, kDone, kDelay };
102 
103   void Init();
104   bool IsHyphen(wchar_t curChar) const;
105   void ProcessObject();
106   void ProcessFormObject(CPDF_FormObject* pFormObj,
107                          const CFX_Matrix& formMatrix);
108   void ProcessTextObject(PDFTEXT_Obj pObj);
109   void ProcessTextObject(CPDF_TextObject* pTextObj,
110                          const CFX_Matrix& formMatrix,
111                          const CPDF_PageObjectHolder* pObjList,
112                          CPDF_PageObjectHolder::const_iterator ObjPos);
113   GenerateCharacter ProcessInsertObject(const CPDF_TextObject* pObj,
114                                         const CFX_Matrix& formMatrix);
115   const CharInfo* GetPrevCharInfo() const;
116   Optional<CharInfo> GenerateCharInfo(wchar_t unicode);
117   bool IsSameAsPreTextObject(CPDF_TextObject* pTextObj,
118                              const CPDF_PageObjectHolder* pObjList,
119                              CPDF_PageObjectHolder::const_iterator iter) const;
120   bool IsSameTextObject(CPDF_TextObject* pTextObj1,
121                         CPDF_TextObject* pTextObj2) const;
122   void CloseTempLine();
123   MarkedContentState PreMarkedContent(PDFTEXT_Obj pObj);
124   void ProcessMarkedContent(PDFTEXT_Obj pObj);
125   void FindPreviousTextObject();
126   void AddCharInfoByLRDirection(wchar_t wChar, const CharInfo& info);
127   void AddCharInfoByRLDirection(wchar_t wChar, const CharInfo& info);
128   TextOrientation GetTextObjectWritingMode(
129       const CPDF_TextObject* pTextObj) const;
130   TextOrientation FindTextlineFlowOrientation() const;
131   void AppendGeneratedCharacter(wchar_t unicode, const CFX_Matrix& formMatrix);
132   void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend);
133   WideString GetTextByPredicate(
134       const std::function<bool(const CharInfo&)>& predicate) const;
135 
136   UnownedPtr<const CPDF_Page> const m_pPage;
137   std::vector<uint16_t> m_CharIndices;
138   std::deque<CharInfo> m_CharList;
139   std::deque<CharInfo> m_TempCharList;
140   CFX_WideTextBuf m_TextBuf;
141   CFX_WideTextBuf m_TempTextBuf;
142   UnownedPtr<CPDF_TextObject> m_pPrevTextObj;
143   CFX_Matrix m_PrevMatrix;
144   const bool m_rtl;
145   const CFX_Matrix m_DisplayMatrix;
146   std::vector<CFX_FloatRect> m_SelRects;
147   std::vector<PDFTEXT_Obj> m_LineObj;
148   TextOrientation m_TextlineDir = TextOrientation::kUnknown;
149   CFX_FloatRect m_CurlineRect;
150 };
151 
152 #endif  // CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
153