1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #ifndef CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_
8 #define CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_
9 
10 #include "core/include/fpdfapi/fpdf_parser.h"
11 
// Forward declarations of classes named in this header. The ones that are not
// defined here (e.g. CPDF_Page, CPDF_TextObject, IPDF_ReflowedPage) are only
// used through pointers in the declarations of this header, so incomplete
// types are sufficient.
class CPDF_Page;
class CPDF_PageObjects;
class CPDF_TextObject;
class IPDF_LinkExtract;
class IPDF_ReflowedPage;
class IPDF_TextPage;
class IPDF_TextPageFind;
19 
// PDF2TXT_* option bits. Each value is a distinct power of two, so the options
// can be combined with bitwise OR.
// NOTE(review): presumably these are the values accepted by the |flags|
// parameter of the PDF_Get* functions declared in this header - confirm
// against the implementation.
#define PDF2TXT_AUTO_ROTATE 1
#define PDF2TXT_AUTO_WIDTH 2
#define PDF2TXT_KEEP_COLUMN 4
#define PDF2TXT_USE_OCR 8
#define PDF2TXT_INCLUDE_INVISIBLE 16
25 void PDF_GetPageText(CFX_ByteStringArray& lines,
26                      CPDF_Document* pDoc,
27                      CPDF_Dictionary* pPage,
28                      int iMinWidth,
29                      FX_DWORD flags);
30 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines,
31                              CPDF_Document* pDoc,
32                              CPDF_Dictionary* pPage,
33                              int iMinWidth,
34                              FX_DWORD flags);
35 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer,
36                                CPDF_Document* pDoc,
37                                CPDF_Dictionary* pPage,
38                                FX_DWORD flags);
39 CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc,
40                                             CPDF_Dictionary* pPage);
// CHAR_* codes.
// NOTE(review): presumably the values stored in FPDF_CHAR_INFO::m_Flag -
// confirm against the implementation.
// The negative value is parenthesized so the macro always expands to a single
// primary expression, whatever operators surround it at the point of use.
#define CHAR_ERROR (-1)
#define CHAR_NORMAL 0
#define CHAR_GENERATED 1
#define CHAR_UNUNICODE 2
45 
46 struct FPDF_CHAR_INFO {
47   FX_WCHAR m_Unicode;
48   FX_WCHAR m_Charcode;
49   int32_t m_Flag;
50   FX_FLOAT m_FontSize;
51   FX_FLOAT m_OriginX;
52   FX_FLOAT m_OriginY;
53   CFX_FloatRect m_CharBox;
54   CPDF_TextObject* m_pTextObj;
55   CFX_Matrix m_Matrix;
56 };
57 
58 typedef CFX_ArrayTemplate<CFX_FloatRect> CFX_RectArray;
// FPDFTEXT_* layout codes.
// NOTE(review): by their names these look like text-flow orders (left-to-right
// top-to-bottom, right-to-left top-to-bottom, top-to-bottom right-to-left) -
// confirm against the code that uses them.
#define FPDFTEXT_LRTB 0
#define FPDFTEXT_RLTB 1
#define FPDFTEXT_TBRL 2

// Direction codes; opposite directions are negatives of each other
// (LEFT == -RIGHT, UP == -DOWN). Negative values are parenthesized so each
// macro always expands to a single primary expression.
// NOTE(review): presumably the |direction| argument of
// IPDF_TextPage::GetWordBreak() - confirm against the implementation.
#define FPDFTEXT_LEFT (-1)
#define FPDFTEXT_RIGHT 1
#define FPDFTEXT_UP (-2)
#define FPDFTEXT_DOWN 2

// Writing-mode codes. Unlike FPDFTEXT_LRTB/RLTB/TBRL above, these start at 1
// and reserve 0 for "unknown". The spelling "UNKNOW" is part of the public
// name and is kept as-is.
#define FPDFTEXT_WRITINGMODE_UNKNOW 0
#define FPDFTEXT_WRITINGMODE_LRTB 1
#define FPDFTEXT_WRITINGMODE_RLTB 2
#define FPDFTEXT_WRITINGMODE_TBRL 3
70 class CPDFText_ParseOptions {
71  public:
72   CPDFText_ParseOptions();
73   FX_BOOL m_bGetCharCodeOnly;
74   FX_BOOL m_bNormalizeObjs;
75   FX_BOOL m_bOutputHyphen;
76 };
77 
78 class IPDF_TextPage {
79  public:
80   static IPDF_TextPage* CreateTextPage(const CPDF_Page* pPage, int flags = 0);
81   static IPDF_TextPage* CreateReflowTextPage(IPDF_ReflowedPage* pRefPage);
82 
~IPDF_TextPage()83   virtual ~IPDF_TextPage() {}
84 
85   virtual void NormalizeObjects(FX_BOOL bNormalize) = 0;
86 
87   virtual FX_BOOL ParseTextPage() = 0;
88 
89   virtual bool IsParsed() const = 0;
90 
91   virtual int CharIndexFromTextIndex(int TextIndex) const = 0;
92 
93   virtual int TextIndexFromCharIndex(int CharIndex) const = 0;
94 
95   virtual int CountChars() const = 0;
96 
97   virtual void GetCharInfo(int index, FPDF_CHAR_INFO* info) const = 0;
98 
99   virtual void GetRectArray(int start,
100                             int nCount,
101                             CFX_RectArray& rectArray) const = 0;
102 
103   virtual int GetIndexAtPos(CPDF_Point point,
104                             FX_FLOAT xTolerance,
105                             FX_FLOAT yTolerance) const = 0;
106 
107   virtual int GetIndexAtPos(FX_FLOAT x,
108                             FX_FLOAT y,
109                             FX_FLOAT xTolerance,
110                             FX_FLOAT yTolerance) const = 0;
111 
112   virtual CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const = 0;
113 
114   virtual void GetRectsArrayByRect(const CFX_FloatRect& rect,
115                                    CFX_RectArray& resRectArray) const = 0;
116 
117   virtual int CountRects(int start, int nCount) = 0;
118 
119   virtual void GetRect(int rectIndex,
120                        FX_FLOAT& left,
121                        FX_FLOAT& top,
122                        FX_FLOAT& right,
123                        FX_FLOAT& bottom) const = 0;
124 
125   virtual FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate) = 0;
126 
127   virtual FX_BOOL GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate) = 0;
128 
129   virtual int CountBoundedSegments(FX_FLOAT left,
130                                    FX_FLOAT top,
131                                    FX_FLOAT right,
132                                    FX_FLOAT bottom,
133                                    FX_BOOL bContains = FALSE) = 0;
134 
135   virtual void GetBoundedSegment(int index, int& start, int& count) const = 0;
136 
137   virtual int GetWordBreak(int index, int direction) const = 0;
138 
139   virtual CFX_WideString GetPageText(int start = 0, int nCount = -1) const = 0;
140 };
141 
// Search option bits. Each value is a distinct power of two, so the options
// can be combined with bitwise OR.
// NOTE(review): presumably the values accepted by the |flags| parameter of
// IPDF_TextPageFind::FindFirst() - confirm against the implementation.
#define FPDFTEXT_MATCHCASE 0x00000001
#define FPDFTEXT_MATCHWHOLEWORD 0x00000002
#define FPDFTEXT_CONSECUTIVE 0x00000004
145 class IPDF_TextPageFind {
146  public:
~IPDF_TextPageFind()147   virtual ~IPDF_TextPageFind() {}
148 
149   static IPDF_TextPageFind* CreatePageFind(const IPDF_TextPage* pTextPage);
150 
151  public:
152   virtual FX_BOOL FindFirst(const CFX_WideString& findwhat,
153                             int flags,
154                             int startPos = 0) = 0;
155 
156   virtual FX_BOOL FindNext() = 0;
157 
158   virtual FX_BOOL FindPrev() = 0;
159 
160   virtual void GetRectArray(CFX_RectArray& rects) const = 0;
161 
162   virtual int GetCurOrder() const = 0;
163 
164   virtual int GetMatchedCount() const = 0;
165 };
166 class IPDF_LinkExtract {
167  public:
~IPDF_LinkExtract()168   virtual ~IPDF_LinkExtract() {}
169 
170   static IPDF_LinkExtract* CreateLinkExtract();
171 
172   virtual FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage) = 0;
173 
174  public:
175   virtual int CountLinks() const = 0;
176 
177   virtual CFX_WideString GetURL(int index) const = 0;
178 
179   virtual void GetBoundedSegment(int index, int& start, int& count) const = 0;
180 
181   virtual void GetRects(int index, CFX_RectArray& rects) const = 0;
182 };
183 
184 #endif  // CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_
185