1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #ifndef CORE_SRC_FPDFTEXT_TEXT_INT_H_
8 #define CORE_SRC_FPDFTEXT_TEXT_INT_H_
9 
10 class CPDF_TextParseOptions
11 {
12 public:
13     CPDF_TextParseOptions();
14     FX_BOOL			m_bCheckObjectOrder;
15     FX_BOOL			m_bCheckDirection;
16     int				m_nCheckSameObject;
17 };
// Forward declarations of the text-extraction classes.
class CPDF_TextPage;
class CPDF_LinkExtract;
class CPDF_TextPageFind;
class CPDF_DocProgressiveSearch;

// FPDFTEXT_CHAR_* codes.
// NOTE(review): presumably the values stored in PAGECHAR_INFO::m_Flag --
// confirm against the parser implementation.
#define FPDFTEXT_CHAR_ERROR -1
#define FPDFTEXT_CHAR_NORMAL 0
#define FPDFTEXT_CHAR_GENERATED 1
#define FPDFTEXT_CHAR_UNUNICODE 2
#define FPDFTEXT_CHAR_HYPHEN 3
#define FPDFTEXT_CHAR_PIECE 4

// FPDFTEXT_MC_* codes.
// NOTE(review): presumably the result codes of marked-content handling
// (see CPDF_TextPage::PreMarkedContent) -- confirm in the implementation.
#define FPDFTEXT_MC_PASS 0
#define FPDFTEXT_MC_DONE 1
#define FPDFTEXT_MC_DELAY 2
31 typedef struct _PAGECHAR_INFO {
32     int					m_CharCode;
33     FX_WCHAR			m_Unicode;
34     FX_FLOAT			m_OriginX;
35     FX_FLOAT			m_OriginY;
36     FX_INT32			m_Flag;
37     CFX_FloatRect		m_CharBox;
38     CPDF_TextObject*	m_pTextObj;
39     CFX_AffineMatrix	m_Matrix;
40     int					m_Index;
41 } PAGECHAR_INFO;
42 typedef	CFX_SegmentedArray<PAGECHAR_INFO> PAGECHAR_InfoArray;
43 typedef struct {
44     int	m_Start;
45     int m_nCount;
46 } FPDF_SEGMENT;
47 typedef CFX_ArrayTemplate<FPDF_SEGMENT> SEGMENT_Array;
48 typedef struct {
49     CPDF_TextObject*	m_pTextObj;
50     CFX_AffineMatrix	m_formMatrix;
51 } PDFTEXT_Obj;
52 typedef CFX_ArrayTemplate<PDFTEXT_Obj> LINEOBJ;
53 class CPDF_TextPage: public IPDF_TextPage
54 {
55 public:
56     CPDF_TextPage(const CPDF_Page* pPage, int flags = 0);
57     CPDF_TextPage(const CPDF_PageObjects* pPage, int flags = 0);
58     CPDF_TextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions);
59     virtual FX_BOOL					ParseTextPage();
60     virtual void					NormalizeObjects(FX_BOOL bNormalize);
IsParsered()61     virtual	FX_BOOL					IsParsered() const
62     {
63         return m_IsParsered;
64     }
~CPDF_TextPage()65     virtual ~CPDF_TextPage() {};
66 public:
67     virtual int CharIndexFromTextIndex(int TextIndex)const ;
68     virtual int TextIndexFromCharIndex(int CharIndex)const;
69     virtual int						CountChars() const;
70     virtual	void					GetCharInfo(int index, FPDF_CHAR_INFO & info) const;
71     virtual void					GetRectArray(int start, int nCount, CFX_RectArray& rectArray) const;
72     virtual int						GetIndexAtPos(CPDF_Point point, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const;
73     virtual int						GetIndexAtPos(FX_FLOAT x, FX_FLOAT y, FX_FLOAT xTorelance,
74             FX_FLOAT yTorelance) const;
75     virtual CFX_WideString			GetTextByRect(const CFX_FloatRect& rect) const;
76     virtual void					GetRectsArrayByRect(const CFX_FloatRect& rect, CFX_RectArray& resRectArray) const;
77     virtual	int						GetOrderByDirection(int order, int direction) const;
78     virtual	CFX_WideString			GetPageText(int start = 0, int nCount = -1) const;
79 
80     virtual int						CountRects(int start, int nCount);
81     virtual	void					GetRect(int rectIndex, FX_FLOAT& left, FX_FLOAT& top
82                                             , FX_FLOAT& right, FX_FLOAT &bottom) const;
83     virtual FX_BOOL					GetBaselineRotate(int rectIndex, int& Rotate);
84     virtual FX_BOOL					GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate);
85     virtual	int						CountBoundedSegments(FX_FLOAT left, FX_FLOAT top,
86             FX_FLOAT right, FX_FLOAT bottom, FX_BOOL bContains = FALSE);
87     virtual	void					GetBoundedSegment(int index, int& start, int& count) const;
88     virtual int						GetWordBreak(int index, int direction) const;
89 public:
GetCharList()90     const	PAGECHAR_InfoArray*		GetCharList() const
91     {
92         return &m_charList;
93     }
94     static	FX_BOOL					IsRectIntersect(const CFX_FloatRect& rect1, const CFX_FloatRect& rect2);
95     static	FX_BOOL					IsLetter(FX_WCHAR unicode);
96 private:
97     FX_BOOL							IsHyphen(FX_WCHAR curChar);
98     FX_BOOL							IsControlChar(PAGECHAR_INFO* pCharInfo);
99     FX_BOOL							GetBaselineRotate(int start, int end, int& Rotate);
100     void							ProcessObject();
101     void							ProcessFormObject(CPDF_FormObject*	pFormObj, const CFX_AffineMatrix& formMatrix);
102     void							ProcessTextObject(PDFTEXT_Obj pObj);
103     void							ProcessTextObject(CPDF_TextObject*	pTextObj, const CFX_AffineMatrix& formMatrix, FX_POSITION ObjPos);
104     int								ProcessInsertObject(const CPDF_TextObject* pObj, const CFX_AffineMatrix& formMatrix);
105     FX_BOOL							GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info);
106     FX_BOOL							IsSameAsPreTextObject(CPDF_TextObject* pTextObj, FX_POSITION ObjPos);
107     FX_BOOL							IsSameTextObject(CPDF_TextObject* pTextObj1, CPDF_TextObject* pTextObj2);
108     int								GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const;
109     void							CloseTempLine();
110     void							OnPiece(IFX_BidiChar* pBidi, CFX_WideString& str);
111     FX_INT32	PreMarkedContent(PDFTEXT_Obj pObj);
112     void		ProcessMarkedContent(PDFTEXT_Obj pObj);
113     void		CheckMarkedContentObject(FX_INT32& start, FX_INT32& nCount) const;
114     void		FindPreviousTextObject(void);
115     void		AddCharInfoByLRDirection(CFX_WideString& str, int i);
116     void		AddCharInfoByRLDirection(CFX_WideString& str, int i);
117     FX_INT32	GetTextObjectWritingMode(const CPDF_TextObject* pTextObj);
118     FX_INT32	FindTextlineFlowDirection();
119     void SwapTempTextBuf(FX_INT32 iCharListStartAppend,
120                          FX_INT32 iBufStartAppend);
121     FX_BOOL IsRightToLeft(const CPDF_TextObject* pTextObj,
122                           const CPDF_Font* pFont,
123                           int nItems) const;
124 protected:
125     CPDFText_ParseOptions			m_ParseOptions;
126     CFX_WordArray					m_CharIndex;
127     const CPDF_PageObjects*			m_pPage;
128     PAGECHAR_InfoArray				m_charList;
129     CFX_WideTextBuf					m_TextBuf;
130     PAGECHAR_InfoArray				m_TempCharList;
131     CFX_WideTextBuf					m_TempTextBuf;
132     int								m_parserflag;
133     CPDF_TextObject*				m_pPreTextObj;
134     CFX_AffineMatrix				m_perMatrix;
135     FX_BOOL							m_IsParsered;
136     CFX_AffineMatrix				m_DisplayMatrix;
137 
138     SEGMENT_Array					m_Segment;
139     CFX_RectArray					m_SelRects;
140     LINEOBJ							m_LineObj;
141     FX_BOOL							m_TextlineDir;
142     CFX_FloatRect					m_CurlineRect;
143 };
144 class CPDF_TextPageFind: public IPDF_TextPageFind
145 {
146 public:
147     CPDF_TextPageFind(const IPDF_TextPage* pTextPage);
~CPDF_TextPageFind()148     virtual							~CPDF_TextPageFind() {};
149 public:
150     virtual	FX_BOOL					FindFirst(const CFX_WideString& findwhat, int flags, int startPos = 0);
151     virtual	FX_BOOL					FindNext();
152     virtual	FX_BOOL					FindPrev();
153 
154     virtual void					GetRectArray(CFX_RectArray& rects) const;
155     virtual int						GetCurOrder() const;
156     virtual int						GetMatchedCount()const;
157 protected:
158     void							ExtractFindWhat(const CFX_WideString& findwhat);
159     FX_BOOL							IsMatchWholeWord(const CFX_WideString& csPageText, int startPos, int endPos);
160     FX_BOOL							ExtractSubString(CFX_WideString& rString, FX_LPCWSTR lpszFullString,
161             int iSubString, FX_WCHAR chSep);
162     CFX_WideString					MakeReverse(const CFX_WideString& str);
163     int								ReverseFind(const CFX_WideString& csPageText, const CFX_WideString& csWord, int nStartPos, int& WordLength);
164     int								GetCharIndex(int index) const;
165 private:
166     CFX_WordArray					m_CharIndex;
167     const IPDF_TextPage*			m_pTextPage;
168     CFX_WideString					m_strText;
169     CFX_WideString					m_findWhat;
170     int								m_flags;
171     CFX_WideStringArray				m_csFindWhatArray;
172     int								m_findNextStart;
173     int								m_findPreStart;
174     FX_BOOL							m_bMatchCase;
175     FX_BOOL							m_bMatchWholeWord;
176     int								m_resStart;
177     int								m_resEnd;
178     CFX_RectArray					m_resArray;
179     FX_BOOL							m_IsFind;
180 };
181 class CPDF_LinkExt
182 {
183 public:
CPDF_LinkExt()184     CPDF_LinkExt() {};
185     int								m_Start;
186     int								m_Count;
187     CFX_WideString					m_strUrl;
~CPDF_LinkExt()188     virtual							~CPDF_LinkExt() {};
189 };
190 typedef CFX_ArrayTemplate<CPDF_LinkExt*> LINK_InfoArray;
191 class CPDF_LinkExtract: public IPDF_LinkExtract
192 {
193 public:
194     CPDF_LinkExtract();
195     virtual							~CPDF_LinkExtract();
196     virtual FX_BOOL					ExtractLinks(const IPDF_TextPage* pTextPage);
IsExtract()197     virtual	FX_BOOL					IsExtract() const
198     {
199         return m_IsParserd;
200     }
201 public:
202     virtual int						CountLinks() const;
203     virtual	CFX_WideString			GetURL(int index) const;
204     virtual	void					GetBoundedSegment(int index, int& start, int& count) const;
205     virtual	void					GetRects(int index, CFX_RectArray& rects)const;
206 protected:
207     void							parserLink();
208     void							DeleteLinkList();
209     FX_BOOL							CheckWebLink(CFX_WideString& strBeCheck);
210     FX_BOOL							CheckMailLink(CFX_WideString& str);
211     FX_BOOL							AppendToLinkList(int start, int count, const CFX_WideString& strUrl);
212 private:
213     LINK_InfoArray					m_LinkList;
214     const CPDF_TextPage*			m_pTextPage;
215     CFX_WideString					m_strPageText;
216     FX_BOOL							m_IsParserd;
217 };
218 FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_LPWSTR pDst);
219 void NormalizeString(CFX_WideString& str);
220 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest);
221 
222 #endif  // CORE_SRC_FPDFTEXT_TEXT_INT_H_
223