1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #ifndef CORE_SRC_FPDFTEXT_TEXT_INT_H_ 8 #define CORE_SRC_FPDFTEXT_TEXT_INT_H_ 9 10 class CPDF_TextParseOptions 11 { 12 public: 13 CPDF_TextParseOptions(); 14 FX_BOOL m_bCheckObjectOrder; 15 FX_BOOL m_bCheckDirection; 16 int m_nCheckSameObject; 17 }; 18 class CPDF_TextPage; 19 class CPDF_LinkExtract; 20 class CPDF_TextPageFind; 21 class CPDF_DocProgressiveSearch; 22 #define FPDFTEXT_CHAR_ERROR -1 23 #define FPDFTEXT_CHAR_NORMAL 0 24 #define FPDFTEXT_CHAR_GENERATED 1 25 #define FPDFTEXT_CHAR_UNUNICODE 2 26 #define FPDFTEXT_CHAR_HYPHEN 3 27 #define FPDFTEXT_CHAR_PIECE 4 28 #define FPDFTEXT_MC_PASS 0 29 #define FPDFTEXT_MC_DONE 1 30 #define FPDFTEXT_MC_DELAY 2 31 typedef struct _PAGECHAR_INFO { 32 int m_CharCode; 33 FX_WCHAR m_Unicode; 34 FX_FLOAT m_OriginX; 35 FX_FLOAT m_OriginY; 36 FX_INT32 m_Flag; 37 CFX_FloatRect m_CharBox; 38 CPDF_TextObject* m_pTextObj; 39 CFX_AffineMatrix m_Matrix; 40 int m_Index; 41 } PAGECHAR_INFO; 42 typedef CFX_SegmentedArray<PAGECHAR_INFO> PAGECHAR_InfoArray; 43 typedef struct { 44 int m_Start; 45 int m_nCount; 46 } FPDF_SEGMENT; 47 typedef CFX_ArrayTemplate<FPDF_SEGMENT> SEGMENT_Array; 48 typedef struct { 49 CPDF_TextObject* m_pTextObj; 50 CFX_AffineMatrix m_formMatrix; 51 } PDFTEXT_Obj; 52 typedef CFX_ArrayTemplate<PDFTEXT_Obj> LINEOBJ; 53 class CPDF_TextPage: public IPDF_TextPage 54 { 55 public: 56 CPDF_TextPage(const CPDF_Page* pPage, int flags = 0); 57 CPDF_TextPage(const CPDF_PageObjects* pPage, int flags = 0); 58 CPDF_TextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions); 59 virtual FX_BOOL ParseTextPage(); 60 virtual void NormalizeObjects(FX_BOOL bNormalize); IsParsered()61 virtual FX_BOOL IsParsered() const 62 { 63 return m_IsParsered; 64 } ~CPDF_TextPage()65 virtual ~CPDF_TextPage() {}; 66 public: 67 virtual int CharIndexFromTextIndex(int TextIndex)const ; 68 virtual int TextIndexFromCharIndex(int CharIndex)const; 69 virtual int CountChars() const; 70 virtual void GetCharInfo(int index, FPDF_CHAR_INFO & info) const; 71 virtual void GetRectArray(int start, int nCount, CFX_RectArray& rectArray) const; 72 virtual int GetIndexAtPos(CPDF_Point point, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const; 73 virtual int GetIndexAtPos(FX_FLOAT x, FX_FLOAT y, FX_FLOAT xTorelance, 74 FX_FLOAT yTorelance) const; 75 virtual CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const; 76 virtual void GetRectsArrayByRect(const CFX_FloatRect& rect, CFX_RectArray& resRectArray) const; 77 virtual int GetOrderByDirection(int order, int direction) const; 78 virtual CFX_WideString GetPageText(int start = 0, int nCount = -1) const; 79 80 virtual int CountRects(int start, int nCount); 81 virtual void GetRect(int rectIndex, FX_FLOAT& left, FX_FLOAT& top 82 , FX_FLOAT& right, FX_FLOAT &bottom) const; 83 virtual FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate); 84 virtual FX_BOOL GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate); 85 virtual int CountBoundedSegments(FX_FLOAT left, FX_FLOAT top, 86 FX_FLOAT right, FX_FLOAT bottom, FX_BOOL bContains = FALSE); 87 virtual void GetBoundedSegment(int index, int& start, int& count) const; 88 virtual int GetWordBreak(int index, int direction) const; 89 public: GetCharList()90 const PAGECHAR_InfoArray* GetCharList() const 91 { 92 return &m_charList; 93 } 94 static FX_BOOL IsRectIntersect(const CFX_FloatRect& rect1, const CFX_FloatRect& rect2); 95 static FX_BOOL IsLetter(FX_WCHAR unicode); 96 private: 97 FX_BOOL IsHyphen(FX_WCHAR curChar); 98 FX_BOOL IsControlChar(PAGECHAR_INFO* pCharInfo); 99 FX_BOOL GetBaselineRotate(int start, int end, int& Rotate); 100 void ProcessObject(); 101 void ProcessFormObject(CPDF_FormObject* pFormObj, const CFX_AffineMatrix& formMatrix); 102 void ProcessTextObject(PDFTEXT_Obj pObj); 103 void ProcessTextObject(CPDF_TextObject* pTextObj, const CFX_AffineMatrix& formMatrix, FX_POSITION ObjPos); 104 int ProcessInsertObject(const CPDF_TextObject* pObj, const CFX_AffineMatrix& formMatrix); 105 FX_BOOL GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info); 106 FX_BOOL IsSameAsPreTextObject(CPDF_TextObject* pTextObj, FX_POSITION ObjPos); 107 FX_BOOL IsSameTextObject(CPDF_TextObject* pTextObj1, CPDF_TextObject* pTextObj2); 108 int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const; 109 void CloseTempLine(); 110 void OnPiece(IFX_BidiChar* pBidi, CFX_WideString& str); 111 FX_INT32 PreMarkedContent(PDFTEXT_Obj pObj); 112 void ProcessMarkedContent(PDFTEXT_Obj pObj); 113 void CheckMarkedContentObject(FX_INT32& start, FX_INT32& nCount) const; 114 void FindPreviousTextObject(void); 115 void AddCharInfoByLRDirection(CFX_WideString& str, int i); 116 void AddCharInfoByRLDirection(CFX_WideString& str, int i); 117 FX_INT32 GetTextObjectWritingMode(const CPDF_TextObject* pTextObj); 118 FX_INT32 FindTextlineFlowDirection(); 119 void SwapTempTextBuf(FX_INT32 iCharListStartAppend, 120 FX_INT32 iBufStartAppend); 121 FX_BOOL IsRightToLeft(const CPDF_TextObject* pTextObj, 122 const CPDF_Font* pFont, 123 int nItems) const; 124 protected: 125 CPDFText_ParseOptions m_ParseOptions; 126 CFX_WordArray m_CharIndex; 127 const CPDF_PageObjects* m_pPage; 128 PAGECHAR_InfoArray m_charList; 129 CFX_WideTextBuf m_TextBuf; 130 PAGECHAR_InfoArray m_TempCharList; 131 CFX_WideTextBuf m_TempTextBuf; 132 int m_parserflag; 133 CPDF_TextObject* m_pPreTextObj; 134 CFX_AffineMatrix m_perMatrix; 135 FX_BOOL m_IsParsered; 136 CFX_AffineMatrix m_DisplayMatrix; 137 138 SEGMENT_Array m_Segment; 139 CFX_RectArray m_SelRects; 140 LINEOBJ m_LineObj; 141 FX_BOOL m_TextlineDir; 142 CFX_FloatRect m_CurlineRect; 143 }; 144 class CPDF_TextPageFind: public IPDF_TextPageFind 145 { 146 public: 147 CPDF_TextPageFind(const IPDF_TextPage* pTextPage); ~CPDF_TextPageFind()148 virtual ~CPDF_TextPageFind() {}; 149 public: 150 virtual FX_BOOL FindFirst(const CFX_WideString& findwhat, int flags, int startPos = 0); 151 virtual FX_BOOL FindNext(); 152 virtual FX_BOOL FindPrev(); 153 154 virtual void GetRectArray(CFX_RectArray& rects) const; 155 virtual int GetCurOrder() const; 156 virtual int GetMatchedCount()const; 157 protected: 158 void ExtractFindWhat(const CFX_WideString& findwhat); 159 FX_BOOL IsMatchWholeWord(const CFX_WideString& csPageText, int startPos, int endPos); 160 FX_BOOL ExtractSubString(CFX_WideString& rString, FX_LPCWSTR lpszFullString, 161 int iSubString, FX_WCHAR chSep); 162 CFX_WideString MakeReverse(const CFX_WideString& str); 163 int ReverseFind(const CFX_WideString& csPageText, const CFX_WideString& csWord, int nStartPos, int& WordLength); 164 int GetCharIndex(int index) const; 165 private: 166 CFX_WordArray m_CharIndex; 167 const IPDF_TextPage* m_pTextPage; 168 CFX_WideString m_strText; 169 CFX_WideString m_findWhat; 170 int m_flags; 171 CFX_WideStringArray m_csFindWhatArray; 172 int m_findNextStart; 173 int m_findPreStart; 174 FX_BOOL m_bMatchCase; 175 FX_BOOL m_bMatchWholeWord; 176 int m_resStart; 177 int m_resEnd; 178 CFX_RectArray m_resArray; 179 FX_BOOL m_IsFind; 180 }; 181 class CPDF_LinkExt 182 { 183 public: CPDF_LinkExt()184 CPDF_LinkExt() {}; 185 int m_Start; 186 int m_Count; 187 CFX_WideString m_strUrl; ~CPDF_LinkExt()188 virtual ~CPDF_LinkExt() {}; 189 }; 190 typedef CFX_ArrayTemplate<CPDF_LinkExt*> LINK_InfoArray; 191 class CPDF_LinkExtract: public IPDF_LinkExtract 192 { 193 public: 194 CPDF_LinkExtract(); 195 virtual ~CPDF_LinkExtract(); 196 virtual FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage); IsExtract()197 virtual FX_BOOL IsExtract() const 198 { 199 return m_IsParserd; 200 } 201 public: 202 virtual int CountLinks() const; 203 virtual CFX_WideString GetURL(int index) const; 204 virtual void GetBoundedSegment(int index, int& start, int& count) const; 205 virtual void GetRects(int index, CFX_RectArray& rects)const; 206 protected: 207 void parserLink(); 208 void DeleteLinkList(); 209 FX_BOOL CheckWebLink(CFX_WideString& strBeCheck); 210 FX_BOOL CheckMailLink(CFX_WideString& str); 211 FX_BOOL AppendToLinkList(int start, int count, const CFX_WideString& strUrl); 212 private: 213 LINK_InfoArray m_LinkList; 214 const CPDF_TextPage* m_pTextPage; 215 CFX_WideString m_strPageText; 216 FX_BOOL m_IsParserd; 217 }; 218 FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_LPWSTR pDst); 219 void NormalizeString(CFX_WideString& str); 220 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest); 221 222 #endif // CORE_SRC_FPDFTEXT_TEXT_INT_H_ 223