1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #ifndef CORE_SRC_FPDFTEXT_TEXT_INT_H_ 8 #define CORE_SRC_FPDFTEXT_TEXT_INT_H_ 9 10 #include "core/include/fpdftext/fpdf_text.h" 11 #include "core/include/fxcrt/fx_basic.h" 12 13 class CFX_BidiChar; 14 class CPDF_DocProgressiveSearch; 15 class CPDF_FormObject; 16 class CPDF_LinkExtract; 17 class CPDF_TextPageFind; 18 19 #define FPDFTEXT_CHAR_ERROR -1 20 #define FPDFTEXT_CHAR_NORMAL 0 21 #define FPDFTEXT_CHAR_GENERATED 1 22 #define FPDFTEXT_CHAR_UNUNICODE 2 23 #define FPDFTEXT_CHAR_HYPHEN 3 24 #define FPDFTEXT_CHAR_PIECE 4 25 #define FPDFTEXT_MC_PASS 0 26 #define FPDFTEXT_MC_DONE 1 27 #define FPDFTEXT_MC_DELAY 2 28 29 typedef struct _PAGECHAR_INFO { 30 int m_CharCode; 31 FX_WCHAR m_Unicode; 32 FX_FLOAT m_OriginX; 33 FX_FLOAT m_OriginY; 34 int32_t m_Flag; 35 CFX_FloatRect m_CharBox; 36 CPDF_TextObject* m_pTextObj; 37 CFX_Matrix m_Matrix; 38 int m_Index; 39 } PAGECHAR_INFO; 40 typedef CFX_SegmentedArray<PAGECHAR_INFO> PAGECHAR_InfoArray; 41 typedef struct { 42 int m_Start; 43 int m_nCount; 44 } FPDF_SEGMENT; 45 typedef CFX_ArrayTemplate<FPDF_SEGMENT> SEGMENT_Array; 46 typedef struct { 47 CPDF_TextObject* m_pTextObj; 48 CFX_Matrix m_formMatrix; 49 } PDFTEXT_Obj; 50 typedef CFX_ArrayTemplate<PDFTEXT_Obj> LINEOBJ; 51 52 class CPDF_TextPage : public IPDF_TextPage { 53 public: 54 CPDF_TextPage(const CPDF_Page* pPage, int flags); ~CPDF_TextPage()55 ~CPDF_TextPage() override {} 56 57 // IPDF_TextPage 58 FX_BOOL ParseTextPage() override; 59 void NormalizeObjects(FX_BOOL bNormalize) override; IsParsed()60 bool IsParsed() const override { return m_bIsParsed; } 61 int CharIndexFromTextIndex(int TextIndex) const override; 62 int TextIndexFromCharIndex(int CharIndex) const override; 63 int CountChars() const override; 64 void GetCharInfo(int index, FPDF_CHAR_INFO* info) const override; 65 void GetRectArray(int start, 66 int nCount, 67 CFX_RectArray& rectArray) const override; 68 int GetIndexAtPos(CPDF_Point point, 69 FX_FLOAT xTolerance, 70 FX_FLOAT yTolerance) const override; 71 int GetIndexAtPos(FX_FLOAT x, 72 FX_FLOAT y, 73 FX_FLOAT xTolerance, 74 FX_FLOAT yTolerance) const override; 75 CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const override; 76 void GetRectsArrayByRect(const CFX_FloatRect& rect, 77 CFX_RectArray& resRectArray) const override; 78 CFX_WideString GetPageText(int start = 0, int nCount = -1) const override; 79 int CountRects(int start, int nCount) override; 80 void GetRect(int rectIndex, 81 FX_FLOAT& left, 82 FX_FLOAT& top, 83 FX_FLOAT& right, 84 FX_FLOAT& bottom) const override; 85 FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate) override; 86 FX_BOOL GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate) override; 87 int CountBoundedSegments(FX_FLOAT left, 88 FX_FLOAT top, 89 FX_FLOAT right, 90 FX_FLOAT bottom, 91 FX_BOOL bContains = FALSE) override; 92 void GetBoundedSegment(int index, int& start, int& count) const override; 93 int GetWordBreak(int index, int direction) const override; 94 GetCharList()95 const PAGECHAR_InfoArray* GetCharList() const { return &m_charList; } 96 static FX_BOOL IsRectIntersect(const CFX_FloatRect& rect1, 97 const CFX_FloatRect& rect2); 98 static FX_BOOL IsLetter(FX_WCHAR unicode); 99 100 private: 101 FX_BOOL IsHyphen(FX_WCHAR curChar); 102 bool IsControlChar(const PAGECHAR_INFO& charInfo); 103 FX_BOOL GetBaselineRotate(int start, int end, int& Rotate); 104 void ProcessObject(); 105 void ProcessFormObject(CPDF_FormObject* pFormObj, 106 const CFX_Matrix& formMatrix); 107 void ProcessTextObject(PDFTEXT_Obj pObj); 108 void ProcessTextObject(CPDF_TextObject* pTextObj, 109 const CFX_Matrix& formMatrix, 110 FX_POSITION ObjPos); 111 int ProcessInsertObject(const CPDF_TextObject* pObj, 112 const CFX_Matrix& formMatrix); 113 FX_BOOL GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info); 114 FX_BOOL IsSameAsPreTextObject(CPDF_TextObject* pTextObj, FX_POSITION ObjPos); 115 FX_BOOL IsSameTextObject(CPDF_TextObject* pTextObj1, 116 CPDF_TextObject* pTextObj2); 117 int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const; 118 void CloseTempLine(); 119 void OnPiece(CFX_BidiChar* pBidi, CFX_WideString& str); 120 int32_t PreMarkedContent(PDFTEXT_Obj pObj); 121 void ProcessMarkedContent(PDFTEXT_Obj pObj); 122 void CheckMarkedContentObject(int32_t& start, int32_t& nCount) const; 123 void FindPreviousTextObject(void); 124 void AddCharInfoByLRDirection(CFX_WideString& str, int i); 125 void AddCharInfoByRLDirection(CFX_WideString& str, int i); 126 int32_t GetTextObjectWritingMode(const CPDF_TextObject* pTextObj); 127 int32_t FindTextlineFlowDirection(); 128 129 void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend); 130 FX_BOOL IsRightToLeft(const CPDF_TextObject* pTextObj, 131 const CPDF_Font* pFont, 132 int nItems) const; 133 134 CPDFText_ParseOptions m_ParseOptions; 135 CFX_WordArray m_CharIndex; 136 const CPDF_PageObjects* const m_pPage; 137 PAGECHAR_InfoArray m_charList; 138 CFX_WideTextBuf m_TextBuf; 139 PAGECHAR_InfoArray m_TempCharList; 140 CFX_WideTextBuf m_TempTextBuf; 141 const int m_parserflag; 142 CPDF_TextObject* m_pPreTextObj; 143 CFX_Matrix m_perMatrix; 144 bool m_bIsParsed; 145 CFX_Matrix m_DisplayMatrix; 146 SEGMENT_Array m_Segment; 147 CFX_RectArray m_SelRects; 148 LINEOBJ m_LineObj; 149 int32_t m_TextlineDir; 150 CFX_FloatRect m_CurlineRect; 151 }; 152 153 class CPDF_TextPageFind : public IPDF_TextPageFind { 154 public: 155 explicit CPDF_TextPageFind(const IPDF_TextPage* pTextPage); ~CPDF_TextPageFind()156 ~CPDF_TextPageFind() override {} 157 158 // IPDF_TextPageFind 159 FX_BOOL FindFirst(const CFX_WideString& findwhat, 160 int flags, 161 int startPos = 0) override; 162 FX_BOOL FindNext() override; 163 FX_BOOL FindPrev() override; 164 void GetRectArray(CFX_RectArray& rects) const override; 165 int GetCurOrder() const override; 166 int GetMatchedCount() const override; 167 168 protected: 169 void ExtractFindWhat(const CFX_WideString& findwhat); 170 FX_BOOL IsMatchWholeWord(const CFX_WideString& csPageText, 171 int startPos, 172 int endPos); 173 FX_BOOL ExtractSubString(CFX_WideString& rString, 174 const FX_WCHAR* lpszFullString, 175 int iSubString, 176 FX_WCHAR chSep); 177 CFX_WideString MakeReverse(const CFX_WideString& str); 178 int ReverseFind(const CFX_WideString& csPageText, 179 const CFX_WideString& csWord, 180 int nStartPos, 181 int& WordLength); 182 int GetCharIndex(int index) const; 183 184 private: 185 CFX_WordArray m_CharIndex; 186 const IPDF_TextPage* m_pTextPage; 187 CFX_WideString m_strText; 188 CFX_WideString m_findWhat; 189 int m_flags; 190 CFX_WideStringArray m_csFindWhatArray; 191 int m_findNextStart; 192 int m_findPreStart; 193 FX_BOOL m_bMatchCase; 194 FX_BOOL m_bMatchWholeWord; 195 int m_resStart; 196 int m_resEnd; 197 CFX_RectArray m_resArray; 198 FX_BOOL m_IsFind; 199 }; 200 201 class CPDF_LinkExt { 202 public: CPDF_LinkExt()203 CPDF_LinkExt() {} 204 int m_Start; 205 int m_Count; 206 CFX_WideString m_strUrl; ~CPDF_LinkExt()207 virtual ~CPDF_LinkExt() {} 208 }; 209 210 typedef CFX_ArrayTemplate<CPDF_LinkExt*> LINK_InfoArray; 211 212 class CPDF_LinkExtract : public IPDF_LinkExtract { 213 public: 214 CPDF_LinkExtract(); 215 ~CPDF_LinkExtract() override; 216 217 // IPDF_LinkExtract 218 FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage) override; 219 int CountLinks() const override; 220 CFX_WideString GetURL(int index) const override; 221 void GetBoundedSegment(int index, int& start, int& count) const override; 222 void GetRects(int index, CFX_RectArray& rects) const override; 223 IsExtract()224 FX_BOOL IsExtract() const { return m_bIsParsed; } 225 226 protected: 227 void ParseLink(); 228 void DeleteLinkList(); 229 FX_BOOL CheckWebLink(CFX_WideString& strBeCheck); 230 bool CheckMailLink(CFX_WideString& str); 231 void AppendToLinkList(int start, int count, const CFX_WideString& strUrl); 232 233 private: 234 LINK_InfoArray m_LinkList; 235 const CPDF_TextPage* m_pTextPage; 236 CFX_WideString m_strPageText; 237 bool m_bIsParsed; 238 }; 239 240 FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst); 241 void NormalizeString(CFX_WideString& str); 242 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest); 243 void GetTextStream_Unicode(CFX_WideTextBuf& buffer, 244 CPDF_PageObjects* pPage, 245 FX_BOOL bUseLF, 246 CFX_PtrArray* pObjArray); 247 248 #endif // CORE_SRC_FPDFTEXT_TEXT_INT_H_ 249