1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #ifndef CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_ 8 #define CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_ 9 10 #include "../fpdfapi/fpdf_page.h" 11 #include "../fpdfapi/fpdf_pageobj.h" 12 #include "../fpdfapi/fpdf_parser.h" 13 14 class CPDF_PageObjects; 15 class IPDF_LinkExtract; 16 class IPDF_ReflowedPage; 17 class IPDF_TextPage; 18 class IPDF_TextPageFind; 19 20 #define PDF2TXT_AUTO_ROTATE 1 21 #define PDF2TXT_AUTO_WIDTH 2 22 #define PDF2TXT_KEEP_COLUMN 4 23 #define PDF2TXT_USE_OCR 8 24 #define PDF2TXT_INCLUDE_INVISIBLE 16 25 void PDF_GetPageText(CFX_ByteStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage, 26 int iMinWidth, FX_DWORD flags); 27 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage, 28 int iMinWidth, FX_DWORD flags); 29 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_Document* pDoc, CPDF_Dictionary* pPage, 30 FX_DWORD flags); 31 CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, CPDF_Dictionary* pPage); 32 #define CHAR_ERROR -1 33 #define CHAR_NORMAL 0 34 #define CHAR_GENERATED 1 35 #define CHAR_UNUNICODE 2 36 typedef struct { 37 FX_WCHAR m_Unicode; 38 FX_WCHAR m_Charcode; 39 FX_INT32 m_Flag; 40 FX_FLOAT m_FontSize; 41 FX_FLOAT m_OriginX; 42 FX_FLOAT m_OriginY; 43 CFX_FloatRect m_CharBox; 44 CPDF_TextObject* m_pTextObj; 45 CFX_AffineMatrix m_Matrix; 46 } FPDF_CHAR_INFO; 47 typedef CFX_ArrayTemplate<CFX_FloatRect> CFX_RectArray; 48 #define FPDFTEXT_LRTB 0 49 #define FPDFTEXT_RLTB 1 50 #define FPDFTEXT_TBRL 2 51 #define FPDFTEXT_LEFT -1 52 #define FPDFTEXT_RIGHT 1 53 #define FPDFTEXT_UP -2 54 #define FPDFTEXT_DOWN 2 55 #define FPDFTEXT_WRITINGMODE_UNKNOW 0 56 #define FPDFTEXT_WRITINGMODE_LRTB 1 57 #define FPDFTEXT_WRITINGMODE_RLTB 2 58 #define FPDFTEXT_WRITINGMODE_TBRL 3 59 class CPDFText_ParseOptions 60 { 61 public: 62 63 CPDFText_ParseOptions(); 64 FX_BOOL m_bGetCharCodeOnly; 65 FX_BOOL m_bNormalizeObjs; 66 FX_BOOL m_bOutputHyphen; 67 }; 68 class IPDF_TextPage 69 { 70 public: 71 ~IPDF_TextPage()72 virtual ~IPDF_TextPage() {} 73 static IPDF_TextPage* CreateTextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions); 74 static IPDF_TextPage* CreateTextPage(const CPDF_Page* pPage, int flags = 0); 75 static IPDF_TextPage* CreateTextPage(const CPDF_PageObjects* pObjs, int flags = 0); 76 static IPDF_TextPage* CreateReflowTextPage(IPDF_ReflowedPage* pRefPage); 77 78 virtual void NormalizeObjects(FX_BOOL bNormalize) = 0; 79 80 virtual FX_BOOL ParseTextPage() = 0; 81 82 83 virtual FX_BOOL IsParsered() const = 0; 84 public: 85 86 virtual int CharIndexFromTextIndex(int TextIndex) const = 0; 87 88 virtual int TextIndexFromCharIndex(int CharIndex) const = 0; 89 90 91 virtual int CountChars() const = 0; 92 93 virtual void GetCharInfo(int index, FPDF_CHAR_INFO & info) const = 0; 94 95 virtual void GetRectArray(int start, int nCount, CFX_RectArray& rectArray) const = 0; 96 97 98 99 virtual int GetIndexAtPos(CPDF_Point point, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const = 0; 100 101 virtual int GetIndexAtPos(FX_FLOAT x, FX_FLOAT y, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const = 0; 102 103 virtual int GetOrderByDirection(int index, int direction) const = 0; 104 105 virtual CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const = 0; 106 107 virtual void GetRectsArrayByRect(const CFX_FloatRect& rect, CFX_RectArray& resRectArray) const = 0; 108 109 110 virtual int CountRects(int start, int nCount) = 0; 111 112 virtual void GetRect(int rectIndex, FX_FLOAT& left, FX_FLOAT& top, FX_FLOAT& right, FX_FLOAT &bottom) const = 0; 113 114 virtual FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate) = 0; 115 116 virtual FX_BOOL GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate) = 0; 117 118 virtual int CountBoundedSegments(FX_FLOAT left, FX_FLOAT top, FX_FLOAT right, FX_FLOAT bottom, FX_BOOL bContains = FALSE) = 0; 119 120 virtual void GetBoundedSegment(int index, int& start, int& count) const = 0; 121 122 123 virtual int GetWordBreak(int index, int direction) const = 0; 124 125 virtual CFX_WideString GetPageText(int start = 0, int nCount = -1 ) const = 0; 126 }; 127 #define FPDFTEXT_MATCHCASE 0x00000001 128 #define FPDFTEXT_MATCHWHOLEWORD 0x00000002 129 #define FPDFTEXT_CONSECUTIVE 0x00000004 130 class IPDF_TextPageFind 131 { 132 public: 133 ~IPDF_TextPageFind()134 virtual ~IPDF_TextPageFind() {} 135 136 static IPDF_TextPageFind* CreatePageFind(const IPDF_TextPage* pTextPage); 137 public: 138 139 virtual FX_BOOL FindFirst(const CFX_WideString& findwhat, int flags, int startPos = 0) = 0; 140 141 virtual FX_BOOL FindNext() = 0; 142 143 virtual FX_BOOL FindPrev() = 0; 144 145 virtual void GetRectArray(CFX_RectArray& rects) const = 0; 146 147 virtual int GetCurOrder() const = 0; 148 149 virtual int GetMatchedCount() const = 0; 150 }; 151 class IPDF_LinkExtract 152 { 153 public: 154 ~IPDF_LinkExtract()155 virtual ~IPDF_LinkExtract() {} 156 157 static IPDF_LinkExtract* CreateLinkExtract(); 158 159 virtual FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage) = 0; 160 public: 161 162 virtual int CountLinks() const = 0; 163 164 virtual CFX_WideString GetURL(int index) const = 0; 165 166 virtual void GetBoundedSegment(int index, int& start, int& count) const = 0; 167 168 virtual void GetRects(int index, CFX_RectArray& rects) const = 0; 169 }; 170 171 #endif // CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_ 172