1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "public/fpdf_text.h"
8 
9 #include <algorithm>
10 #include <vector>
11 
12 #include "core/fpdfapi/page/cpdf_page.h"
13 #include "core/fpdfdoc/cpdf_viewerpreferences.h"
14 #include "core/fpdftext/cpdf_linkextract.h"
15 #include "core/fpdftext/cpdf_textpage.h"
16 #include "core/fpdftext/cpdf_textpagefind.h"
17 #include "fpdfsdk/fsdk_define.h"
18 #include "third_party/base/numerics/safe_conversions.h"
19 #include "third_party/base/stl_util.h"
20 
21 #ifdef PDF_ENABLE_XFA
22 #include "fpdfsdk/fpdfxfa/cpdfxfa_context.h"
23 #include "fpdfsdk/fpdfxfa/cpdfxfa_page.h"
24 #endif  // PDF_ENABLE_XFA
25 
26 #ifdef _WIN32
27 #include <tchar.h>
28 #endif
29 
30 namespace {
31 
32 constexpr size_t kBytesPerCharacter = sizeof(unsigned short);
33 
CPDFTextPageFromFPDFTextPage(FPDF_TEXTPAGE text_page)34 CPDF_TextPage* CPDFTextPageFromFPDFTextPage(FPDF_TEXTPAGE text_page) {
35   return static_cast<CPDF_TextPage*>(text_page);
36 }
37 
CPDFTextPageFindFromFPDFSchHandle(FPDF_SCHHANDLE handle)38 CPDF_TextPageFind* CPDFTextPageFindFromFPDFSchHandle(FPDF_SCHHANDLE handle) {
39   return static_cast<CPDF_TextPageFind*>(handle);
40 }
41 
CPDFLinkExtractFromFPDFPageLink(FPDF_PAGELINK link)42 CPDF_LinkExtract* CPDFLinkExtractFromFPDFPageLink(FPDF_PAGELINK link) {
43   return static_cast<CPDF_LinkExtract*>(link);
44 }
45 
46 }  // namespace
47 
FPDFText_LoadPage(FPDF_PAGE page)48 FPDF_EXPORT FPDF_TEXTPAGE FPDF_CALLCONV FPDFText_LoadPage(FPDF_PAGE page) {
49   CPDF_Page* pPDFPage = CPDFPageFromFPDFPage(page);
50   if (!pPDFPage)
51     return nullptr;
52 
53 #ifdef PDF_ENABLE_XFA
54   CPDFXFA_Page* pPage = (CPDFXFA_Page*)page;
55   CPDFXFA_Context* pContext = pPage->GetContext();
56   CPDF_ViewerPreferences viewRef(pContext->GetPDFDoc());
57 #else  // PDF_ENABLE_XFA
58   CPDF_ViewerPreferences viewRef(pPDFPage->m_pDocument.Get());
59 #endif  // PDF_ENABLE_XFA
60 
61   CPDF_TextPage* textpage = new CPDF_TextPage(
62       pPDFPage, viewRef.IsDirectionR2L() ? FPDFText_Direction::Right
63                                          : FPDFText_Direction::Left);
64   textpage->ParseTextPage();
65   return textpage;
66 }
67 
FPDFText_ClosePage(FPDF_TEXTPAGE text_page)68 FPDF_EXPORT void FPDF_CALLCONV FPDFText_ClosePage(FPDF_TEXTPAGE text_page) {
69   delete CPDFTextPageFromFPDFTextPage(text_page);
70 }
71 
FPDFText_CountChars(FPDF_TEXTPAGE text_page)72 FPDF_EXPORT int FPDF_CALLCONV FPDFText_CountChars(FPDF_TEXTPAGE text_page) {
73   if (!text_page)
74     return -1;
75 
76   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
77   return textpage->CountChars();
78 }
79 
80 FPDF_EXPORT unsigned int FPDF_CALLCONV
FPDFText_GetUnicode(FPDF_TEXTPAGE text_page,int index)81 FPDFText_GetUnicode(FPDF_TEXTPAGE text_page, int index) {
82   if (!text_page)
83     return 0;
84 
85   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
86   if (index < 0 || index >= textpage->CountChars())
87     return 0;
88 
89   FPDF_CHAR_INFO charinfo;
90   textpage->GetCharInfo(index, &charinfo);
91   return charinfo.m_Unicode;
92 }
93 
FPDFText_GetFontSize(FPDF_TEXTPAGE text_page,int index)94 FPDF_EXPORT double FPDF_CALLCONV FPDFText_GetFontSize(FPDF_TEXTPAGE text_page,
95                                                       int index) {
96   if (!text_page)
97     return 0;
98   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
99 
100   if (index < 0 || index >= textpage->CountChars())
101     return 0;
102 
103   FPDF_CHAR_INFO charinfo;
104   textpage->GetCharInfo(index, &charinfo);
105   return charinfo.m_FontSize;
106 }
107 
FPDFText_GetCharBox(FPDF_TEXTPAGE text_page,int index,double * left,double * right,double * bottom,double * top)108 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_GetCharBox(FPDF_TEXTPAGE text_page,
109                                                         int index,
110                                                         double* left,
111                                                         double* right,
112                                                         double* bottom,
113                                                         double* top) {
114   if (!text_page || index < 0)
115     return false;
116 
117   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
118   if (index >= textpage->CountChars())
119     return false;
120 
121   FPDF_CHAR_INFO charinfo;
122   textpage->GetCharInfo(index, &charinfo);
123   *left = charinfo.m_CharBox.left;
124   *right = charinfo.m_CharBox.right;
125   *bottom = charinfo.m_CharBox.bottom;
126   *top = charinfo.m_CharBox.top;
127   return true;
128 }
129 
130 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
FPDFText_GetCharOrigin(FPDF_TEXTPAGE text_page,int index,double * x,double * y)131 FPDFText_GetCharOrigin(FPDF_TEXTPAGE text_page,
132                        int index,
133                        double* x,
134                        double* y) {
135   if (!text_page)
136     return false;
137   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
138 
139   if (index < 0 || index >= textpage->CountChars())
140     return false;
141   FPDF_CHAR_INFO charinfo;
142   textpage->GetCharInfo(index, &charinfo);
143   *x = charinfo.m_Origin.x;
144   *y = charinfo.m_Origin.y;
145   return true;
146 }
147 
148 // select
149 FPDF_EXPORT int FPDF_CALLCONV
FPDFText_GetCharIndexAtPos(FPDF_TEXTPAGE text_page,double x,double y,double xTolerance,double yTolerance)150 FPDFText_GetCharIndexAtPos(FPDF_TEXTPAGE text_page,
151                            double x,
152                            double y,
153                            double xTolerance,
154                            double yTolerance) {
155   if (!text_page)
156     return -3;
157 
158   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
159   return textpage->GetIndexAtPos(
160       CFX_PointF(static_cast<float>(x), static_cast<float>(y)),
161       CFX_SizeF(static_cast<float>(xTolerance),
162                 static_cast<float>(yTolerance)));
163 }
164 
FPDFText_GetText(FPDF_TEXTPAGE page,int char_start,int char_count,unsigned short * result)165 FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetText(FPDF_TEXTPAGE page,
166                                                int char_start,
167                                                int char_count,
168                                                unsigned short* result) {
169   if (!page || char_start < 0 || char_count < 0 || !result)
170     return 0;
171 
172   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(page);
173   int char_available = textpage->CountChars() - char_start;
174   if (char_available <= 0)
175     return 0;
176 
177   char_count = std::min(char_count, char_available);
178   if (char_count == 0) {
179     // Writing out "", which has a character count of 1 due to the NUL.
180     *result = '\0';
181     return 1;
182   }
183 
184   WideString str = textpage->GetPageText(char_start, char_count);
185 
186   if (str.GetLength() > static_cast<size_t>(char_count))
187     str = str.Left(static_cast<size_t>(char_count));
188 
189   // UFT16LE_Encode doesn't handle surrogate pairs properly, so it is expected
190   // the number of items to stay the same.
191   ByteString byte_str = str.UTF16LE_Encode();
192   size_t byte_str_len = byte_str.GetLength();
193   int ret_count = byte_str_len / kBytesPerCharacter;
194 
195   ASSERT(ret_count <= char_count + 1);  // +1 to account for the NUL terminator.
196   memcpy(result, byte_str.GetBuffer(byte_str_len), byte_str_len);
197   return ret_count;
198 }
199 
FPDFText_CountRects(FPDF_TEXTPAGE text_page,int start,int count)200 FPDF_EXPORT int FPDF_CALLCONV FPDFText_CountRects(FPDF_TEXTPAGE text_page,
201                                                   int start,
202                                                   int count) {
203   if (!text_page)
204     return 0;
205 
206   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
207   return textpage->CountRects(start, count);
208 }
209 
FPDFText_GetRect(FPDF_TEXTPAGE text_page,int rect_index,double * left,double * top,double * right,double * bottom)210 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_GetRect(FPDF_TEXTPAGE text_page,
211                                                      int rect_index,
212                                                      double* left,
213                                                      double* top,
214                                                      double* right,
215                                                      double* bottom) {
216   if (!text_page)
217     return false;
218 
219   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
220   CFX_FloatRect rect;
221   bool result = textpage->GetRect(rect_index, &rect);
222 
223   *left = rect.left;
224   *top = rect.top;
225   *right = rect.right;
226   *bottom = rect.bottom;
227   return result;
228 }
229 
FPDFText_GetBoundedText(FPDF_TEXTPAGE text_page,double left,double top,double right,double bottom,unsigned short * buffer,int buflen)230 FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetBoundedText(FPDF_TEXTPAGE text_page,
231                                                       double left,
232                                                       double top,
233                                                       double right,
234                                                       double bottom,
235                                                       unsigned short* buffer,
236                                                       int buflen) {
237   if (!text_page)
238     return 0;
239 
240   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
241   CFX_FloatRect rect((float)left, (float)bottom, (float)right, (float)top);
242   WideString str = textpage->GetTextByRect(rect);
243 
244   if (buflen <= 0 || !buffer)
245     return str.GetLength();
246 
247   ByteString cbUTF16Str = str.UTF16LE_Encode();
248   int len = cbUTF16Str.GetLength() / sizeof(unsigned short);
249   int size = buflen > len ? len : buflen;
250   memcpy(buffer, cbUTF16Str.GetBuffer(size * sizeof(unsigned short)),
251          size * sizeof(unsigned short));
252   cbUTF16Str.ReleaseBuffer(size * sizeof(unsigned short));
253 
254   return size;
255 }
256 
257 // Search
258 // -1 for end
259 FPDF_EXPORT FPDF_SCHHANDLE FPDF_CALLCONV
FPDFText_FindStart(FPDF_TEXTPAGE text_page,FPDF_WIDESTRING findwhat,unsigned long flags,int start_index)260 FPDFText_FindStart(FPDF_TEXTPAGE text_page,
261                    FPDF_WIDESTRING findwhat,
262                    unsigned long flags,
263                    int start_index) {
264   if (!text_page)
265     return nullptr;
266 
267   CPDF_TextPageFind* textpageFind =
268       new CPDF_TextPageFind(CPDFTextPageFromFPDFTextPage(text_page));
269   size_t len = WideString::WStringLength(findwhat);
270   textpageFind->FindFirst(
271       WideString::FromUTF16LE(findwhat, len), flags,
272       start_index >= 0 ? Optional<size_t>(start_index) : Optional<size_t>());
273   return textpageFind;
274 }
275 
FPDFText_FindNext(FPDF_SCHHANDLE handle)276 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_FindNext(FPDF_SCHHANDLE handle) {
277   if (!handle)
278     return false;
279 
280   CPDF_TextPageFind* textpageFind = CPDFTextPageFindFromFPDFSchHandle(handle);
281   return textpageFind->FindNext();
282 }
283 
FPDFText_FindPrev(FPDF_SCHHANDLE handle)284 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_FindPrev(FPDF_SCHHANDLE handle) {
285   if (!handle)
286     return false;
287 
288   CPDF_TextPageFind* textpageFind = CPDFTextPageFindFromFPDFSchHandle(handle);
289   return textpageFind->FindPrev();
290 }
291 
292 FPDF_EXPORT int FPDF_CALLCONV
FPDFText_GetSchResultIndex(FPDF_SCHHANDLE handle)293 FPDFText_GetSchResultIndex(FPDF_SCHHANDLE handle) {
294   if (!handle)
295     return 0;
296 
297   CPDF_TextPageFind* textpageFind = CPDFTextPageFindFromFPDFSchHandle(handle);
298   return textpageFind->GetCurOrder();
299 }
300 
FPDFText_GetSchCount(FPDF_SCHHANDLE handle)301 FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetSchCount(FPDF_SCHHANDLE handle) {
302   if (!handle)
303     return 0;
304 
305   CPDF_TextPageFind* textpageFind = CPDFTextPageFindFromFPDFSchHandle(handle);
306   return textpageFind->GetMatchedCount();
307 }
308 
FPDFText_FindClose(FPDF_SCHHANDLE handle)309 FPDF_EXPORT void FPDF_CALLCONV FPDFText_FindClose(FPDF_SCHHANDLE handle) {
310   if (!handle)
311     return;
312 
313   CPDF_TextPageFind* textpageFind = CPDFTextPageFindFromFPDFSchHandle(handle);
314   delete textpageFind;
315   handle = nullptr;
316 }
317 
318 // web link
319 FPDF_EXPORT FPDF_PAGELINK FPDF_CALLCONV
FPDFLink_LoadWebLinks(FPDF_TEXTPAGE text_page)320 FPDFLink_LoadWebLinks(FPDF_TEXTPAGE text_page) {
321   if (!text_page)
322     return nullptr;
323 
324   CPDF_LinkExtract* pageLink =
325       new CPDF_LinkExtract(CPDFTextPageFromFPDFTextPage(text_page));
326   pageLink->ExtractLinks();
327   return pageLink;
328 }
329 
FPDFLink_CountWebLinks(FPDF_PAGELINK link_page)330 FPDF_EXPORT int FPDF_CALLCONV FPDFLink_CountWebLinks(FPDF_PAGELINK link_page) {
331   if (!link_page)
332     return 0;
333 
334   CPDF_LinkExtract* pageLink = CPDFLinkExtractFromFPDFPageLink(link_page);
335   return pdfium::base::checked_cast<int>(pageLink->CountLinks());
336 }
337 
FPDFLink_GetURL(FPDF_PAGELINK link_page,int link_index,unsigned short * buffer,int buflen)338 FPDF_EXPORT int FPDF_CALLCONV FPDFLink_GetURL(FPDF_PAGELINK link_page,
339                                               int link_index,
340                                               unsigned short* buffer,
341                                               int buflen) {
342   WideString wsUrl(L"");
343   if (link_page && link_index >= 0) {
344     CPDF_LinkExtract* pageLink = CPDFLinkExtractFromFPDFPageLink(link_page);
345     wsUrl = pageLink->GetURL(link_index);
346   }
347   ByteString cbUTF16URL = wsUrl.UTF16LE_Encode();
348   int required = cbUTF16URL.GetLength() / sizeof(unsigned short);
349   if (!buffer || buflen <= 0)
350     return required;
351 
352   int size = std::min(required, buflen);
353   if (size > 0) {
354     int buf_size = size * sizeof(unsigned short);
355     memcpy(buffer, cbUTF16URL.GetBuffer(buf_size), buf_size);
356   }
357   return size;
358 }
359 
FPDFLink_CountRects(FPDF_PAGELINK link_page,int link_index)360 FPDF_EXPORT int FPDF_CALLCONV FPDFLink_CountRects(FPDF_PAGELINK link_page,
361                                                   int link_index) {
362   if (!link_page || link_index < 0)
363     return 0;
364 
365   CPDF_LinkExtract* pageLink = CPDFLinkExtractFromFPDFPageLink(link_page);
366   return pdfium::CollectionSize<int>(pageLink->GetRects(link_index));
367 }
368 
FPDFLink_GetRect(FPDF_PAGELINK link_page,int link_index,int rect_index,double * left,double * top,double * right,double * bottom)369 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFLink_GetRect(FPDF_PAGELINK link_page,
370                                                      int link_index,
371                                                      int rect_index,
372                                                      double* left,
373                                                      double* top,
374                                                      double* right,
375                                                      double* bottom) {
376   if (!link_page || link_index < 0 || rect_index < 0)
377     return false;
378 
379   CPDF_LinkExtract* pageLink = CPDFLinkExtractFromFPDFPageLink(link_page);
380   std::vector<CFX_FloatRect> rectArray = pageLink->GetRects(link_index);
381   if (rect_index >= pdfium::CollectionSize<int>(rectArray))
382     return false;
383 
384   *left = rectArray[rect_index].left;
385   *right = rectArray[rect_index].right;
386   *top = rectArray[rect_index].top;
387   *bottom = rectArray[rect_index].bottom;
388   return true;
389 }
390 
FPDFLink_CloseWebLinks(FPDF_PAGELINK link_page)391 FPDF_EXPORT void FPDF_CALLCONV FPDFLink_CloseWebLinks(FPDF_PAGELINK link_page) {
392   delete CPDFLinkExtractFromFPDFPageLink(link_page);
393 }
394