1 // Copyright 2015 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include <memory>
6 
7 #include "core/fxcrt/fx_memory.h"
8 #include "public/fpdf_text.h"
9 #include "public/fpdfview.h"
10 #include "testing/embedder_test.h"
11 #include "testing/gtest/include/gtest/gtest.h"
12 #include "testing/test_support.h"
13 
14 namespace {
15 
// Compares the first |length| elements of |actual| against the bytes of the
// NUL-terminated string |expected|.
//
// |length| may cover the terminating NUL of |expected|, so any value up to
// strlen(expected) + 1 is accepted; a larger |length| would read past the end
// of |expected| and is rejected. Null pointers are treated as a mismatch.
// Returns true only if every compared element matches.
bool check_unsigned_shorts(const char* expected,
                           const unsigned short* actual,
                           size_t length) {
  if (!expected || !actual)
    return false;

  if (length > strlen(expected) + 1)
    return false;

  for (size_t i = 0; i < length; ++i) {
    // Widen through unsigned char so bytes >= 0x80 are not sign-extended on
    // platforms where plain char is signed.
    const unsigned short want =
        static_cast<unsigned short>(static_cast<unsigned char>(expected[i]));
    if (actual[i] != want)
      return false;
  }
  return true;
}
28 
29 }  // namespace
30 
31 class FPDFTextEmbeddertest : public EmbedderTest {};
32 
TEST_F(FPDFTextEmbeddertest,Text)33 TEST_F(FPDFTextEmbeddertest, Text) {
34   EXPECT_TRUE(OpenDocument("hello_world.pdf"));
35   FPDF_PAGE page = LoadPage(0);
36   EXPECT_TRUE(page);
37 
38   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
39   EXPECT_TRUE(textpage);
40 
41   static const char expected[] = "Hello, world!\r\nGoodbye, world!";
42   unsigned short fixed_buffer[128];
43   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
44 
45   // Check that edge cases are handled gracefully
46   EXPECT_EQ(0, FPDFText_GetText(textpage, 0, 128, nullptr));
47   EXPECT_EQ(0, FPDFText_GetText(textpage, -1, 128, fixed_buffer));
48   EXPECT_EQ(0, FPDFText_GetText(textpage, 0, -1, fixed_buffer));
49   EXPECT_EQ(1, FPDFText_GetText(textpage, 0, 0, fixed_buffer));
50   EXPECT_EQ(0, fixed_buffer[0]);
51 
52   // Keep going and check the next case.
53   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
54   EXPECT_EQ(2, FPDFText_GetText(textpage, 0, 1, fixed_buffer));
55   EXPECT_EQ(expected[0], fixed_buffer[0]);
56   EXPECT_EQ(0, fixed_buffer[1]);
57 
58   // Check includes the terminating NUL that is provided.
59   int num_chars = FPDFText_GetText(textpage, 0, 128, fixed_buffer);
60   ASSERT_GE(num_chars, 0);
61   EXPECT_EQ(sizeof(expected), static_cast<size_t>(num_chars));
62   EXPECT_TRUE(check_unsigned_shorts(expected, fixed_buffer, sizeof(expected)));
63 
64   // Count does not include the terminating NUL in the string literal.
65   EXPECT_EQ(sizeof(expected) - 1,
66             static_cast<size_t>(FPDFText_CountChars(textpage)));
67   for (size_t i = 0; i < sizeof(expected) - 1; ++i) {
68     EXPECT_EQ(static_cast<unsigned int>(expected[i]),
69               FPDFText_GetUnicode(textpage, i))
70         << " at " << i;
71   }
72 
73   // Extracting using a buffer that will be completely filled. Small buffer is
74   // 12 elements long, since it will need 2 locations per displayed character in
75   // the expected string, plus 2 more for the terminating character.
76   static const char small_expected[] = "Hello";
77   unsigned short small_buffer[12];
78   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
79   EXPECT_EQ(6, FPDFText_GetText(textpage, 0, 5, small_buffer));
80   EXPECT_TRUE(check_unsigned_shorts(small_expected, small_buffer,
81                                     sizeof(small_expected)));
82 
83   EXPECT_EQ(12.0, FPDFText_GetFontSize(textpage, 0));
84   EXPECT_EQ(16.0, FPDFText_GetFontSize(textpage, 15));
85 
86   double left = 0.0;
87   double right = 0.0;
88   double bottom = 0.0;
89   double top = 0.0;
90   EXPECT_FALSE(FPDFText_GetCharBox(nullptr, 4, &left, &right, &bottom, &top));
91   EXPECT_DOUBLE_EQ(0.0, left);
92   EXPECT_DOUBLE_EQ(0.0, right);
93   EXPECT_DOUBLE_EQ(0.0, bottom);
94   EXPECT_DOUBLE_EQ(0.0, top);
95   EXPECT_FALSE(FPDFText_GetCharBox(textpage, -1, &left, &right, &bottom, &top));
96   EXPECT_DOUBLE_EQ(0.0, left);
97   EXPECT_DOUBLE_EQ(0.0, right);
98   EXPECT_DOUBLE_EQ(0.0, bottom);
99   EXPECT_DOUBLE_EQ(0.0, top);
100   EXPECT_FALSE(FPDFText_GetCharBox(textpage, 55, &left, &right, &bottom, &top));
101   EXPECT_DOUBLE_EQ(0.0, left);
102   EXPECT_DOUBLE_EQ(0.0, right);
103   EXPECT_DOUBLE_EQ(0.0, bottom);
104   EXPECT_DOUBLE_EQ(0.0, top);
105 
106   EXPECT_TRUE(FPDFText_GetCharBox(textpage, 4, &left, &right, &bottom, &top));
107   EXPECT_NEAR(41.071, left, 0.001);
108   EXPECT_NEAR(46.243, right, 0.001);
109   EXPECT_NEAR(49.844, bottom, 0.001);
110   EXPECT_NEAR(55.520, top, 0.001);
111 
112   double x = 0.0;
113   double y = 0.0;
114   EXPECT_TRUE(FPDFText_GetCharOrigin(textpage, 4, &x, &y));
115   EXPECT_NEAR(40.664, x, 0.001);
116   EXPECT_NEAR(50.000, y, 0.001);
117 
118   EXPECT_EQ(4, FPDFText_GetCharIndexAtPos(textpage, 42.0, 50.0, 1.0, 1.0));
119   EXPECT_EQ(-1, FPDFText_GetCharIndexAtPos(textpage, 0.0, 0.0, 1.0, 1.0));
120   EXPECT_EQ(-1, FPDFText_GetCharIndexAtPos(textpage, 199.0, 199.0, 1.0, 1.0));
121 
122   // Test out of range indicies.
123   EXPECT_EQ(-1,
124             FPDFText_GetCharIndexAtPos(textpage, 42.0, 10000000.0, 1.0, 1.0));
125   EXPECT_EQ(-1, FPDFText_GetCharIndexAtPos(textpage, -1.0, 50.0, 1.0, 1.0));
126 
127   // Count does not include the terminating NUL in the string literal.
128   EXPECT_EQ(2, FPDFText_CountRects(textpage, 0, sizeof(expected) - 1));
129 
130   left = 0.0;
131   right = 0.0;
132   bottom = 0.0;
133   top = 0.0;
134   EXPECT_TRUE(FPDFText_GetRect(textpage, 1, &left, &top, &right, &bottom));
135   EXPECT_NEAR(20.847, left, 0.001);
136   EXPECT_NEAR(135.167, right, 0.001);
137   EXPECT_NEAR(96.655, bottom, 0.001);
138   EXPECT_NEAR(116.000, top, 0.001);
139 
140   // Test out of range indicies set outputs to (0.0, 0.0, 0.0, 0.0).
141   left = -1.0;
142   right = -1.0;
143   bottom = -1.0;
144   top = -1.0;
145   EXPECT_FALSE(FPDFText_GetRect(textpage, -1, &left, &top, &right, &bottom));
146   EXPECT_EQ(0.0, left);
147   EXPECT_EQ(0.0, right);
148   EXPECT_EQ(0.0, bottom);
149   EXPECT_EQ(0.0, top);
150 
151   left = -2.0;
152   right = -2.0;
153   bottom = -2.0;
154   top = -2.0;
155   EXPECT_FALSE(FPDFText_GetRect(textpage, 2, &left, &top, &right, &bottom));
156   EXPECT_EQ(0.0, left);
157   EXPECT_EQ(0.0, right);
158   EXPECT_EQ(0.0, bottom);
159   EXPECT_EQ(0.0, top);
160 
161   EXPECT_EQ(9, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0, 0, 0));
162 
163   // Extract starting at character 4 as above.
164   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
165   EXPECT_EQ(1, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0,
166                                        fixed_buffer, 1));
167   EXPECT_TRUE(check_unsigned_shorts(expected + 4, fixed_buffer, 1));
168   EXPECT_EQ(0xbdbd, fixed_buffer[1]);
169 
170   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
171   EXPECT_EQ(9, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0,
172                                        fixed_buffer, 9));
173   EXPECT_TRUE(check_unsigned_shorts(expected + 4, fixed_buffer, 9));
174   EXPECT_EQ(0xbdbd, fixed_buffer[9]);
175 
176   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
177   EXPECT_EQ(10, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0,
178                                         fixed_buffer, 128));
179   EXPECT_TRUE(check_unsigned_shorts(expected + 4, fixed_buffer, 9));
180   EXPECT_EQ(0u, fixed_buffer[9]);
181   EXPECT_EQ(0xbdbd, fixed_buffer[10]);
182 
183   FPDFText_ClosePage(textpage);
184   UnloadPage(page);
185 }
186 
TEST_F(FPDFTextEmbeddertest,TextSearch)187 TEST_F(FPDFTextEmbeddertest, TextSearch) {
188   EXPECT_TRUE(OpenDocument("hello_world.pdf"));
189   FPDF_PAGE page = LoadPage(0);
190   EXPECT_TRUE(page);
191 
192   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
193   EXPECT_TRUE(textpage);
194 
195   std::unique_ptr<unsigned short, pdfium::FreeDeleter> nope =
196       GetFPDFWideString(L"nope");
197   std::unique_ptr<unsigned short, pdfium::FreeDeleter> world =
198       GetFPDFWideString(L"world");
199   std::unique_ptr<unsigned short, pdfium::FreeDeleter> world_caps =
200       GetFPDFWideString(L"WORLD");
201   std::unique_ptr<unsigned short, pdfium::FreeDeleter> world_substr =
202       GetFPDFWideString(L"orld");
203 
204   // No occurences of "nope" in test page.
205   FPDF_SCHHANDLE search = FPDFText_FindStart(textpage, nope.get(), 0, 0);
206   EXPECT_TRUE(search);
207   EXPECT_EQ(0, FPDFText_GetSchResultIndex(search));
208   EXPECT_EQ(0, FPDFText_GetSchCount(search));
209 
210   // Advancing finds nothing.
211   EXPECT_FALSE(FPDFText_FindNext(search));
212   EXPECT_EQ(0, FPDFText_GetSchResultIndex(search));
213   EXPECT_EQ(0, FPDFText_GetSchCount(search));
214 
215   // Retreating finds nothing.
216   EXPECT_FALSE(FPDFText_FindPrev(search));
217   EXPECT_EQ(0, FPDFText_GetSchResultIndex(search));
218   EXPECT_EQ(0, FPDFText_GetSchCount(search));
219   FPDFText_FindClose(search);
220 
221   // Two occurences of "world" in test page.
222   search = FPDFText_FindStart(textpage, world.get(), 0, 2);
223   EXPECT_TRUE(search);
224 
225   // Remains not found until advanced.
226   EXPECT_EQ(0, FPDFText_GetSchResultIndex(search));
227   EXPECT_EQ(0, FPDFText_GetSchCount(search));
228 
229   // First occurence of "world" in this test page.
230   EXPECT_TRUE(FPDFText_FindNext(search));
231   EXPECT_EQ(7, FPDFText_GetSchResultIndex(search));
232   EXPECT_EQ(5, FPDFText_GetSchCount(search));
233 
234   // Last occurence of "world" in this test page.
235   EXPECT_TRUE(FPDFText_FindNext(search));
236   EXPECT_EQ(24, FPDFText_GetSchResultIndex(search));
237   EXPECT_EQ(5, FPDFText_GetSchCount(search));
238 
239   // Found position unchanged when fails to advance.
240   EXPECT_FALSE(FPDFText_FindNext(search));
241   EXPECT_EQ(24, FPDFText_GetSchResultIndex(search));
242   EXPECT_EQ(5, FPDFText_GetSchCount(search));
243 
244   // Back to first occurence.
245   EXPECT_TRUE(FPDFText_FindPrev(search));
246   EXPECT_EQ(7, FPDFText_GetSchResultIndex(search));
247   EXPECT_EQ(5, FPDFText_GetSchCount(search));
248 
249   // Found position unchanged when fails to retreat.
250   EXPECT_FALSE(FPDFText_FindPrev(search));
251   EXPECT_EQ(7, FPDFText_GetSchResultIndex(search));
252   EXPECT_EQ(5, FPDFText_GetSchCount(search));
253   FPDFText_FindClose(search);
254 
255   // Exact search unaffected by case sensitiity and whole word flags.
256   search = FPDFText_FindStart(textpage, world.get(),
257                               FPDF_MATCHCASE | FPDF_MATCHWHOLEWORD, 0);
258   EXPECT_TRUE(search);
259   EXPECT_TRUE(FPDFText_FindNext(search));
260   EXPECT_EQ(7, FPDFText_GetSchResultIndex(search));
261   EXPECT_EQ(5, FPDFText_GetSchCount(search));
262   FPDFText_FindClose(search);
263 
264   // Default is case-insensitive, so matching agaist caps works.
265   search = FPDFText_FindStart(textpage, world_caps.get(), 0, 0);
266   EXPECT_TRUE(search);
267   EXPECT_TRUE(FPDFText_FindNext(search));
268   EXPECT_EQ(7, FPDFText_GetSchResultIndex(search));
269   EXPECT_EQ(5, FPDFText_GetSchCount(search));
270   FPDFText_FindClose(search);
271 
272   // But can be made case sensitive, in which case this fails.
273   search = FPDFText_FindStart(textpage, world_caps.get(), FPDF_MATCHCASE, 0);
274   EXPECT_FALSE(FPDFText_FindNext(search));
275   EXPECT_EQ(0, FPDFText_GetSchResultIndex(search));
276   EXPECT_EQ(0, FPDFText_GetSchCount(search));
277   FPDFText_FindClose(search);
278 
279   // Default is match anywhere within word, so matching substirng works.
280   search = FPDFText_FindStart(textpage, world_substr.get(), 0, 0);
281   EXPECT_TRUE(FPDFText_FindNext(search));
282   EXPECT_EQ(8, FPDFText_GetSchResultIndex(search));
283   EXPECT_EQ(4, FPDFText_GetSchCount(search));
284   FPDFText_FindClose(search);
285 
286   // But can be made to mach word boundaries, in which case this fails.
287   search =
288       FPDFText_FindStart(textpage, world_substr.get(), FPDF_MATCHWHOLEWORD, 0);
289   EXPECT_FALSE(FPDFText_FindNext(search));
290   // TODO(tsepez): investigate strange index/count values in this state.
291   FPDFText_FindClose(search);
292 
293   FPDFText_ClosePage(textpage);
294   UnloadPage(page);
295 }
296 
297 // Test that the page has characters despite a bad stream length.
// Loads bug_57.pdf and checks that its first page reports 13 characters.
TEST_F(FPDFTextEmbeddertest, StreamLengthPastEndOfFile) {
  // Loading failures are fatal: the calls below need valid handles.
  ASSERT_TRUE(OpenDocument("bug_57.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);
  EXPECT_EQ(13, FPDFText_CountChars(textpage));

  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
310 
TEST_F(FPDFTextEmbeddertest,WebLinks)311 TEST_F(FPDFTextEmbeddertest, WebLinks) {
312   EXPECT_TRUE(OpenDocument("weblinks.pdf"));
313   FPDF_PAGE page = LoadPage(0);
314   EXPECT_TRUE(page);
315 
316   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
317   EXPECT_TRUE(textpage);
318 
319   FPDF_PAGELINK pagelink = FPDFLink_LoadWebLinks(textpage);
320   EXPECT_TRUE(pagelink);
321 
322   // Page contains two HTTP-style URLs.
323   EXPECT_EQ(2, FPDFLink_CountWebLinks(pagelink));
324 
325   // Only a terminating NUL required for bogus links.
326   EXPECT_EQ(1, FPDFLink_GetURL(pagelink, 2, nullptr, 0));
327   EXPECT_EQ(1, FPDFLink_GetURL(pagelink, 1400, nullptr, 0));
328   EXPECT_EQ(1, FPDFLink_GetURL(pagelink, -1, nullptr, 0));
329 
330   // Query the number of characters required for each link (incl NUL).
331   EXPECT_EQ(25, FPDFLink_GetURL(pagelink, 0, nullptr, 0));
332   EXPECT_EQ(26, FPDFLink_GetURL(pagelink, 1, nullptr, 0));
333 
334   static const char expected_url[] = "http://example.com?q=foo";
335   static const size_t expected_len = sizeof(expected_url);
336   unsigned short fixed_buffer[128];
337 
338   // Retrieve a link with too small a buffer.  Buffer will not be
339   // NUL-terminated, but must not be modified past indicated length,
340   // so pre-fill with a pattern to check write bounds.
341   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
342   EXPECT_EQ(1, FPDFLink_GetURL(pagelink, 0, fixed_buffer, 1));
343   EXPECT_TRUE(check_unsigned_shorts(expected_url, fixed_buffer, 1));
344   EXPECT_EQ(0xbdbd, fixed_buffer[1]);
345 
346   // Check buffer that doesn't have space for a terminating NUL.
347   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
348   EXPECT_EQ(static_cast<int>(expected_len - 1),
349             FPDFLink_GetURL(pagelink, 0, fixed_buffer, expected_len - 1));
350   EXPECT_TRUE(
351       check_unsigned_shorts(expected_url, fixed_buffer, expected_len - 1));
352   EXPECT_EQ(0xbdbd, fixed_buffer[expected_len - 1]);
353 
354   // Retreive link with exactly-sized buffer.
355   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
356   EXPECT_EQ(static_cast<int>(expected_len),
357             FPDFLink_GetURL(pagelink, 0, fixed_buffer, expected_len));
358   EXPECT_TRUE(check_unsigned_shorts(expected_url, fixed_buffer, expected_len));
359   EXPECT_EQ(0u, fixed_buffer[expected_len - 1]);
360   EXPECT_EQ(0xbdbd, fixed_buffer[expected_len]);
361 
362   // Retreive link with ample-sized-buffer.
363   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
364   EXPECT_EQ(static_cast<int>(expected_len),
365             FPDFLink_GetURL(pagelink, 0, fixed_buffer, 128));
366   EXPECT_TRUE(check_unsigned_shorts(expected_url, fixed_buffer, expected_len));
367   EXPECT_EQ(0u, fixed_buffer[expected_len - 1]);
368   EXPECT_EQ(0xbdbd, fixed_buffer[expected_len]);
369 
370   // Each link rendered in a single rect in this test page.
371   EXPECT_EQ(1, FPDFLink_CountRects(pagelink, 0));
372   EXPECT_EQ(1, FPDFLink_CountRects(pagelink, 1));
373 
374   // Each link rendered in a single rect in this test page.
375   EXPECT_EQ(0, FPDFLink_CountRects(pagelink, -1));
376   EXPECT_EQ(0, FPDFLink_CountRects(pagelink, 2));
377   EXPECT_EQ(0, FPDFLink_CountRects(pagelink, 10000));
378 
379   // Check boundary of valid link index with valid rect index.
380   double left = 0.0;
381   double right = 0.0;
382   double top = 0.0;
383   double bottom = 0.0;
384   EXPECT_TRUE(FPDFLink_GetRect(pagelink, 0, 0, &left, &top, &right, &bottom));
385   EXPECT_NEAR(50.791, left, 0.001);
386   EXPECT_NEAR(187.963, right, 0.001);
387   EXPECT_NEAR(97.624, bottom, 0.001);
388   EXPECT_NEAR(108.736, top, 0.001);
389 
390   // Check that valid link with invalid rect index leaves parameters unchanged.
391   left = -1.0;
392   right = -1.0;
393   top = -1.0;
394   bottom = -1.0;
395   EXPECT_FALSE(FPDFLink_GetRect(pagelink, 0, 1, &left, &top, &right, &bottom));
396   EXPECT_EQ(-1.0, left);
397   EXPECT_EQ(-1.0, right);
398   EXPECT_EQ(-1.0, bottom);
399   EXPECT_EQ(-1.0, top);
400 
401   // Check that invalid link index leaves parameters unchanged.
402   left = -2.0;
403   right = -2.0;
404   top = -2.0;
405   bottom = -2.0;
406   EXPECT_FALSE(FPDFLink_GetRect(pagelink, -1, 0, &left, &top, &right, &bottom));
407   EXPECT_EQ(-2.0, left);
408   EXPECT_EQ(-2.0, right);
409   EXPECT_EQ(-2.0, bottom);
410   EXPECT_EQ(-2.0, top);
411 
412   FPDFLink_CloseWebLinks(pagelink);
413   FPDFText_ClosePage(textpage);
414   UnloadPage(page);
415 }
416 
// Checks the URLs extracted from weblinks_across_lines.pdf, where the source
// text of each link is split by line breaks.
TEST_F(FPDFTextEmbeddertest, WebLinksAcrossLines) {
  // Loading failures are fatal: every call below needs valid handles.
  ASSERT_TRUE(OpenDocument("weblinks_across_lines.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);

  FPDF_PAGELINK pagelink = FPDFLink_LoadWebLinks(textpage);
  ASSERT_TRUE(pagelink);

  static const char* const kExpectedUrls[] = {
      "http://example.com",           // from "http://www.example.com?\r\nfoo"
      "http://example.com/",          // from "http://www.example.com/\r\nfoo"
      "http://example.com/test-foo",  // from "http://example.com/test-\r\nfoo"
      "http://abc.com/test-foo",      // from "http://abc.com/test-\r\n\r\nfoo"
      // Next two links from "http://www.example.com/\r\nhttp://www.abc.com/"
      "http://example.com/", "http://www.abc.com",
  };
  static const int kNumLinks = static_cast<int>(FX_ArraySize(kExpectedUrls));

  EXPECT_EQ(kNumLinks, FPDFLink_CountWebLinks(pagelink));

  unsigned short fixed_buffer[128];
  for (int i = 0; i < kNumLinks; i++) {
    const size_t expected_len = strlen(kExpectedUrls[i]) + 1;
    // memset() takes a byte count, so clear with sizeof rather than the
    // element count; otherwise only half of the buffer is zeroed.
    memset(fixed_buffer, 0, sizeof(fixed_buffer));
    EXPECT_EQ(static_cast<int>(expected_len),
              FPDFLink_GetURL(pagelink, i, nullptr, 0));
    EXPECT_EQ(
        static_cast<int>(expected_len),
        FPDFLink_GetURL(pagelink, i, fixed_buffer, FX_ArraySize(fixed_buffer)));
    EXPECT_TRUE(
        check_unsigned_shorts(kExpectedUrls[i], fixed_buffer, expected_len));
  }

  FPDFLink_CloseWebLinks(pagelink);
  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
457 
// Checks that bug_650.pdf yields two web links and that the second one has
// the expected URL.
TEST_F(FPDFTextEmbeddertest, WebLinksAcrossLinesBug) {
  // Loading failures are fatal: every call below needs valid handles.
  ASSERT_TRUE(OpenDocument("bug_650.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);

  FPDF_PAGELINK pagelink = FPDFLink_LoadWebLinks(textpage);
  ASSERT_TRUE(pagelink);

  EXPECT_EQ(2, FPDFLink_CountWebLinks(pagelink));
  unsigned short fixed_buffer[128] = {0};
  static const char kExpectedUrl[] =
      "http://tutorial45.com/learn-autocad-basics-day-166/";
  // Size includes the terminating NUL.
  static const int kUrlSize = static_cast<int>(sizeof(kExpectedUrl));

  EXPECT_EQ(kUrlSize, FPDFLink_GetURL(pagelink, 1, nullptr, 0));
  EXPECT_EQ(kUrlSize, FPDFLink_GetURL(pagelink, 1, fixed_buffer,
                                      FX_ArraySize(fixed_buffer)));
  EXPECT_TRUE(check_unsigned_shorts(kExpectedUrl, fixed_buffer, kUrlSize));

  FPDFLink_CloseWebLinks(pagelink);
  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
484 
// Checks FPDFText_GetFontSize() for every character index of hello_world.pdf
// against a table of expected sizes.
TEST_F(FPDFTextEmbeddertest, GetFontSize) {
  // Loading failures are fatal: every call below needs valid handles.
  ASSERT_TRUE(OpenDocument("hello_world.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);

  const double kExpectedFontsSizes[] = {12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
                                        12, 12, 12, 1,  1,  16, 16, 16, 16, 16,
                                        16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

  // The count must match the table exactly, otherwise the loop below would
  // index past the end of it.
  int count = FPDFText_CountChars(textpage);
  ASSERT_EQ(FX_ArraySize(kExpectedFontsSizes), static_cast<size_t>(count));
  for (int i = 0; i < count; ++i)
    EXPECT_EQ(kExpectedFontsSizes[i], FPDFText_GetFontSize(textpage, i)) << i;

  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
505 
// Checks that bug_583.pdf yields exactly one character and that
// FPDFText_GetUnicode() reports 0 for it.
TEST_F(FPDFTextEmbeddertest, ToUnicode) {
  // Loading failures are fatal: every call below needs valid handles.
  ASSERT_TRUE(OpenDocument("bug_583.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);

  ASSERT_EQ(1, FPDFText_CountChars(textpage));
  EXPECT_EQ(static_cast<unsigned int>(0), FPDFText_GetUnicode(textpage, 0));

  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
520 
// Checks a run of characters in bug_921.pdf starting at index 238, first one
// at a time via FPDFText_GetUnicode() and then in bulk via FPDFText_GetText().
TEST_F(FPDFTextEmbeddertest, Bug_921) {
  // Loading failures are fatal: every call below needs valid handles.
  ASSERT_TRUE(OpenDocument("bug_921.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);

  static constexpr unsigned int kData[] = {
      1095, 1077, 1083, 1086, 1074, 1077, 1095, 1077, 1089, 1082, 1086, 1077,
      32,   1089, 1090, 1088, 1072, 1076, 1072, 1085, 1080, 1077, 46,   32};
  static constexpr int kStartIndex = 238;

  ASSERT_EQ(268, FPDFText_CountChars(textpage));
  for (size_t i = 0; i < FX_ArraySize(kData); ++i) {
    // Keep the index arithmetic in int, which is what the API takes.
    EXPECT_EQ(kData[i],
              FPDFText_GetUnicode(textpage, kStartIndex + static_cast<int>(i)));
  }

  // One extra element for the terminating NUL.
  unsigned short buffer[FX_ArraySize(kData) + 1];
  memset(buffer, 0xbd, sizeof(buffer));
  int count =
      FPDFText_GetText(textpage, kStartIndex, FX_ArraySize(kData), buffer);
  ASSERT_GT(count, 0);
  ASSERT_EQ(FX_ArraySize(kData) + 1, static_cast<size_t>(count));
  for (size_t i = 0; i < FX_ArraySize(kData); ++i)
    EXPECT_EQ(kData[i], buffer[i]);
  EXPECT_EQ(0, buffer[FX_ArraySize(kData)]);

  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
551 
// Checks how FPDFText_GetText() reports soft and hard hyphens in
// bug_781804.pdf.
TEST_F(FPDFTextEmbeddertest, GetTextWithHyphen) {
  // Loading failures are fatal: every call below needs valid handles.
  ASSERT_TRUE(OpenDocument("bug_781804.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);

  // Check that soft hyphens are not included
  // Expecting 'Veritaserum', except there is a \uFFFE where the hyphen was in
  // the original text. This is a weird thing that Adobe does, which we
  // replicate.
  constexpr unsigned short soft_expected[] = {
      0x0056, 0x0065, 0x0072, 0x0069, 0x0074, 0x0061, 0xfffe,
      0x0073, 0x0065, 0x0072, 0x0075, 0x006D, 0x0000};
  {
    constexpr int count = FX_ArraySize(soft_expected) - 1;
    unsigned short buffer[FX_ArraySize(soft_expected)];
    memset(buffer, 0, sizeof(buffer));

    EXPECT_EQ(count + 1, FPDFText_GetText(textpage, 0, count, buffer));
    for (int i = 0; i < count; i++)
      EXPECT_EQ(soft_expected[i], buffer[i]);
  }

  // Check that hard hyphens are included
  {
    // There isn't the \0 in the actual doc, but there is a \r\n, so need to
    // add 1 to get aligned.
    constexpr size_t offset = FX_ArraySize(soft_expected) + 1;
    // Expecting 'User-\r\ngenerated', the - is a unicode character, so cannot
    // store in a char[].
    constexpr unsigned short hard_expected[] = {
        0x0055, 0x0073, 0x0065, 0x0072, 0x2010, 0x000d, 0x000a, 0x0067, 0x0065,
        0x006e, 0x0065, 0x0072, 0x0061, 0x0074, 0x0065, 0x0064, 0x0000};
    constexpr int count = FX_ArraySize(hard_expected) - 1;
    unsigned short buffer[FX_ArraySize(hard_expected)];
    // Zero the buffer as in the soft-hyphen case, so that a failed extraction
    // compares against known contents instead of uninitialized memory.
    memset(buffer, 0, sizeof(buffer));

    EXPECT_EQ(count + 1, FPDFText_GetText(textpage, offset, count, buffer));
    for (int i = 0; i < count; i++)
      EXPECT_EQ(hard_expected[i], buffer[i]);
  }

  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
598 
// Loads and closes the text page of bug_782596.pdf.
TEST_F(FPDFTextEmbeddertest, bug_782596) {
  // If there is a regression in this test, it will only fail under ASAN
  ASSERT_TRUE(OpenDocument("bug_782596.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);
  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  // Report a failed text-page load without skipping the cleanup below.
  EXPECT_TRUE(textpage);
  if (textpage)
    FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
609 
// Checks that text extracted from control_characters.pdf omits the control
// characters, both for the whole page and for a range starting after them.
TEST_F(FPDFTextEmbeddertest, ControlCharacters) {
  // Loading failures are fatal: every call below needs valid handles.
  ASSERT_TRUE(OpenDocument("control_characters.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);

  // Should not include the control characters in the output
  static const char expected[] = "Hello, world!\r\nGoodbye, world!";
  unsigned short fixed_buffer[128];
  memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
  int num_chars = FPDFText_GetText(textpage, 0, 128, fixed_buffer);

  ASSERT_GE(num_chars, 0);
  EXPECT_EQ(sizeof(expected), static_cast<size_t>(num_chars));
  EXPECT_TRUE(check_unsigned_shorts(expected, fixed_buffer, sizeof(expected)));

  // Attempting to get a chunk of text after the control characters
  static const char expected_substring[] = "Goodbye, world!";
  // Offset is the length of 'Hello, world!\r\n' + 2 control characters in the
  // original stream
  static const int offset = 17;
  memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
  num_chars = FPDFText_GetText(textpage, offset, 128, fixed_buffer);

  ASSERT_GE(num_chars, 0);
  EXPECT_EQ(sizeof(expected_substring), static_cast<size_t>(num_chars));
  EXPECT_TRUE(check_unsigned_shorts(expected_substring, fixed_buffer,
                                    sizeof(expected_substring)));

  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
644