1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdftext/cpdf_linkextract.h"
8 
9 #include <vector>
10 
11 #include "core/fpdftext/cpdf_textpage.h"
12 #include "core/fxcrt/fx_extension.h"
13 #include "core/fxcrt/fx_string.h"
14 #include "core/fxcrt/fx_system.h"
15 
16 namespace {
17 
18 // Find the end of a web link starting from offset |start| and ending at offset
19 // |end|. The purpose of this function is to separate url from the surrounding
20 // context characters, we do not intend to fully validate the url. |str|
21 // contains lower case characters only.
FindWebLinkEnding(const WideString & str,size_t start,size_t end)22 size_t FindWebLinkEnding(const WideString& str, size_t start, size_t end) {
23   if (str.Contains(L'/', start)) {
24     // When there is a path and query after '/', most ASCII chars are allowed.
25     // We don't sanitize in this case.
26     return end;
27   }
28 
29   // When there is no path, it only has IP address or host name.
30   // Port is optional at the end.
31   if (str[start] == L'[') {
32     // IPv6 reference.
33     // Find the end of the reference.
34     auto result = str.Find(L']', start + 1);
35     if (result.has_value()) {
36       end = result.value();
37       if (end > start + 1) {  // Has content inside brackets.
38         size_t len = str.GetLength();
39         size_t off = end + 1;
40         if (off < len && str[off] == L':') {
41           off++;
42           while (off < len && str[off] >= L'0' && str[off] <= L'9')
43             off++;
44           if (off > end + 2 &&
45               off <= len)   // At least one digit in port number.
46             end = off - 1;  // |off| is offset of the first invalid char.
47         }
48       }
49     }
50     return end;
51   }
52 
53   // According to RFC1123, host name only has alphanumeric chars, hyphens,
54   // and periods. Hyphen should not at the end though.
55   // Non-ASCII chars are ignored during checking.
56   while (end > start && str[end] < 0x80) {
57     if ((str[end] >= L'0' && str[end] <= L'9') ||
58         (str[end] >= L'a' && str[end] <= L'z') || str[end] == L'.')
59       break;
60     end--;
61   }
62   return end;
63 }
64 
65 // Remove characters from the end of |str|, delimited by |start| and |end|, up
66 // to and including |charToFind|. No-op if |charToFind| is not present. Updates
67 // |end| if characters were removed.
TrimBackwardsToChar(const WideString & str,wchar_t charToFind,size_t start,size_t * end)68 void TrimBackwardsToChar(const WideString& str,
69                          wchar_t charToFind,
70                          size_t start,
71                          size_t* end) {
72   for (size_t pos = *end; pos >= start; pos--) {
73     if (str[pos] == charToFind) {
74       *end = pos - 1;
75       break;
76     }
77   }
78 }
79 
80 // Finds opening brackets ()[]{}<> and quotes "'  before the URL delimited by
81 // |start| and |end| in |str|. Matches a closing bracket or quote for each
82 // opening character and, if present, removes everything afterwards. Returns the
83 // new end position for the string.
TrimExternalBracketsFromWebLink(const WideString & str,size_t start,size_t end)84 size_t TrimExternalBracketsFromWebLink(const WideString& str,
85                                        size_t start,
86                                        size_t end) {
87   for (size_t pos = 0; pos < start; pos++) {
88     if (str[pos] == '(') {
89       TrimBackwardsToChar(str, ')', start, &end);
90     } else if (str[pos] == '[') {
91       TrimBackwardsToChar(str, ']', start, &end);
92     } else if (str[pos] == '{') {
93       TrimBackwardsToChar(str, '}', start, &end);
94     } else if (str[pos] == '<') {
95       TrimBackwardsToChar(str, '>', start, &end);
96     } else if (str[pos] == '"') {
97       TrimBackwardsToChar(str, '"', start, &end);
98     } else if (str[pos] == '\'') {
99       TrimBackwardsToChar(str, '\'', start, &end);
100     }
101   }
102   return end;
103 }
104 
105 }  // namespace
106 
CPDF_LinkExtract(const CPDF_TextPage * pTextPage)107 CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
108     : m_pTextPage(pTextPage) {}
109 
~CPDF_LinkExtract()110 CPDF_LinkExtract::~CPDF_LinkExtract() {}
111 
ExtractLinks()112 void CPDF_LinkExtract::ExtractLinks() {
113   m_LinkArray.clear();
114   if (!m_pTextPage->IsParsed())
115     return;
116 
117   m_strPageText = m_pTextPage->GetAllPageText();
118   if (m_strPageText.IsEmpty())
119     return;
120 
121   ParseLink();
122 }
123 
ParseLink()124 void CPDF_LinkExtract::ParseLink() {
125   int start = 0;
126   int pos = 0;
127   int nTotalChar = m_pTextPage->CountChars();
128   bool bAfterHyphen = false;
129   bool bLineBreak = false;
130   while (pos < nTotalChar) {
131     FPDF_CHAR_INFO pageChar;
132     m_pTextPage->GetCharInfo(pos, &pageChar);
133     if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED ||
134         pageChar.m_Unicode == TEXT_SPACE_CHAR || pos == nTotalChar - 1) {
135       int nCount = pos - start;
136       if (pos == nTotalChar - 1) {
137         nCount++;
138       } else if (bAfterHyphen && (pageChar.m_Unicode == TEXT_LINEFEED_CHAR ||
139                                   pageChar.m_Unicode == TEXT_RETURN_CHAR)) {
140         // Handle text breaks with a hyphen to the next line.
141         bLineBreak = true;
142         pos++;
143         continue;
144       }
145       WideString strBeCheck;
146       strBeCheck = m_pTextPage->GetPageText(start, nCount);
147       if (bLineBreak) {
148         strBeCheck.Remove(TEXT_LINEFEED_CHAR);
149         strBeCheck.Remove(TEXT_RETURN_CHAR);
150         bLineBreak = false;
151       }
152       // Replace the generated code with the hyphen char.
153       strBeCheck.Replace(L"\xfffe", TEXT_HYPHEN);
154 
155       if (strBeCheck.GetLength() > 5) {
156         while (strBeCheck.GetLength() > 0) {
157           wchar_t ch = strBeCheck[strBeCheck.GetLength() - 1];
158           if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {
159             strBeCheck = strBeCheck.Left(strBeCheck.GetLength() - 1);
160             nCount--;
161           } else {
162             break;
163           }
164         }
165         // Check for potential web URLs and email addresses.
166         // Ftp address, file system links, data, blob etc. are not checked.
167         if (nCount > 5) {
168           int32_t nStartOffset;
169           int32_t nCountOverload;
170           if (CheckWebLink(&strBeCheck, &nStartOffset, &nCountOverload)) {
171             m_LinkArray.push_back(
172                 {start + nStartOffset, nCountOverload, strBeCheck});
173           } else if (CheckMailLink(&strBeCheck)) {
174             m_LinkArray.push_back({start, nCount, strBeCheck});
175           }
176         }
177       }
178       start = ++pos;
179     } else {
180       bAfterHyphen = (pageChar.m_Flag == FPDFTEXT_CHAR_HYPHEN ||
181                       (pageChar.m_Flag == FPDFTEXT_CHAR_NORMAL &&
182                        pageChar.m_Unicode == TEXT_HYPHEN_CHAR));
183       pos++;
184     }
185   }
186 }
187 
CheckWebLink(WideString * strBeCheck,int32_t * nStart,int32_t * nCount)188 bool CPDF_LinkExtract::CheckWebLink(WideString* strBeCheck,
189                                     int32_t* nStart,
190                                     int32_t* nCount) {
191   static const wchar_t kHttpScheme[] = L"http";
192   static const size_t kHttpSchemeLen = FXSYS_len(kHttpScheme);
193   static const wchar_t kWWWAddrStart[] = L"www.";
194   static const size_t kWWWAddrStartLen = FXSYS_len(kWWWAddrStart);
195 
196   WideString str = *strBeCheck;
197   str.MakeLower();
198 
199   size_t len = str.GetLength();
200   // First, try to find the scheme.
201   auto start = str.Find(kHttpScheme);
202   if (start.has_value()) {
203     size_t off = start.value() + kHttpSchemeLen;  // move after "http".
204     if (len > off + 4) {                      // At least "://<char>" follows.
205       if (str[off] == L's')                   // "https" scheme is accepted.
206         off++;
207       if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') {
208         off += 3;
209         size_t end = TrimExternalBracketsFromWebLink(str, start.value(),
210                                                      str.GetLength() - 1);
211         end = FindWebLinkEnding(str, off, end);
212         if (end > off) {  // Non-empty host name.
213           *nStart = start.value();
214           *nCount = end - start.value() + 1;
215           *strBeCheck = strBeCheck->Mid(*nStart, *nCount);
216           return true;
217         }
218       }
219     }
220   }
221 
222   // When there is no scheme, try to find url starting with "www.".
223   start = str.Find(kWWWAddrStart);
224   if (start.has_value() && len > start.value() + kWWWAddrStartLen) {
225     size_t end = TrimExternalBracketsFromWebLink(str, start.value(),
226                                                  str.GetLength() - 1);
227     end = FindWebLinkEnding(str, start.value(), end);
228     if (end > start.value() + kWWWAddrStartLen) {
229       *nStart = start.value();
230       *nCount = end - start.value() + 1;
231       *strBeCheck = L"http://" + strBeCheck->Mid(*nStart, *nCount);
232       return true;
233     }
234   }
235   return false;
236 }
237 
CheckMailLink(WideString * str)238 bool CPDF_LinkExtract::CheckMailLink(WideString* str) {
239   auto aPos = str->Find(L'@');
240   // Invalid when no '@' or when starts/ends with '@'.
241   if (!aPos.has_value() || aPos.value() == 0 || aPos == str->GetLength() - 1)
242     return false;
243 
244   // Check the local part.
245   size_t pPos = aPos.value();  // Used to track the position of '@' or '.'.
246   for (size_t i = aPos.value(); i > 0; i--) {
247     wchar_t ch = (*str)[i - 1];
248     if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
249       continue;
250 
251     if (ch != L'.' || i == pPos || i == 1) {
252       if (i == aPos.value()) {
253         // There is '.' or invalid char before '@'.
254         return false;
255       }
256       // End extracting for other invalid chars, '.' at the beginning, or
257       // consecutive '.'.
258       size_t removed_len = i == pPos ? i + 1 : i;
259       *str = str->Right(str->GetLength() - removed_len);
260       break;
261     }
262     // Found a valid '.'.
263     pPos = i - 1;
264   }
265 
266   // Check the domain name part.
267   aPos = str->Find(L'@');
268   if (!aPos.has_value() || aPos.value() == 0)
269     return false;
270 
271   str->TrimRight(L'.');
272   // At least one '.' in domain name, but not at the beginning.
273   // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
274   // Check whether we should remove this check.
275   auto ePos = str->Find(L'.', aPos.value() + 1);
276   if (!ePos.has_value() || ePos.value() == aPos.value() + 1)
277     return false;
278 
279   // Validate all other chars in domain name.
280   size_t nLen = str->GetLength();
281   pPos = 0;  // Used to track the position of '.'.
282   for (size_t i = aPos.value() + 1; i < nLen; i++) {
283     wchar_t wch = (*str)[i];
284     if (wch == L'-' || FXSYS_iswalnum(wch))
285       continue;
286 
287     if (wch != L'.' || i == pPos + 1) {
288       // Domain name should end before invalid char.
289       size_t host_end = i == pPos + 1 ? i - 2 : i - 1;
290       if (pPos > 0 && host_end - aPos.value() >= 3) {
291         // Trim the ending invalid chars if there is at least one '.' and name.
292         *str = str->Left(host_end + 1);
293         break;
294       }
295       return false;
296     }
297     pPos = i;
298   }
299 
300   if (!str->Contains(L"mailto:"))
301     *str = L"mailto:" + *str;
302 
303   return true;
304 }
305 
GetURL(size_t index) const306 WideString CPDF_LinkExtract::GetURL(size_t index) const {
307   return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L"";
308 }
309 
GetRects(size_t index) const310 std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
311   if (index >= m_LinkArray.size())
312     return std::vector<CFX_FloatRect>();
313 
314   return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,
315                                    m_LinkArray[index].m_Count);
316 }
317