1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdftext/cpdf_linkextract.h"
8 
9 #include <vector>
10 
11 #include "core/fpdftext/cpdf_textpage.h"
12 #include "core/fxcrt/fx_ext.h"
13 #include "core/fxcrt/fx_string.h"
14 #include "core/fxcrt/fx_system.h"
15 
CPDF_LinkExtract(const CPDF_TextPage * pTextPage)16 CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
17     : m_pTextPage(pTextPage) {}
18 
~CPDF_LinkExtract()19 CPDF_LinkExtract::~CPDF_LinkExtract() {}
20 
ExtractLinks()21 void CPDF_LinkExtract::ExtractLinks() {
22   m_LinkArray.clear();
23   if (!m_pTextPage->IsParsed())
24     return;
25 
26   m_strPageText = m_pTextPage->GetPageText(0, -1);
27   if (m_strPageText.IsEmpty())
28     return;
29 
30   ParseLink();
31 }
32 
ParseLink()33 void CPDF_LinkExtract::ParseLink() {
34   int start = 0, pos = 0;
35   int TotalChar = m_pTextPage->CountChars();
36   while (pos < TotalChar) {
37     FPDF_CHAR_INFO pageChar;
38     m_pTextPage->GetCharInfo(pos, &pageChar);
39     if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED ||
40         pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) {
41       int nCount = pos - start;
42       if (pos == TotalChar - 1)
43         nCount++;
44       CFX_WideString strBeCheck;
45       strBeCheck = m_pTextPage->GetPageText(start, nCount);
46       if (strBeCheck.GetLength() > 5) {
47         while (strBeCheck.GetLength() > 0) {
48           FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);
49           if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {
50             strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1);
51             nCount--;
52           } else {
53             break;
54           }
55         }
56         if (nCount > 5 &&
57             (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) {
58           m_LinkArray.push_back({start, nCount, strBeCheck});
59         }
60       }
61       start = ++pos;
62     } else {
63       pos++;
64     }
65   }
66 }
67 
CheckWebLink(CFX_WideString & strBeCheck)68 bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
69   CFX_WideString str = strBeCheck;
70   str.MakeLower();
71   if (str.Find(L"http://www.") != -1) {
72     strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www."));
73     return true;
74   }
75   if (str.Find(L"http://") != -1) {
76     strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://"));
77     return true;
78   }
79   if (str.Find(L"https://www.") != -1) {
80     strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www."));
81     return true;
82   }
83   if (str.Find(L"https://") != -1) {
84     strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));
85     return true;
86   }
87   if (str.Find(L"www.") != -1) {
88     strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));
89     strBeCheck = L"http://" + strBeCheck;
90     return true;
91   }
92   return false;
93 }
94 
CheckMailLink(CFX_WideString & str)95 bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
96   int aPos = str.Find(L'@');
97   // Invalid when no '@'.
98   if (aPos < 1)
99     return false;
100 
101   // Check the local part.
102   int pPos = aPos;  // Used to track the position of '@' or '.'.
103   for (int i = aPos - 1; i >= 0; i--) {
104     FX_WCHAR ch = str.GetAt(i);
105     if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
106       continue;
107 
108     if (ch != L'.' || i == pPos - 1 || i == 0) {
109       if (i == aPos - 1) {
110         // There is '.' or invalid char before '@'.
111         return false;
112       }
113       // End extracting for other invalid chars, '.' at the beginning, or
114       // consecutive '.'.
115       int removed_len = i == pPos - 1 ? i + 2 : i + 1;
116       str = str.Right(str.GetLength() - removed_len);
117       break;
118     }
119     // Found a valid '.'.
120     pPos = i;
121   }
122 
123   // Check the domain name part.
124   aPos = str.Find(L'@');
125   if (aPos < 1)
126     return false;
127 
128   str.TrimRight(L'.');
129   // At least one '.' in domain name, but not at the beginning.
130   // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
131   // Check whether we should remove this check.
132   int ePos = str.Find(L'.', aPos + 1);
133   if (ePos == -1 || ePos == aPos + 1)
134     return false;
135 
136   // Validate all other chars in domain name.
137   int nLen = str.GetLength();
138   pPos = 0;  // Used to track the position of '.'.
139   for (int i = aPos + 1; i < nLen; i++) {
140     FX_WCHAR wch = str.GetAt(i);
141     if (wch == L'-' || FXSYS_iswalnum(wch))
142       continue;
143 
144     if (wch != L'.' || i == pPos + 1) {
145       // Domain name should end before invalid char.
146       int host_end = i == pPos + 1 ? i - 2 : i - 1;
147       if (pPos > 0 && host_end - aPos >= 3) {
148         // Trim the ending invalid chars if there is at least one '.' and name.
149         str = str.Left(host_end + 1);
150         break;
151       }
152       return false;
153     }
154     pPos = i;
155   }
156 
157   if (str.Find(L"mailto:") == -1)
158     str = L"mailto:" + str;
159 
160   return true;
161 }
162 
GetURL(size_t index) const163 CFX_WideString CPDF_LinkExtract::GetURL(size_t index) const {
164   return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L"";
165 }
166 
GetRects(size_t index) const167 std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
168   if (index >= m_LinkArray.size())
169     return std::vector<CFX_FloatRect>();
170 
171   return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,
172                                    m_LinkArray[index].m_Count);
173 }
174