1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdftext/cpdf_linkextract.h"
8
9 #include <vector>
10
11 #include "core/fpdftext/cpdf_textpage.h"
12 #include "core/fxcrt/fx_ext.h"
13 #include "core/fxcrt/fx_string.h"
14 #include "core/fxcrt/fx_system.h"
15
CPDF_LinkExtract(const CPDF_TextPage * pTextPage)16 CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
17 : m_pTextPage(pTextPage) {}
18
~CPDF_LinkExtract()19 CPDF_LinkExtract::~CPDF_LinkExtract() {}
20
ExtractLinks()21 void CPDF_LinkExtract::ExtractLinks() {
22 m_LinkArray.clear();
23 if (!m_pTextPage->IsParsed())
24 return;
25
26 m_strPageText = m_pTextPage->GetPageText(0, -1);
27 if (m_strPageText.IsEmpty())
28 return;
29
30 ParseLink();
31 }
32
ParseLink()33 void CPDF_LinkExtract::ParseLink() {
34 int start = 0, pos = 0;
35 int TotalChar = m_pTextPage->CountChars();
36 while (pos < TotalChar) {
37 FPDF_CHAR_INFO pageChar;
38 m_pTextPage->GetCharInfo(pos, &pageChar);
39 if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED ||
40 pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) {
41 int nCount = pos - start;
42 if (pos == TotalChar - 1)
43 nCount++;
44 CFX_WideString strBeCheck;
45 strBeCheck = m_pTextPage->GetPageText(start, nCount);
46 if (strBeCheck.GetLength() > 5) {
47 while (strBeCheck.GetLength() > 0) {
48 FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);
49 if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {
50 strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1);
51 nCount--;
52 } else {
53 break;
54 }
55 }
56 if (nCount > 5 &&
57 (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) {
58 m_LinkArray.push_back({start, nCount, strBeCheck});
59 }
60 }
61 start = ++pos;
62 } else {
63 pos++;
64 }
65 }
66 }
67
CheckWebLink(CFX_WideString & strBeCheck)68 bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
69 CFX_WideString str = strBeCheck;
70 str.MakeLower();
71 if (str.Find(L"http://www.") != -1) {
72 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www."));
73 return true;
74 }
75 if (str.Find(L"http://") != -1) {
76 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://"));
77 return true;
78 }
79 if (str.Find(L"https://www.") != -1) {
80 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www."));
81 return true;
82 }
83 if (str.Find(L"https://") != -1) {
84 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));
85 return true;
86 }
87 if (str.Find(L"www.") != -1) {
88 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));
89 strBeCheck = L"http://" + strBeCheck;
90 return true;
91 }
92 return false;
93 }
94
CheckMailLink(CFX_WideString & str)95 bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
96 int aPos = str.Find(L'@');
97 // Invalid when no '@'.
98 if (aPos < 1)
99 return false;
100
101 // Check the local part.
102 int pPos = aPos; // Used to track the position of '@' or '.'.
103 for (int i = aPos - 1; i >= 0; i--) {
104 FX_WCHAR ch = str.GetAt(i);
105 if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
106 continue;
107
108 if (ch != L'.' || i == pPos - 1 || i == 0) {
109 if (i == aPos - 1) {
110 // There is '.' or invalid char before '@'.
111 return false;
112 }
113 // End extracting for other invalid chars, '.' at the beginning, or
114 // consecutive '.'.
115 int removed_len = i == pPos - 1 ? i + 2 : i + 1;
116 str = str.Right(str.GetLength() - removed_len);
117 break;
118 }
119 // Found a valid '.'.
120 pPos = i;
121 }
122
123 // Check the domain name part.
124 aPos = str.Find(L'@');
125 if (aPos < 1)
126 return false;
127
128 str.TrimRight(L'.');
129 // At least one '.' in domain name, but not at the beginning.
130 // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
131 // Check whether we should remove this check.
132 int ePos = str.Find(L'.', aPos + 1);
133 if (ePos == -1 || ePos == aPos + 1)
134 return false;
135
136 // Validate all other chars in domain name.
137 int nLen = str.GetLength();
138 pPos = 0; // Used to track the position of '.'.
139 for (int i = aPos + 1; i < nLen; i++) {
140 FX_WCHAR wch = str.GetAt(i);
141 if (wch == L'-' || FXSYS_iswalnum(wch))
142 continue;
143
144 if (wch != L'.' || i == pPos + 1) {
145 // Domain name should end before invalid char.
146 int host_end = i == pPos + 1 ? i - 2 : i - 1;
147 if (pPos > 0 && host_end - aPos >= 3) {
148 // Trim the ending invalid chars if there is at least one '.' and name.
149 str = str.Left(host_end + 1);
150 break;
151 }
152 return false;
153 }
154 pPos = i;
155 }
156
157 if (str.Find(L"mailto:") == -1)
158 str = L"mailto:" + str;
159
160 return true;
161 }
162
GetURL(size_t index) const163 CFX_WideString CPDF_LinkExtract::GetURL(size_t index) const {
164 return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L"";
165 }
166
GetRects(size_t index) const167 std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
168 if (index >= m_LinkArray.size())
169 return std::vector<CFX_FloatRect>();
170
171 return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,
172 m_LinkArray[index].m_Count);
173 }
174