1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdfapi/page/cpdf_textobject.h"
8 
9 #include <algorithm>
10 #include <utility>
11 
12 #include "core/fpdfapi/font/cpdf_cidfont.h"
13 #include "core/fpdfapi/font/cpdf_font.h"
14 #include "third_party/base/ptr_util.h"
15 
// Evaluates to true when |u| is not U+0020 (space) and is at most 0x28FF.
// The argument is parenthesized so that expression arguments (for example
// `a | b`) are compared as a whole instead of being split by operator
// precedence. Note that |u| is evaluated twice.
#define ISLATINWORD(u) ((u) != 0x20 && (u) <= 0x28FF)
17 
CPDF_TextObjectItem()18 CPDF_TextObjectItem::CPDF_TextObjectItem() : m_CharCode(0) {}
19 
20 CPDF_TextObjectItem::~CPDF_TextObjectItem() = default;
21 
CPDF_TextObject(int32_t content_stream)22 CPDF_TextObject::CPDF_TextObject(int32_t content_stream)
23     : CPDF_PageObject(content_stream) {}
24 
CPDF_TextObject()25 CPDF_TextObject::CPDF_TextObject() : CPDF_TextObject(kNoContentStream) {}
26 
~CPDF_TextObject()27 CPDF_TextObject::~CPDF_TextObject() {
28   // Move m_CharCodes to a local variable so it will be captured in crash dumps,
29   // to help with investigating crbug.com/782215.
30   auto char_codes_copy = std::move(m_CharCodes);
31 }
32 
CountItems() const33 size_t CPDF_TextObject::CountItems() const {
34   return m_CharCodes.size();
35 }
36 
GetItemInfo(size_t index,CPDF_TextObjectItem * pInfo) const37 void CPDF_TextObject::GetItemInfo(size_t index,
38                                   CPDF_TextObjectItem* pInfo) const {
39   ASSERT(index < m_CharCodes.size());
40   pInfo->m_CharCode = m_CharCodes[index];
41   pInfo->m_Origin = CFX_PointF(index > 0 ? m_CharPos[index - 1] : 0, 0);
42   if (pInfo->m_CharCode == CPDF_Font::kInvalidCharCode)
43     return;
44 
45   RetainPtr<CPDF_Font> pFont = GetFont();
46   if (!pFont->IsCIDFont() || !pFont->AsCIDFont()->IsVertWriting())
47     return;
48 
49   uint16_t CID = pFont->AsCIDFont()->CIDFromCharCode(pInfo->m_CharCode);
50   pInfo->m_Origin = CFX_PointF(0, pInfo->m_Origin.x);
51 
52   short vx;
53   short vy;
54   pFont->AsCIDFont()->GetVertOrigin(CID, vx, vy);
55 
56   float fontsize = GetFontSize();
57   pInfo->m_Origin.x -= fontsize * vx / 1000;
58   pInfo->m_Origin.y -= fontsize * vy / 1000;
59 }
60 
CountChars() const61 size_t CPDF_TextObject::CountChars() const {
62   size_t count = 0;
63   for (uint32_t charcode : m_CharCodes) {
64     if (charcode != CPDF_Font::kInvalidCharCode)
65       ++count;
66   }
67   return count;
68 }
69 
GetCharInfo(size_t index,uint32_t * charcode,float * kerning) const70 void CPDF_TextObject::GetCharInfo(size_t index,
71                                   uint32_t* charcode,
72                                   float* kerning) const {
73   size_t count = 0;
74   for (size_t i = 0; i < m_CharCodes.size(); ++i) {
75     if (m_CharCodes[i] == CPDF_Font::kInvalidCharCode)
76       continue;
77     if (count++ != index)
78       continue;
79     *charcode = m_CharCodes[i];
80     if (i == m_CharCodes.size() - 1 ||
81         m_CharCodes[i + 1] != CPDF_Font::kInvalidCharCode) {
82       *kerning = 0;
83     } else {
84       *kerning = m_CharPos[i];
85     }
86     return;
87   }
88 }
89 
GetCharInfo(size_t index,CPDF_TextObjectItem * pInfo) const90 void CPDF_TextObject::GetCharInfo(size_t index,
91                                   CPDF_TextObjectItem* pInfo) const {
92   size_t count = 0;
93   for (size_t i = 0; i < m_CharCodes.size(); ++i) {
94     uint32_t charcode = m_CharCodes[i];
95     if (charcode == CPDF_Font::kInvalidCharCode)
96       continue;
97     if (count++ != index)
98       continue;
99     GetItemInfo(i, pInfo);
100     break;
101   }
102 }
103 
CountWords() const104 int CPDF_TextObject::CountWords() const {
105   RetainPtr<CPDF_Font> pFont = GetFont();
106   bool bInLatinWord = false;
107   int nWords = 0;
108   for (size_t i = 0, sz = CountChars(); i < sz; ++i) {
109     uint32_t charcode = CPDF_Font::kInvalidCharCode;
110     float unused_kerning;
111     GetCharInfo(i, &charcode, &unused_kerning);
112 
113     WideString swUnicode = pFont->UnicodeFromCharCode(charcode);
114     uint16_t unicode = 0;
115     if (swUnicode.GetLength() > 0)
116       unicode = swUnicode[0];
117 
118     bool bIsLatin = ISLATINWORD(unicode);
119     if (bIsLatin && bInLatinWord)
120       continue;
121 
122     bInLatinWord = bIsLatin;
123     if (unicode != 0x20)
124       nWords++;
125   }
126 
127   return nWords;
128 }
129 
GetWordString(int nWordIndex) const130 WideString CPDF_TextObject::GetWordString(int nWordIndex) const {
131   RetainPtr<CPDF_Font> pFont = GetFont();
132   WideString swRet;
133   int nWords = 0;
134   bool bInLatinWord = false;
135   for (size_t i = 0, sz = CountChars(); i < sz; ++i) {
136     uint32_t charcode = CPDF_Font::kInvalidCharCode;
137     float unused_kerning;
138     GetCharInfo(i, &charcode, &unused_kerning);
139 
140     WideString swUnicode = pFont->UnicodeFromCharCode(charcode);
141     uint16_t unicode = 0;
142     if (swUnicode.GetLength() > 0)
143       unicode = swUnicode[0];
144 
145     bool bIsLatin = ISLATINWORD(unicode);
146     if (!bIsLatin || !bInLatinWord) {
147       bInLatinWord = bIsLatin;
148       if (unicode != 0x20)
149         nWords++;
150     }
151     if (nWords - 1 == nWordIndex)
152       swRet += unicode;
153   }
154   return swRet;
155 }
156 
Clone() const157 std::unique_ptr<CPDF_TextObject> CPDF_TextObject::Clone() const {
158   auto obj = pdfium::MakeUnique<CPDF_TextObject>();
159   obj->CopyData(this);
160   obj->m_CharCodes = m_CharCodes;
161   obj->m_CharPos = m_CharPos;
162   obj->m_Pos = m_Pos;
163   return obj;
164 }
165 
GetType() const166 CPDF_PageObject::Type CPDF_TextObject::GetType() const {
167   return TEXT;
168 }
169 
Transform(const CFX_Matrix & matrix)170 void CPDF_TextObject::Transform(const CFX_Matrix& matrix) {
171   CFX_Matrix text_matrix = GetTextMatrix() * matrix;
172 
173   float* pTextMatrix = m_TextState.GetMutableMatrix();
174   pTextMatrix[0] = text_matrix.a;
175   pTextMatrix[1] = text_matrix.c;
176   pTextMatrix[2] = text_matrix.b;
177   pTextMatrix[3] = text_matrix.d;
178   m_Pos = CFX_PointF(text_matrix.e, text_matrix.f);
179   CalcPositionData(0);
180   SetDirty(true);
181 }
182 
IsText() const183 bool CPDF_TextObject::IsText() const {
184   return true;
185 }
186 
AsText()187 CPDF_TextObject* CPDF_TextObject::AsText() {
188   return this;
189 }
190 
AsText() const191 const CPDF_TextObject* CPDF_TextObject::AsText() const {
192   return this;
193 }
194 
GetTextMatrix() const195 CFX_Matrix CPDF_TextObject::GetTextMatrix() const {
196   const float* pTextMatrix = m_TextState.GetMatrix();
197   return CFX_Matrix(pTextMatrix[0], pTextMatrix[2], pTextMatrix[1],
198                     pTextMatrix[3], m_Pos.x, m_Pos.y);
199 }
200 
SetSegments(const ByteString * pStrs,const std::vector<float> & kernings,size_t nSegs)201 void CPDF_TextObject::SetSegments(const ByteString* pStrs,
202                                   const std::vector<float>& kernings,
203                                   size_t nSegs) {
204   m_CharCodes.clear();
205   m_CharPos.clear();
206   RetainPtr<CPDF_Font> pFont = GetFont();
207   int nChars = 0;
208   for (size_t i = 0; i < nSegs; ++i)
209     nChars += pFont->CountChar(pStrs[i].AsStringView());
210   nChars += nSegs - 1;
211   m_CharCodes.resize(nChars);
212   m_CharPos.resize(nChars - 1);
213   size_t index = 0;
214   for (size_t i = 0; i < nSegs; ++i) {
215     ByteStringView segment = pStrs[i].AsStringView();
216     size_t offset = 0;
217     while (offset < segment.GetLength()) {
218       ASSERT(index < m_CharCodes.size());
219       m_CharCodes[index++] = pFont->GetNextChar(segment, &offset);
220     }
221     if (i != nSegs - 1) {
222       m_CharPos[index - 1] = kernings[i];
223       m_CharCodes[index++] = CPDF_Font::kInvalidCharCode;
224     }
225   }
226 }
227 
SetText(const ByteString & str)228 void CPDF_TextObject::SetText(const ByteString& str) {
229   SetSegments(&str, std::vector<float>(), 1);
230   RecalcPositionData();
231   SetDirty(true);
232 }
233 
GetCharWidth(uint32_t charcode) const234 float CPDF_TextObject::GetCharWidth(uint32_t charcode) const {
235   float fontsize = GetFontSize() / 1000;
236   RetainPtr<CPDF_Font> pFont = GetFont();
237   bool bVertWriting = false;
238   CPDF_CIDFont* pCIDFont = pFont->AsCIDFont();
239   if (pCIDFont)
240     bVertWriting = pCIDFont->IsVertWriting();
241   if (!bVertWriting)
242     return pFont->GetCharWidthF(charcode) * fontsize;
243 
244   uint16_t CID = pCIDFont->CIDFromCharCode(charcode);
245   return pCIDFont->GetVertWidth(CID) * fontsize;
246 }
247 
GetFont() const248 RetainPtr<CPDF_Font> CPDF_TextObject::GetFont() const {
249   return m_TextState.GetFont();
250 }
251 
GetFontSize() const252 float CPDF_TextObject::GetFontSize() const {
253   return m_TextState.GetFontSize();
254 }
255 
GetTextRenderMode() const256 TextRenderingMode CPDF_TextObject::GetTextRenderMode() const {
257   return m_TextState.GetTextMode();
258 }
259 
CalcPositionData(float horz_scale)260 CFX_PointF CPDF_TextObject::CalcPositionData(float horz_scale) {
261   float curpos = 0;
262   float min_x = 10000 * 1.0f;
263   float max_x = -10000 * 1.0f;
264   float min_y = 10000 * 1.0f;
265   float max_y = -10000 * 1.0f;
266   RetainPtr<CPDF_Font> pFont = GetFont();
267   bool bVertWriting = false;
268   CPDF_CIDFont* pCIDFont = pFont->AsCIDFont();
269   if (pCIDFont)
270     bVertWriting = pCIDFont->IsVertWriting();
271 
272   float fontsize = GetFontSize();
273   for (size_t i = 0; i < m_CharCodes.size(); ++i) {
274     uint32_t charcode = m_CharCodes[i];
275     if (i > 0) {
276       if (charcode == CPDF_Font::kInvalidCharCode) {
277         curpos -= (m_CharPos[i - 1] * fontsize) / 1000;
278         continue;
279       }
280       m_CharPos[i - 1] = curpos;
281     }
282 
283     FX_RECT char_rect = pFont->GetCharBBox(charcode);
284     float charwidth;
285     if (!bVertWriting) {
286       min_y = std::min(
287           min_y, static_cast<float>(std::min(char_rect.top, char_rect.bottom)));
288       max_y = std::max(
289           max_y, static_cast<float>(std::max(char_rect.top, char_rect.bottom)));
290       float char_left = curpos + char_rect.left * fontsize / 1000;
291       float char_right = curpos + char_rect.right * fontsize / 1000;
292       min_x = std::min(min_x, std::min(char_left, char_right));
293       max_x = std::max(max_x, std::max(char_left, char_right));
294       charwidth = pFont->GetCharWidthF(charcode) * fontsize / 1000;
295     } else {
296       uint16_t CID = pCIDFont->CIDFromCharCode(charcode);
297       short vx;
298       short vy;
299       pCIDFont->GetVertOrigin(CID, vx, vy);
300       char_rect.left -= vx;
301       char_rect.right -= vx;
302       char_rect.top -= vy;
303       char_rect.bottom -= vy;
304       min_x = std::min(
305           min_x, static_cast<float>(std::min(char_rect.left, char_rect.right)));
306       max_x = std::max(
307           max_x, static_cast<float>(std::max(char_rect.left, char_rect.right)));
308       float char_top = curpos + char_rect.top * fontsize / 1000;
309       float char_bottom = curpos + char_rect.bottom * fontsize / 1000;
310       min_y = std::min(min_y, std::min(char_top, char_bottom));
311       max_y = std::max(max_y, std::max(char_top, char_bottom));
312       charwidth = pCIDFont->GetVertWidth(CID) * fontsize / 1000;
313     }
314     curpos += charwidth;
315     if (charcode == ' ' && (!pCIDFont || pCIDFont->GetCharSize(' ') == 1))
316       curpos += m_TextState.GetWordSpace();
317 
318     curpos += m_TextState.GetCharSpace();
319   }
320 
321   CFX_PointF ret;
322   if (bVertWriting) {
323     ret.y = curpos;
324     min_x = min_x * fontsize / 1000;
325     max_x = max_x * fontsize / 1000;
326   } else {
327     ret.x = curpos * horz_scale;
328     min_y = min_y * fontsize / 1000;
329     max_y = max_y * fontsize / 1000;
330   }
331   SetRect(
332       GetTextMatrix().TransformRect(CFX_FloatRect(min_x, min_y, max_x, max_y)));
333 
334   if (!TextRenderingModeIsStrokeMode(m_TextState.GetTextMode()))
335     return ret;
336 
337   float half_width = m_GraphState.GetLineWidth() / 2;
338   m_Rect.left -= half_width;
339   m_Rect.right += half_width;
340   m_Rect.top += half_width;
341   m_Rect.bottom -= half_width;
342 
343   return ret;
344 }
345 
RecalcPositionData()346 void CPDF_TextObject::RecalcPositionData() {
347   CalcPositionData(1);
348 }
349