1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/include/fpdfapi/fpdf_page.h"
8 #include "core/include/fpdfapi/fpdf_pageobj.h"
9 #include "text_int.h"
10 
11 class CPDF_TextStream {
12  public:
13   CPDF_TextStream(CFX_WideTextBuf& buffer,
14                   FX_BOOL bUseLF,
15                   CFX_PtrArray* pObjArray);
~CPDF_TextStream()16   ~CPDF_TextStream() {}
17   FX_BOOL ProcessObject(const CPDF_TextObject* pObj, FX_BOOL bFirstLine);
18   CFX_WideTextBuf& m_Buffer;
19   FX_BOOL m_bUseLF;
20   CFX_PtrArray* m_pObjArray;
21   const CPDF_TextObject* m_pLastObj;
22 };
CPDF_TextStream(CFX_WideTextBuf & buffer,FX_BOOL bUseLF,CFX_PtrArray * pObjArray)23 CPDF_TextStream::CPDF_TextStream(CFX_WideTextBuf& buffer,
24                                  FX_BOOL bUseLF,
25                                  CFX_PtrArray* pObjArray)
26     : m_Buffer(buffer) {
27   m_pLastObj = NULL;
28   m_bUseLF = bUseLF;
29   m_pObjArray = pObjArray;
30 }
FPDFText_IsSameTextObject(const CPDF_TextObject * pTextObj1,const CPDF_TextObject * pTextObj2)31 FX_BOOL FPDFText_IsSameTextObject(const CPDF_TextObject* pTextObj1,
32                                   const CPDF_TextObject* pTextObj2) {
33   if (!pTextObj1 || !pTextObj2) {
34     return FALSE;
35   }
36   CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom,
37                          pTextObj2->m_Right, pTextObj2->m_Top);
38   CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom,
39                          pTextObj1->m_Right, pTextObj1->m_Top);
40   if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty()) {
41     return TRUE;
42   }
43   if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) {
44     rcPreObj.Intersect(rcCurObj);
45     if (rcPreObj.IsEmpty()) {
46       return FALSE;
47     }
48     if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) >
49         rcCurObj.Width() / 2) {
50       return FALSE;
51     }
52     if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) {
53       return FALSE;
54     }
55   }
56   int nPreCount = pTextObj2->CountItems();
57   int nCurCount = pTextObj1->CountItems();
58   if (nPreCount != nCurCount) {
59     return FALSE;
60   }
61   for (int i = 0; i < nPreCount; i++) {
62     CPDF_TextObjectItem itemPer, itemCur;
63     pTextObj2->GetItemInfo(i, &itemPer);
64     pTextObj1->GetItemInfo(i, &itemCur);
65     if (itemCur.m_CharCode != itemPer.m_CharCode) {
66       return FALSE;
67     }
68   }
69   return TRUE;
70 }
GetCharWidth(FX_DWORD charCode,CPDF_Font * pFont)71 int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) {
72   if (charCode == -1) {
73     return 0;
74   }
75   int w = pFont->GetCharWidthF(charCode);
76   if (w == 0) {
77     CFX_ByteString str;
78     pFont->AppendChar(str, charCode);
79     w = pFont->GetStringWidth(str, 1);
80     if (w == 0) {
81       FX_RECT BBox;
82       pFont->GetCharBBox(charCode, BBox);
83       w = BBox.right - BBox.left;
84     }
85   }
86   return w;
87 }
FPDFText_ProcessInterObj(const CPDF_TextObject * pPrevObj,const CPDF_TextObject * pObj)88 int FPDFText_ProcessInterObj(const CPDF_TextObject* pPrevObj,
89                              const CPDF_TextObject* pObj) {
90   if (FPDFText_IsSameTextObject(pPrevObj, pObj)) {
91     return -1;
92   }
93   CPDF_TextObjectItem item;
94   int nItem = pPrevObj->CountItems();
95   pPrevObj->GetItemInfo(nItem - 1, &item);
96   FX_WCHAR preChar = 0, curChar = 0;
97   CFX_WideString wstr =
98       pPrevObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
99   if (wstr.GetLength()) {
100     preChar = wstr.GetAt(0);
101   }
102   FX_FLOAT last_pos = item.m_OriginX;
103   int nLastWidth = GetCharWidth(item.m_CharCode, pPrevObj->GetFont());
104   FX_FLOAT last_width = nLastWidth * pPrevObj->GetFontSize() / 1000;
105   last_width = FXSYS_fabs(last_width);
106   pObj->GetItemInfo(0, &item);
107   wstr = pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
108   if (wstr.GetLength()) {
109     curChar = wstr.GetAt(0);
110   }
111   int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont());
112   FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000;
113   this_width = FXSYS_fabs(this_width);
114   FX_FLOAT threshold =
115       last_width > this_width ? last_width / 4 : this_width / 4;
116   CFX_Matrix prev_matrix, prev_reverse;
117   pPrevObj->GetTextMatrix(&prev_matrix);
118   prev_reverse.SetReverse(prev_matrix);
119   FX_FLOAT x = pObj->GetPosX(), y = pObj->GetPosY();
120   prev_reverse.Transform(x, y);
121   if (FXSYS_fabs(y) > threshold * 2) {
122     return 2;
123   }
124   threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth);
125   threshold = threshold > 400
126                   ? (threshold < 700 ? threshold / 4 : threshold / 5)
127                   : (threshold / 2);
128   threshold *= nLastWidth > nThisWidth ? FXSYS_fabs(pPrevObj->GetFontSize())
129                                        : FXSYS_fabs(pObj->GetFontSize());
130   threshold /= 1000;
131   if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' &&
132       preChar != L' ')
133     if (curChar != L' ' && preChar != L' ') {
134       if ((x - last_pos - last_width) > threshold ||
135           (last_pos - x - last_width) > threshold) {
136         return 1;
137       }
138       if (x < 0 && (last_pos - x - last_width) > threshold) {
139         return 1;
140       }
141       if ((x - last_pos - last_width) > this_width ||
142           (x - last_pos - this_width) > last_width) {
143         return 1;
144       }
145     }
146   if (last_pos + last_width > x + this_width && curChar == L' ') {
147     return 3;
148   }
149   return 0;
150 }
ProcessObject(const CPDF_TextObject * pObj,FX_BOOL bFirstLine)151 FX_BOOL CPDF_TextStream::ProcessObject(const CPDF_TextObject* pObj,
152                                        FX_BOOL bFirstLine) {
153   CPDF_Font* pFont = pObj->GetFont();
154   CFX_Matrix matrix;
155   pObj->GetTextMatrix(&matrix);
156   int item_index = 0;
157   if (m_pLastObj) {
158     int result = FPDFText_ProcessInterObj(m_pLastObj, pObj);
159     if (result == 2) {
160       int len = m_Buffer.GetLength();
161       if (len && m_bUseLF && m_Buffer.GetBuffer()[len - 1] == L'-') {
162         m_Buffer.Delete(len - 1, 1);
163         if (m_pObjArray) {
164           m_pObjArray->RemoveAt((len - 1) * 2, 2);
165         }
166       } else {
167         if (bFirstLine) {
168           return TRUE;
169         }
170         if (m_bUseLF) {
171           m_Buffer.AppendChar(L'\r');
172           m_Buffer.AppendChar(L'\n');
173           if (m_pObjArray) {
174             for (int i = 0; i < 4; i++) {
175               m_pObjArray->Add(NULL);
176             }
177           }
178         } else {
179           m_Buffer.AppendChar(' ');
180           if (m_pObjArray) {
181             m_pObjArray->Add(NULL);
182             m_pObjArray->Add(NULL);
183           }
184         }
185       }
186     } else if (result == 1) {
187       m_Buffer.AppendChar(L' ');
188       if (m_pObjArray) {
189         m_pObjArray->Add(NULL);
190         m_pObjArray->Add(NULL);
191       }
192     } else if (result == -1) {
193       m_pLastObj = pObj;
194       return FALSE;
195     } else if (result == 3) {
196       item_index = 1;
197     }
198   }
199   m_pLastObj = pObj;
200   int nItems = pObj->CountItems();
201   FX_FLOAT Ignorekerning = 0;
202   for (int i = 1; i < nItems - 1; i += 2) {
203     CPDF_TextObjectItem item;
204     pObj->GetItemInfo(i, &item);
205     if (item.m_CharCode == (FX_DWORD)-1) {
206       if (i == 1) {
207         Ignorekerning = item.m_OriginX;
208       } else if (Ignorekerning > item.m_OriginX) {
209         Ignorekerning = item.m_OriginX;
210       }
211     } else {
212       Ignorekerning = 0;
213       break;
214     }
215   }
216   FX_FLOAT spacing = 0;
217   for (; item_index < nItems; item_index++) {
218     CPDF_TextObjectItem item;
219     pObj->GetItemInfo(item_index, &item);
220     if (item.m_CharCode == (FX_DWORD)-1) {
221       CFX_WideString wstr = m_Buffer.GetWideString();
222       if (wstr.IsEmpty() || wstr.GetAt(wstr.GetLength() - 1) == L' ') {
223         continue;
224       }
225       FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
226       spacing = -fontsize_h * (item.m_OriginX - Ignorekerning) / 1000;
227       continue;
228     }
229     FX_FLOAT charSpace = pObj->m_TextState.GetObject()->m_CharSpace;
230     if (nItems > 3 && !spacing) {
231       charSpace = 0;
232     }
233     if ((spacing || charSpace) && item_index > 0) {
234       int last_width = 0;
235       FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
236       FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
237       FX_FLOAT threshold = 0;
238       if (space_charcode != -1) {
239         threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
240       }
241       if (threshold > fontsize_h / 3) {
242         threshold = 0;
243       } else {
244         threshold /= 2;
245       }
246       if (threshold == 0) {
247         threshold = fontsize_h;
248         int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
249         threshold = this_width > last_width ? (FX_FLOAT)this_width
250                                             : (FX_FLOAT)last_width;
251         int nDivide = 6;
252         if (threshold < 300) {
253           nDivide = 2;
254         } else if (threshold < 500) {
255           nDivide = 4;
256         } else if (threshold < 700) {
257           nDivide = 5;
258         }
259         threshold = threshold / nDivide;
260         threshold = fontsize_h * threshold / 1000;
261       }
262       if (charSpace > 0.001) {
263         spacing += matrix.TransformDistance(charSpace);
264       } else if (charSpace < -0.001) {
265         spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
266       }
267       if (threshold && (spacing && spacing >= threshold)) {
268         m_Buffer.AppendChar(L' ');
269         if (m_pObjArray) {
270           m_pObjArray->Add(NULL);
271           m_pObjArray->Add(NULL);
272         }
273       }
274       if (item.m_CharCode == (FX_DWORD)-1) {
275         continue;
276       }
277       spacing = 0;
278     }
279     CFX_WideString unicode_str = pFont->UnicodeFromCharCode(item.m_CharCode);
280     if (unicode_str.IsEmpty()) {
281       m_Buffer.AppendChar((FX_WCHAR)item.m_CharCode);
282       if (m_pObjArray) {
283         m_pObjArray->Add((void*)pObj);
284         m_pObjArray->Add((void*)(intptr_t)item_index);
285       }
286     } else {
287       m_Buffer << unicode_str;
288       if (m_pObjArray) {
289         for (int i = 0; i < unicode_str.GetLength(); i++) {
290           m_pObjArray->Add((void*)pObj);
291           m_pObjArray->Add((void*)(intptr_t)item_index);
292         }
293       }
294     }
295   }
296   return FALSE;
297 }
GetTextStream_Unicode(CFX_WideTextBuf & buffer,CPDF_PageObjects * pPage,FX_BOOL bUseLF,CFX_PtrArray * pObjArray)298 void GetTextStream_Unicode(CFX_WideTextBuf& buffer,
299                            CPDF_PageObjects* pPage,
300                            FX_BOOL bUseLF,
301                            CFX_PtrArray* pObjArray) {
302   CPDF_TextStream textstream(buffer, bUseLF, pObjArray);
303   FX_POSITION pos = pPage->GetFirstObjectPosition();
304   while (pos) {
305     CPDF_PageObject* pObject = pPage->GetNextObject(pos);
306     if (pObject && pObject->m_Type == PDFPAGE_TEXT)
307       textstream.ProcessObject((CPDF_TextObject*)pObject, FALSE);
308   }
309 }
PDF_GetFirstTextLine_Unicode(CPDF_Document * pDoc,CPDF_Dictionary * pPage)310 CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc,
311                                             CPDF_Dictionary* pPage) {
312   CFX_WideTextBuf buffer;
313   buffer.EstimateSize(0, 1024);
314   CPDF_Page page;
315   page.Load(pDoc, pPage);
316   CPDF_ParseOptions options;
317   options.m_bTextOnly = TRUE;
318   options.m_bSeparateForm = FALSE;
319   page.ParseContent(&options);
320   CPDF_TextStream textstream(buffer, FALSE, NULL);
321   FX_POSITION pos = page.GetFirstObjectPosition();
322   while (pos) {
323     CPDF_PageObject* pObject = page.GetNextObject(pos);
324     if (pObject->m_Type != PDFPAGE_TEXT) {
325       continue;
326     }
327     if (textstream.ProcessObject((CPDF_TextObject*)pObject, TRUE)) {
328       break;
329     }
330   }
331   return buffer.GetWideString();
332 }
333