1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include <cctype>
8 #include <cwctype>
9 #include <memory>
10
11 #include "core/include/fpdfapi/fpdf_page.h"
12 #include "core/include/fpdfapi/fpdf_pageobj.h"
13 #include "core/include/fpdfapi/fpdf_resource.h"
14 #include "core/include/fpdftext/fpdf_text.h"
15 #include "core/include/fxcrt/fx_bidi.h"
16 #include "core/include/fxcrt/fx_ucd.h"
17 #include "text_int.h"
18 #include "txtproc.h"
19
CharFromUnicodeAlt(FX_WCHAR unicode,int destcp,const FX_CHAR * defchar)20 CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode,
21 int destcp,
22 const FX_CHAR* defchar) {
23 if (destcp == 0) {
24 if (unicode < 0x80) {
25 return CFX_ByteString((char)unicode);
26 }
27 const FX_CHAR* altstr = FCS_GetAltStr(unicode);
28 return CFX_ByteString(altstr ? altstr : defchar);
29 }
30 char buf[10];
31 int iDef = 0;
32 int ret = FXSYS_WideCharToMultiByte(destcp, 0, (wchar_t*)&unicode, 1, buf, 10,
33 NULL, &iDef);
34 if (ret && !iDef) {
35 return CFX_ByteString(buf, ret);
36 }
37 const FX_CHAR* altstr = FCS_GetAltStr(unicode);
38 return CFX_ByteString(altstr ? altstr : defchar);
39 }
CTextPage()40 CTextPage::CTextPage() {}
~CTextPage()41 CTextPage::~CTextPage() {
42 int i;
43 for (i = 0; i < m_BaseLines.GetSize(); i++) {
44 delete m_BaseLines.GetAt(i);
45 }
46 for (i = 0; i < m_TextColumns.GetSize(); i++) {
47 delete m_TextColumns.GetAt(i);
48 }
49 }
ProcessObject(CPDF_PageObject * pObject)50 void CTextPage::ProcessObject(CPDF_PageObject* pObject) {
51 if (pObject->m_Type != PDFPAGE_TEXT) {
52 return;
53 }
54 CPDF_TextObject* pText = (CPDF_TextObject*)pObject;
55 CPDF_Font* pFont = pText->m_TextState.GetFont();
56 int count = pText->CountItems();
57 FX_FLOAT* pPosArray = FX_Alloc2D(FX_FLOAT, count, 2);
58 pText->CalcCharPos(pPosArray);
59
60 FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH();
61 FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV();
62 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
63 FX_FLOAT spacew = 0;
64 if (space_charcode != -1) {
65 spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
66 }
67 if (spacew == 0) {
68 spacew = fontsize_h / 4;
69 }
70 if (pText->m_TextState.GetBaselineAngle() != 0) {
71 int cc = 0;
72 CFX_Matrix matrix;
73 pText->GetTextMatrix(&matrix);
74 for (int i = 0; i < pText->m_nChars; i++) {
75 FX_DWORD charcode = pText->m_nChars == 1
76 ? (FX_DWORD)(uintptr_t)pText->m_pCharCodes
77 : pText->m_pCharCodes[i];
78 if (charcode == (FX_DWORD)-1) {
79 continue;
80 }
81 FX_RECT char_box;
82 pFont->GetCharBBox(charcode, char_box);
83 FX_FLOAT char_left =
84 pPosArray ? pPosArray[cc * 2]
85 : char_box.left * pText->m_TextState.GetFontSize() / 1000;
86 FX_FLOAT char_right =
87 pPosArray ? pPosArray[cc * 2 + 1]
88 : char_box.right * pText->m_TextState.GetFontSize() / 1000;
89 FX_FLOAT char_top =
90 char_box.top * pText->m_TextState.GetFontSize() / 1000;
91 FX_FLOAT char_bottom =
92 char_box.bottom * pText->m_TextState.GetFontSize() / 1000;
93 cc++;
94 FX_FLOAT char_origx, char_origy;
95 matrix.Transform(char_left, 0, char_origx, char_origy);
96 matrix.TransformRect(char_left, char_right, char_top, char_bottom);
97 CFX_ByteString str;
98 pFont->AppendChar(str, charcode);
99 InsertTextBox(NULL, char_origy, char_left, char_right, char_top,
100 char_bottom, spacew, fontsize_v, str, pFont);
101 }
102 FX_Free(pPosArray);
103 return;
104 }
105 FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize();
106 for (int ii = 0; ii < count * 2; ii++) {
107 pPosArray[ii] *= ratio_h;
108 }
109 FX_FLOAT baseline = pText->m_PosY;
110 CTextBaseLine* pBaseLine = NULL;
111 FX_FLOAT topy = pText->m_Top;
112 FX_FLOAT bottomy = pText->m_Bottom;
113 FX_FLOAT leftx = pText->m_Left;
114 int cc = 0;
115 CFX_ByteString segment;
116 int space_count = 0;
117 FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0;
118 for (int i = 0; i < pText->m_nChars; i++) {
119 FX_DWORD charcode = pText->m_nChars == 1
120 ? (FX_DWORD)(uintptr_t)pText->m_pCharCodes
121 : pText->m_pCharCodes[i];
122 if (charcode == (FX_DWORD)-1) {
123 continue;
124 }
125 FX_FLOAT char_left = pPosArray[cc * 2];
126 FX_FLOAT char_right = pPosArray[cc * 2 + 1];
127 cc++;
128 if (char_left < last_left || (char_left - last_right) > spacew / 2) {
129 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left,
130 leftx + segment_right, topy, bottomy, spacew,
131 fontsize_v, segment, pFont);
132 segment_left = char_left;
133 segment = "";
134 }
135 if (space_count > 1) {
136 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left,
137 leftx + segment_right, topy, bottomy, spacew,
138 fontsize_v, segment, pFont);
139 segment = "";
140 } else if (space_count == 1) {
141 pFont->AppendChar(segment, ' ');
142 }
143 if (segment.GetLength() == 0) {
144 segment_left = char_left;
145 }
146 segment_right = char_right;
147 pFont->AppendChar(segment, charcode);
148 space_count = 0;
149 last_left = char_left;
150 last_right = char_right;
151 }
152 if (segment.GetLength())
153 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left,
154 leftx + segment_right, topy, bottomy, spacew,
155 fontsize_v, segment, pFont);
156 FX_Free(pPosArray);
157 }
InsertTextBox(CTextBaseLine * pBaseLine,FX_FLOAT basey,FX_FLOAT leftx,FX_FLOAT rightx,FX_FLOAT topy,FX_FLOAT bottomy,FX_FLOAT spacew,FX_FLOAT fontsize_v,CFX_ByteString & str,CPDF_Font * pFont)158 CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine,
159 FX_FLOAT basey,
160 FX_FLOAT leftx,
161 FX_FLOAT rightx,
162 FX_FLOAT topy,
163 FX_FLOAT bottomy,
164 FX_FLOAT spacew,
165 FX_FLOAT fontsize_v,
166 CFX_ByteString& str,
167 CPDF_Font* pFont) {
168 if (str.GetLength() == 0) {
169 return NULL;
170 }
171 if (!pBaseLine) {
172 int i;
173 for (i = 0; i < m_BaseLines.GetSize(); i++) {
174 CTextBaseLine* pExistLine = m_BaseLines.GetAt(i);
175 if (pExistLine->m_BaseLine == basey) {
176 pBaseLine = pExistLine;
177 break;
178 }
179 if (pExistLine->m_BaseLine < basey) {
180 break;
181 }
182 }
183 if (!pBaseLine) {
184 pBaseLine = new CTextBaseLine;
185 pBaseLine->m_BaseLine = basey;
186 m_BaseLines.InsertAt(i, pBaseLine);
187 }
188 }
189 CFX_WideString text;
190 const FX_CHAR* pStr = str;
191 int len = str.GetLength(), offset = 0;
192 while (offset < len) {
193 FX_DWORD ch = pFont->GetNextChar(pStr, len, offset);
194 CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch);
195 if (unicode_str.IsEmpty()) {
196 text += (FX_WCHAR)ch;
197 } else {
198 text += unicode_str;
199 }
200 }
201 pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v,
202 text);
203 return pBaseLine;
204 }
WriteOutput(CFX_WideStringArray & lines,int iMinWidth)205 void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth) {
206 FX_FLOAT lastheight = -1;
207 FX_FLOAT lastbaseline = -1;
208 FX_FLOAT MinLeftX = 1000000;
209 FX_FLOAT MaxRightX = 0;
210 int i;
211 for (i = 0; i < m_BaseLines.GetSize(); i++) {
212 CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
213 FX_FLOAT leftx, rightx;
214 if (pBaseLine->GetWidth(leftx, rightx)) {
215 if (leftx < MinLeftX) {
216 MinLeftX = leftx;
217 }
218 if (rightx > MaxRightX) {
219 MaxRightX = rightx;
220 }
221 }
222 }
223 for (i = 0; i < m_BaseLines.GetSize(); i++) {
224 m_BaseLines.GetAt(i)->MergeBoxes();
225 }
226 for (i = 1; i < m_BaseLines.GetSize(); i++) {
227 CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
228 CTextBaseLine* pPrevLine = m_BaseLines.GetAt(i - 1);
229 if (pBaseLine->CanMerge(pPrevLine)) {
230 pPrevLine->Merge(pBaseLine);
231 delete pBaseLine;
232 m_BaseLines.RemoveAt(i);
233 i--;
234 }
235 }
236 if (m_bAutoWidth) {
237 int* widths = FX_Alloc(int, m_BaseLines.GetSize());
238 for (i = 0; i < m_BaseLines.GetSize(); i++) {
239 widths[i] = 0;
240 CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
241 int TotalChars = 0;
242 FX_FLOAT TotalWidth = 0;
243 int minchars;
244 pBaseLine->CountChars(TotalChars, TotalWidth, minchars);
245 if (TotalChars) {
246 FX_FLOAT charwidth = TotalWidth / TotalChars;
247 widths[i] = (int)((MaxRightX - MinLeftX) / charwidth);
248 }
249 if (widths[i] > 1000) {
250 widths[i] = 1000;
251 }
252 if (widths[i] < minchars) {
253 widths[i] = minchars;
254 }
255 }
256 int AvgWidth = 0, widthcount = 0;
257 for (i = 0; i < m_BaseLines.GetSize(); i++)
258 if (widths[i]) {
259 AvgWidth += widths[i];
260 widthcount++;
261 }
262 AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5);
263 int MaxWidth = 0;
264 for (i = 0; i < m_BaseLines.GetSize(); i++)
265 if (MaxWidth < widths[i]) {
266 MaxWidth = widths[i];
267 }
268 if (MaxWidth > AvgWidth * 6 / 5) {
269 MaxWidth = AvgWidth * 6 / 5;
270 }
271 FX_Free(widths);
272 if (iMinWidth < MaxWidth) {
273 iMinWidth = MaxWidth;
274 }
275 }
276 for (i = 0; i < m_BaseLines.GetSize(); i++) {
277 m_BaseLines.GetAt(i)->MergeBoxes();
278 }
279 if (m_bKeepColumn) {
280 FindColumns();
281 }
282 for (i = 0; i < m_BaseLines.GetSize(); i++) {
283 CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
284 if (lastheight >= 0) {
285 FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine;
286 if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) {
287 lines.Add(L"");
288 }
289 }
290 lastheight = pBaseLine->m_MaxFontSizeV;
291 lastbaseline = pBaseLine->m_BaseLine;
292 CFX_WideString str;
293 pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth);
294 lines.Add(str);
295 }
296 }
NormalizeCompositeChar(FX_WCHAR wChar,CFX_WideString & sDest)297 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest) {
298 wChar = FX_GetMirrorChar(wChar, TRUE, FALSE);
299 FX_WCHAR* pDst = NULL;
300 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
301 if (nCount < 1) {
302 sDest += wChar;
303 return;
304 }
305 pDst = new FX_WCHAR[nCount];
306 FX_Unicode_GetNormalization(wChar, pDst);
307 for (int nIndex = 0; nIndex < nCount; nIndex++) {
308 sDest += pDst[nIndex];
309 }
310 delete[] pDst;
311 }
NormalizeString(CFX_WideString & str)312 void NormalizeString(CFX_WideString& str) {
313 if (str.GetLength() <= 0) {
314 return;
315 }
316 CFX_WideString sBuffer;
317 std::unique_ptr<CFX_BidiChar> pBidiChar(new CFX_BidiChar);
318 CFX_WordArray order;
319 FX_BOOL bR2L = FALSE;
320 int32_t start = 0, count = 0, i = 0;
321 int nR2L = 0, nL2R = 0;
322 for (i = 0; i < str.GetLength(); i++) {
323 if (pBidiChar->AppendChar(str.GetAt(i))) {
324 CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
325 order.Add(start);
326 order.Add(count);
327 order.Add(ret);
328 if (!bR2L) {
329 if (ret == CFX_BidiChar::RIGHT) {
330 nR2L++;
331 } else if (ret == CFX_BidiChar::LEFT) {
332 nL2R++;
333 }
334 }
335 }
336 }
337 if (pBidiChar->EndChar()) {
338 CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
339 order.Add(start);
340 order.Add(count);
341 order.Add(ret);
342 if (!bR2L) {
343 if (ret == CFX_BidiChar::RIGHT) {
344 nR2L++;
345 } else if (ret == CFX_BidiChar::LEFT) {
346 nL2R++;
347 }
348 }
349 }
350 if (nR2L > 0 && nR2L >= nL2R) {
351 bR2L = TRUE;
352 }
353 if (bR2L) {
354 int count = order.GetSize();
355 for (int j = count - 1; j > 0; j -= 3) {
356 int ret = order.GetAt(j);
357 int start = order.GetAt(j - 2);
358 int count1 = order.GetAt(j - 1);
359 if (ret == 2 || ret == 0) {
360 for (int i = start + count1 - 1; i >= start; i--) {
361 NormalizeCompositeChar(str[i], sBuffer);
362 }
363 } else {
364 i = j;
365 FX_BOOL bSymbol = FALSE;
366 while (i > 0 && order.GetAt(i) != 2) {
367 bSymbol = !order.GetAt(i);
368 i -= 3;
369 }
370 int end = start + count1;
371 int n = 0;
372 if (bSymbol) {
373 n = i + 6;
374 } else {
375 n = i + 3;
376 }
377 if (n >= j) {
378 for (int m = start; m < end; m++) {
379 sBuffer += str[m];
380 }
381 } else {
382 i = j;
383 j = n;
384 for (; n <= i; n += 3) {
385 int start = order.GetAt(n - 2);
386 int count1 = order.GetAt(n - 1);
387 int end = start + count1;
388 for (int m = start; m < end; m++) {
389 sBuffer += str[m];
390 }
391 }
392 }
393 }
394 }
395 } else {
396 int count = order.GetSize();
397 FX_BOOL bL2R = FALSE;
398 for (int j = 0; j < count; j += 3) {
399 int ret = order.GetAt(j + 2);
400 int start = order.GetAt(j);
401 int count1 = order.GetAt(j + 1);
402 if (ret == 2 || (j == 0 && ret == 0 && !bL2R)) {
403 int i = j + 3;
404 while (bR2L && i < count) {
405 if (order.GetAt(i + 2) == 1) {
406 break;
407 } else {
408 i += 3;
409 }
410 }
411 if (i == 3) {
412 j = -3;
413 bL2R = TRUE;
414 continue;
415 }
416 int end = str.GetLength() - 1;
417 if (i < count) {
418 end = order.GetAt(i) - 1;
419 }
420 j = i - 3;
421 for (int n = end; n >= start; n--) {
422 NormalizeCompositeChar(str[i], sBuffer);
423 }
424 } else {
425 int end = start + count1;
426 for (int i = start; i < end; i++) {
427 sBuffer += str[i];
428 }
429 }
430 }
431 }
432 str.Empty();
433 str += sBuffer;
434 }
IsNumber(CFX_WideString & str)435 static FX_BOOL IsNumber(CFX_WideString& str) {
436 for (int i = 0; i < str.GetLength(); i++) {
437 FX_WCHAR ch = str[i];
438 // TODO(dsinclair): --.+ +.-- should probably not be a number.
439 if (!std::iswdigit(ch) && ch != '-' && ch != '+' && ch != '.' && ch != ' ')
440 return FALSE;
441 }
442 return TRUE;
443 }
FindColumns()444 void CTextPage::FindColumns() {
445 int i;
446 for (i = 0; i < m_BaseLines.GetSize(); i++) {
447 CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
448 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j++) {
449 CTextBox* pTextBox = pBaseLine->m_TextList.GetAt(j);
450 CTextColumn* pColumn = FindColumn(pTextBox->m_Right);
451 if (pColumn) {
452 pColumn->m_AvgPos =
453 (pColumn->m_Count * pColumn->m_AvgPos + pTextBox->m_Right) /
454 (pColumn->m_Count + 1);
455 pColumn->m_Count++;
456 } else {
457 pColumn = new CTextColumn;
458 pColumn->m_Count = 1;
459 pColumn->m_AvgPos = pTextBox->m_Right;
460 pColumn->m_TextPos = -1;
461 m_TextColumns.Add(pColumn);
462 }
463 }
464 }
465 int mincount = m_BaseLines.GetSize() / 4;
466 for (i = 0; i < m_TextColumns.GetSize(); i++) {
467 CTextColumn* pTextColumn = m_TextColumns.GetAt(i);
468 if (pTextColumn->m_Count >= mincount) {
469 continue;
470 }
471 delete pTextColumn;
472 m_TextColumns.RemoveAt(i);
473 i--;
474 }
475 for (i = 0; i < m_BaseLines.GetSize(); i++) {
476 CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
477 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j++) {
478 CTextBox* pTextBox = pBaseLine->m_TextList.GetAt(j);
479 if (IsNumber(pTextBox->m_Text)) {
480 pTextBox->m_pColumn = FindColumn(pTextBox->m_Right);
481 }
482 }
483 }
484 }
FindColumn(FX_FLOAT xpos)485 CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos) {
486 for (int i = 0; i < m_TextColumns.GetSize(); i++) {
487 CTextColumn* pColumn = m_TextColumns.GetAt(i);
488 if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) {
489 return pColumn;
490 }
491 }
492 return NULL;
493 }
BreakSpace(CPDF_TextObject * pTextObj)494 void CTextPage::BreakSpace(CPDF_TextObject* pTextObj) {}
CTextBaseLine()495 CTextBaseLine::CTextBaseLine() {
496 m_Top = -100000;
497 m_Bottom = 100000;
498 m_MaxFontSizeV = 0;
499 }
~CTextBaseLine()500 CTextBaseLine::~CTextBaseLine() {
501 for (int i = 0; i < m_TextList.GetSize(); i++) {
502 delete m_TextList.GetAt(i);
503 }
504 }
InsertTextBox(FX_FLOAT leftx,FX_FLOAT rightx,FX_FLOAT topy,FX_FLOAT bottomy,FX_FLOAT spacew,FX_FLOAT fontsize_v,const CFX_WideString & text)505 void CTextBaseLine::InsertTextBox(FX_FLOAT leftx,
506 FX_FLOAT rightx,
507 FX_FLOAT topy,
508 FX_FLOAT bottomy,
509 FX_FLOAT spacew,
510 FX_FLOAT fontsize_v,
511 const CFX_WideString& text) {
512 if (m_Top < topy) {
513 m_Top = topy;
514 }
515 if (m_Bottom > bottomy) {
516 m_Bottom = bottomy;
517 }
518 if (m_MaxFontSizeV < fontsize_v) {
519 m_MaxFontSizeV = fontsize_v;
520 }
521 int i;
522 for (i = 0; i < m_TextList.GetSize(); i++) {
523 CTextBox* pText = m_TextList.GetAt(i);
524 if (pText->m_Left > leftx) {
525 break;
526 }
527 }
528 CTextBox* pText = new CTextBox;
529 pText->m_Text = text;
530 pText->m_Left = leftx;
531 pText->m_Right = rightx;
532 pText->m_Top = topy;
533 pText->m_Bottom = bottomy;
534 pText->m_SpaceWidth = spacew;
535 pText->m_FontSizeV = fontsize_v;
536 pText->m_pColumn = NULL;
537 m_TextList.InsertAt(i, pText);
538 }
539 FX_BOOL GetIntersection(FX_FLOAT low1,
540 FX_FLOAT high1,
541 FX_FLOAT low2,
542 FX_FLOAT high2,
543 FX_FLOAT& interlow,
544 FX_FLOAT& interhigh);
CanMerge(CTextBaseLine * pOther)545 FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther) {
546 FX_FLOAT inter_top, inter_bottom;
547 if (!GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top,
548 inter_bottom, inter_top)) {
549 return FALSE;
550 }
551 FX_FLOAT inter_h = inter_top - inter_bottom;
552 if (inter_h < (m_Top - m_Bottom) / 2 &&
553 inter_h < (pOther->m_Top - pOther->m_Bottom) / 2) {
554 return FALSE;
555 }
556 FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine);
557 for (int i = 0; i < m_TextList.GetSize(); i++) {
558 CTextBox* pText = m_TextList.GetAt(i);
559 for (int j = 0; j < pOther->m_TextList.GetSize(); j++) {
560 CTextBox* pOtherText = pOther->m_TextList.GetAt(j);
561 FX_FLOAT inter_left, inter_right;
562 if (!GetIntersection(pText->m_Left, pText->m_Right, pOtherText->m_Left,
563 pOtherText->m_Right, inter_left, inter_right)) {
564 continue;
565 }
566 FX_FLOAT inter_w = inter_right - inter_left;
567 if (inter_w < pText->m_SpaceWidth / 2 &&
568 inter_w < pOtherText->m_SpaceWidth / 2) {
569 continue;
570 }
571 if (dy >= (pText->m_Bottom - pText->m_Top) / 2 ||
572 dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) {
573 return FALSE;
574 }
575 }
576 }
577 return TRUE;
578 }
Merge(CTextBaseLine * pOther)579 void CTextBaseLine::Merge(CTextBaseLine* pOther) {
580 for (int i = 0; i < pOther->m_TextList.GetSize(); i++) {
581 CTextBox* pText = pOther->m_TextList.GetAt(i);
582 InsertTextBox(pText->m_Left, pText->m_Right, pText->m_Top, pText->m_Bottom,
583 pText->m_SpaceWidth, pText->m_FontSizeV, pText->m_Text);
584 }
585 }
GetWidth(FX_FLOAT & leftx,FX_FLOAT & rightx)586 FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx) {
587 int i;
588 for (i = 0; i < m_TextList.GetSize(); i++) {
589 CTextBox* pText = m_TextList.GetAt(i);
590 if (pText->m_Text != L" ") {
591 break;
592 }
593 }
594 if (i == m_TextList.GetSize()) {
595 return FALSE;
596 }
597 CTextBox* pText = m_TextList.GetAt(i);
598 leftx = pText->m_Left;
599 for (i = m_TextList.GetSize() - 1; i >= 0; i--) {
600 CTextBox* pText = m_TextList.GetAt(i);
601 if (pText->m_Text != L" ") {
602 break;
603 }
604 }
605 pText = m_TextList.GetAt(i);
606 rightx = pText->m_Right;
607 return TRUE;
608 }
MergeBoxes()609 void CTextBaseLine::MergeBoxes() {
610 int i = 0;
611 while (1) {
612 if (i >= m_TextList.GetSize() - 1) {
613 break;
614 }
615 CTextBox* pThisText = m_TextList.GetAt(i);
616 CTextBox* pNextText = m_TextList.GetAt(i + 1);
617 FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right;
618 FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0)
619 ? pNextText->m_SpaceWidth
620 : pThisText->m_SpaceWidth;
621 if (spacew > 0.0 && dx < spacew * 2) {
622 pThisText->m_Right = pNextText->m_Right;
623 if (dx > spacew * 1.5) {
624 pThisText->m_Text += L" ";
625 } else if (dx > spacew / 3) {
626 pThisText->m_Text += L' ';
627 }
628 pThisText->m_Text += pNextText->m_Text;
629 pThisText->m_SpaceWidth =
630 pNextText->m_SpaceWidth == 0.0 ? spacew : pNextText->m_SpaceWidth;
631 m_TextList.RemoveAt(i + 1);
632 delete pNextText;
633 } else {
634 i++;
635 }
636 }
637 }
WriteOutput(CFX_WideString & str,FX_FLOAT leftx,FX_FLOAT pagewidth,int iTextWidth)638 void CTextBaseLine::WriteOutput(CFX_WideString& str,
639 FX_FLOAT leftx,
640 FX_FLOAT pagewidth,
641 int iTextWidth) {
642 int lastpos = -1;
643 for (int i = 0; i < m_TextList.GetSize(); i++) {
644 CTextBox* pText = m_TextList.GetAt(i);
645 int xpos;
646 if (pText->m_pColumn) {
647 xpos =
648 (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pagewidth +
649 0.5);
650 xpos -= pText->m_Text.GetLength();
651 } else {
652 xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5);
653 }
654 if (xpos <= lastpos) {
655 xpos = lastpos + 1;
656 }
657 for (int j = lastpos + 1; j < xpos; j++) {
658 str += ' ';
659 }
660 CFX_WideString sSrc(pText->m_Text);
661 NormalizeString(sSrc);
662 str += sSrc;
663 str += ' ';
664 lastpos = xpos + pText->m_Text.GetLength();
665 }
666 }
CountChars(int & count,FX_FLOAT & width,int & minchars)667 void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars) {
668 minchars = 0;
669 for (int i = 0; i < m_TextList.GetSize(); i++) {
670 CTextBox* pText = m_TextList.GetAt(i);
671 if (pText->m_Right - pText->m_Left < 0.002) {
672 continue;
673 }
674 count += pText->m_Text.GetLength();
675 width += pText->m_Right - pText->m_Left;
676 minchars += pText->m_Text.GetLength() + 1;
677 }
678 }
679 #define PI 3.1415926535897932384626433832795
CheckRotate(CPDF_Page & page,CFX_FloatRect & page_bbox)680 static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox) {
681 int total_count = 0, rotated_count[3] = {0, 0, 0};
682 FX_POSITION pos = page.GetFirstObjectPosition();
683 while (pos) {
684 CPDF_PageObject* pObj = page.GetNextObject(pos);
685 if (pObj->m_Type != PDFPAGE_TEXT) {
686 continue;
687 }
688 total_count++;
689 CPDF_TextObject* pText = (CPDF_TextObject*)pObj;
690 FX_FLOAT angle = pText->m_TextState.GetBaselineAngle();
691 if (angle == 0.0) {
692 continue;
693 }
694 int degree = (int)(angle * 180 / PI + 0.5);
695 if (degree % 90) {
696 continue;
697 }
698 if (degree < 0) {
699 degree += 360;
700 }
701 int index = degree / 90 % 3 - 1;
702 if (index < 0) {
703 continue;
704 }
705 rotated_count[index]++;
706 }
707 if (total_count == 0) {
708 return;
709 }
710 CFX_Matrix matrix;
711 if (rotated_count[0] > total_count * 2 / 3) {
712 matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight());
713 } else if (rotated_count[1] > total_count * 2 / 3) {
714 matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight());
715 } else if (rotated_count[2] > total_count * 2 / 3) {
716 matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0);
717 } else {
718 return;
719 }
720 page.Transform(matrix);
721 page_bbox.Transform(&matrix);
722 }
PDF_GetPageText_Unicode(CFX_WideStringArray & lines,CPDF_Document * pDoc,CPDF_Dictionary * pPage,int iMinWidth,FX_DWORD flags)723 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines,
724 CPDF_Document* pDoc,
725 CPDF_Dictionary* pPage,
726 int iMinWidth,
727 FX_DWORD flags) {
728 lines.RemoveAll();
729 if (!pPage) {
730 return;
731 }
732 CPDF_Page page;
733 page.Load(pDoc, pPage);
734 CPDF_ParseOptions options;
735 options.m_bTextOnly = TRUE;
736 options.m_bSeparateForm = FALSE;
737 page.ParseContent(&options);
738 CFX_FloatRect page_bbox = page.GetPageBBox();
739 if (flags & PDF2TXT_AUTO_ROTATE) {
740 CheckRotate(page, page_bbox);
741 }
742 CTextPage texts;
743 texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH;
744 texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN;
745 texts.m_bBreakSpace = TRUE;
746 FX_POSITION pos = page.GetFirstObjectPosition();
747 while (pos) {
748 CPDF_PageObject* pObject = page.GetNextObject(pos);
749 if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) {
750 CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Right,
751 pObject->m_Top);
752 if (!page_bbox.Contains(rect)) {
753 continue;
754 }
755 }
756 texts.ProcessObject(pObject);
757 }
758 texts.WriteOutput(lines, iMinWidth);
759 }
PDF_GetPageText(CFX_ByteStringArray & lines,CPDF_Document * pDoc,CPDF_Dictionary * pPage,int iMinWidth,FX_DWORD flags)760 void PDF_GetPageText(CFX_ByteStringArray& lines,
761 CPDF_Document* pDoc,
762 CPDF_Dictionary* pPage,
763 int iMinWidth,
764 FX_DWORD flags) {
765 lines.RemoveAll();
766 CFX_WideStringArray wlines;
767 PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags);
768 for (int i = 0; i < wlines.GetSize(); i++) {
769 CFX_WideString wstr = wlines[i];
770 CFX_ByteString str;
771 for (int c = 0; c < wstr.GetLength(); c++) {
772 str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?");
773 }
774 lines.Add(str);
775 }
776 }
PDF_GetTextStream_Unicode(CFX_WideTextBuf & buffer,CPDF_Document * pDoc,CPDF_Dictionary * pPage,FX_DWORD flags)777 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer,
778 CPDF_Document* pDoc,
779 CPDF_Dictionary* pPage,
780 FX_DWORD flags) {
781 buffer.EstimateSize(0, 10240);
782 CPDF_Page page;
783 page.Load(pDoc, pPage);
784 CPDF_ParseOptions options;
785 options.m_bTextOnly = TRUE;
786 options.m_bSeparateForm = FALSE;
787 page.ParseContent(&options);
788 GetTextStream_Unicode(buffer, &page, TRUE, NULL);
789 }
790