1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "../../include/fpdfapi/fpdf_page.h"
8 #include "../../include/fpdfapi/fpdf_pageobj.h"
9 #include "../../include/fpdftext/fpdf_text.h"
10 #include "txtproc.h"
11 #include "text_int.h"
12 extern FX_LPCSTR FCS_GetAltStr(FX_WCHAR);
CharFromUnicodeAlt(FX_WCHAR unicode,int destcp,FX_LPCSTR defchar)13 CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode, int destcp, FX_LPCSTR defchar)
14 {
15 if (destcp == 0) {
16 if (unicode < 0x80) {
17 return CFX_ByteString((char)unicode);
18 }
19 FX_LPCSTR altstr = FCS_GetAltStr(unicode);
20 if (altstr) {
21 return CFX_ByteString(altstr, -1);
22 }
23 return CFX_ByteString(defchar, -1);
24 }
25 FX_BOOL bDef = FALSE;
26 char buf[10];
27 int ret = FXSYS_WideCharToMultiByte(destcp, 0, (wchar_t*)&unicode, 1, buf, 10, NULL, &bDef);
28 if (ret && !bDef) {
29 return CFX_ByteString(buf, ret);
30 }
31 FX_LPCSTR altstr = FCS_GetAltStr(unicode);
32 if (altstr) {
33 return CFX_ByteString(altstr, -1);
34 }
35 return CFX_ByteString(defchar, -1);
36 }
CTextPage()37 CTextPage::CTextPage()
38 {
39 }
~CTextPage()40 CTextPage::~CTextPage()
41 {
42 int i;
43 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
44 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
45 delete pBaseLine;
46 }
47 for (i = 0; i < m_TextColumns.GetSize(); i ++) {
48 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i);
49 delete pTextColumn;
50 }
51 }
ProcessObject(CPDF_PageObject * pObject)52 void CTextPage::ProcessObject(CPDF_PageObject* pObject)
53 {
54 if (pObject->m_Type != PDFPAGE_TEXT) {
55 return;
56 }
57 CPDF_TextObject* pText = (CPDF_TextObject*)pObject;
58 CPDF_Font* pFont = pText->m_TextState.GetFont();
59 int count = pText->CountItems();
60 FX_FLOAT* pPosArray = FX_Alloc2D(FX_FLOAT, count, 2);
61 pText->CalcCharPos(pPosArray);
62
63 FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH();
64 FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV();
65 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
66 FX_FLOAT spacew = 0;
67 if (space_charcode != -1) {
68 spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
69 }
70 if (spacew == 0) {
71 spacew = fontsize_h / 4;
72 }
73 if (pText->m_TextState.GetBaselineAngle() != 0) {
74 int cc = 0;
75 CFX_AffineMatrix matrix;
76 pText->GetTextMatrix(&matrix);
77 for (int i = 0; i < pText->m_nChars; i ++) {
78 FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i];
79 if (charcode == (FX_DWORD) - 1) {
80 continue;
81 }
82 FX_RECT char_box;
83 pFont->GetCharBBox(charcode, char_box);
84 FX_FLOAT char_left = pPosArray ? pPosArray[cc * 2] : char_box.left * pText->m_TextState.GetFontSize() / 1000;
85 FX_FLOAT char_right = pPosArray ? pPosArray[cc * 2 + 1] : char_box.right * pText->m_TextState.GetFontSize() / 1000;
86 FX_FLOAT char_top = char_box.top * pText->m_TextState.GetFontSize() / 1000;
87 FX_FLOAT char_bottom = char_box.bottom * pText->m_TextState.GetFontSize() / 1000;
88 cc ++;
89 FX_FLOAT char_origx, char_origy;
90 matrix.Transform(char_left, 0, char_origx, char_origy);
91 matrix.TransformRect(char_left, char_right, char_top, char_bottom);
92 CFX_ByteString str;
93 pFont->AppendChar(str, charcode);
94 InsertTextBox(NULL, char_origy, char_left, char_right, char_top,
95 char_bottom, spacew, fontsize_v, str, pFont);
96 }
97 if (pPosArray) {
98 FX_Free(pPosArray);
99 }
100 return;
101 }
102 FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize();
103 for (int ii = 0; ii < count * 2; ii ++) {
104 pPosArray[ii] *= ratio_h;
105 }
106 FX_FLOAT baseline = pText->m_PosY;
107 CTextBaseLine* pBaseLine = NULL;
108 FX_FLOAT topy = pText->m_Top;
109 FX_FLOAT bottomy = pText->m_Bottom;
110 FX_FLOAT leftx = pText->m_Left;
111 int cc = 0;
112 CFX_ByteString segment;
113 int space_count = 0;
114 FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0;
115 for (int i = 0; i < pText->m_nChars; i ++) {
116 FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i];
117 if (charcode == (FX_DWORD) - 1) {
118 continue;
119 }
120 FX_FLOAT char_left = pPosArray[cc * 2];
121 FX_FLOAT char_right = pPosArray[cc * 2 + 1];
122 cc ++;
123 if (char_left < last_left || (char_left - last_right) > spacew / 2) {
124 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
125 topy, bottomy, spacew, fontsize_v, segment, pFont);
126 segment_left = char_left;
127 segment = "";
128 }
129 if (space_count > 1) {
130 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
131 topy, bottomy, spacew, fontsize_v, segment, pFont);
132 segment = "";
133 } else if (space_count == 1) {
134 pFont->AppendChar(segment, ' ');
135 }
136 if (segment.GetLength() == 0) {
137 segment_left = char_left;
138 }
139 segment_right = char_right;
140 pFont->AppendChar(segment, charcode);
141 space_count = 0;
142 last_left = char_left;
143 last_right = char_right;
144 }
145 if (segment.GetLength())
146 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
147 topy, bottomy, spacew, fontsize_v, segment, pFont);
148 FX_Free(pPosArray);
149 }
InsertTextBox(CTextBaseLine * pBaseLine,FX_FLOAT basey,FX_FLOAT leftx,FX_FLOAT rightx,FX_FLOAT topy,FX_FLOAT bottomy,FX_FLOAT spacew,FX_FLOAT fontsize_v,CFX_ByteString & str,CPDF_Font * pFont)150 CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine, FX_FLOAT basey, FX_FLOAT leftx,
151 FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy, FX_FLOAT spacew, FX_FLOAT fontsize_v,
152 CFX_ByteString& str, CPDF_Font* pFont)
153 {
154 if (str.GetLength() == 0) {
155 return NULL;
156 }
157 if (pBaseLine == NULL) {
158 int i;
159 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
160 CTextBaseLine* pExistLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
161 if (pExistLine->m_BaseLine == basey) {
162 pBaseLine = pExistLine;
163 break;
164 }
165 if (pExistLine->m_BaseLine < basey) {
166 break;
167 }
168 }
169 if (pBaseLine == NULL) {
170 pBaseLine = new CTextBaseLine;
171 pBaseLine->m_BaseLine = basey;
172 m_BaseLines.InsertAt(i, pBaseLine);
173 }
174 }
175 CFX_WideString text;
176 FX_LPCSTR pStr = str;
177 int len = str.GetLength(), offset = 0;
178 while (offset < len) {
179 FX_DWORD ch = pFont->GetNextChar(pStr, len, offset);
180 CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch);
181 if (unicode_str.IsEmpty()) {
182 text += (FX_WCHAR)ch;
183 }
184 else {
185 text += unicode_str;
186 }
187 }
188 pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v, text);
189 return pBaseLine;
190 }
WriteOutput(CFX_WideStringArray & lines,int iMinWidth)191 void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth)
192 {
193 FX_FLOAT lastheight = -1;
194 FX_FLOAT lastbaseline = -1;
195 FX_FLOAT MinLeftX = 1000000;
196 FX_FLOAT MaxRightX = 0;
197 int i;
198 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
199 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
200 FX_FLOAT leftx, rightx;
201 if (pBaseLine->GetWidth(leftx, rightx)) {
202 if (leftx < MinLeftX) {
203 MinLeftX = leftx;
204 }
205 if (rightx > MaxRightX) {
206 MaxRightX = rightx;
207 }
208 }
209 }
210 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
211 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
212 pBaseLine->MergeBoxes();
213 }
214 for (i = 1; i < m_BaseLines.GetSize(); i ++) {
215 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
216 CTextBaseLine* pPrevLine = (CTextBaseLine*)m_BaseLines.GetAt(i - 1);
217 if (pBaseLine->CanMerge(pPrevLine)) {
218 pPrevLine->Merge(pBaseLine);
219 delete pBaseLine;
220 m_BaseLines.RemoveAt(i);
221 i --;
222 }
223 }
224 if (m_bAutoWidth) {
225 int* widths = FX_Alloc(int, m_BaseLines.GetSize());
226 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
227 widths[i] = 0;
228 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
229 int TotalChars = 0;
230 FX_FLOAT TotalWidth = 0;
231 int minchars;
232 pBaseLine->CountChars(TotalChars, TotalWidth, minchars);
233 if (TotalChars) {
234 FX_FLOAT charwidth = TotalWidth / TotalChars;
235 widths[i] = (int)((MaxRightX - MinLeftX) / charwidth);
236 }
237 if (widths[i] > 1000) {
238 widths[i] = 1000;
239 }
240 if (widths[i] < minchars) {
241 widths[i] = minchars;
242 }
243 }
244 int AvgWidth = 0, widthcount = 0;
245 for (i = 0; i < m_BaseLines.GetSize(); i ++)
246 if (widths[i]) {
247 AvgWidth += widths[i];
248 widthcount ++;
249 }
250 AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5);
251 int MaxWidth = 0;
252 for (i = 0; i < m_BaseLines.GetSize(); i ++)
253 if (MaxWidth < widths[i]) {
254 MaxWidth = widths[i];
255 }
256 if (MaxWidth > AvgWidth * 6 / 5) {
257 MaxWidth = AvgWidth * 6 / 5;
258 }
259 FX_Free(widths);
260 if (iMinWidth < MaxWidth) {
261 iMinWidth = MaxWidth;
262 }
263 }
264 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
265 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
266 pBaseLine->MergeBoxes();
267 }
268 if (m_bKeepColumn) {
269 FindColumns();
270 }
271 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
272 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
273 if (lastheight >= 0) {
274 FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine;
275 if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) {
276 lines.Add(L"");
277 }
278 }
279 lastheight = pBaseLine->m_MaxFontSizeV;
280 lastbaseline = pBaseLine->m_BaseLine;
281 CFX_WideString str;
282 pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth);
283 lines.Add(str);
284 }
285 }
NormalizeCompositeChar(FX_WCHAR wChar,CFX_WideString & sDest)286 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest)
287 {
288 wChar = FX_GetMirrorChar(wChar, TRUE, FALSE);
289 FX_LPWSTR pDst = NULL;
290 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
291 if (nCount < 1 ) {
292 sDest += wChar;
293 return;
294 }
295 pDst = new FX_WCHAR[nCount];
296 FX_Unicode_GetNormalization(wChar, pDst);
297 for (int nIndex = 0; nIndex < nCount; nIndex++) {
298 sDest += pDst[nIndex];
299 }
300 delete[] pDst;
301 }
NormalizeString(CFX_WideString & str)302 void NormalizeString(CFX_WideString& str)
303 {
304 if (str.GetLength() <= 0) {
305 return;
306 }
307 CFX_WideString sBuffer;
308 IFX_BidiChar* BidiChar = IFX_BidiChar::Create();
309 if (NULL == BidiChar) {
310 return;
311 }
312 CFX_WordArray order;
313 FX_BOOL bR2L = FALSE;
314 FX_INT32 start = 0, count = 0, i = 0;
315 int nR2L = 0, nL2R = 0;
316 for (i = 0; i < str.GetLength(); i++) {
317 if(BidiChar->AppendChar(str.GetAt(i))) {
318 FX_INT32 ret = BidiChar->GetBidiInfo(start, count);
319 order.Add(start);
320 order.Add(count);
321 order.Add(ret);
322 if(!bR2L) {
323 if(ret == 2) {
324 nR2L++;
325 } else if (ret == 1) {
326 nL2R++;
327 }
328 }
329 }
330 }
331 if(BidiChar->EndChar()) {
332 FX_INT32 ret = BidiChar->GetBidiInfo(start, count);
333 order.Add(start);
334 order.Add(count);
335 order.Add(ret);
336 if(!bR2L) {
337 if(ret == 2) {
338 nR2L++;
339 } else if(ret == 1) {
340 nL2R++;
341 }
342 }
343 }
344 if(nR2L > 0 && nR2L >= nL2R) {
345 bR2L = TRUE;
346 }
347 if(bR2L) {
348 int count = order.GetSize();
349 for(int j = count - 1; j > 0; j -= 3) {
350 int ret = order.GetAt(j);
351 int start = order.GetAt(j - 2);
352 int count1 = order.GetAt(j - 1);
353 if(ret == 2 || ret == 0) {
354 for(int i = start + count1 - 1; i >= start; i--) {
355 NormalizeCompositeChar(str[i], sBuffer);
356 }
357 } else {
358 i = j;
359 FX_BOOL bSymbol = FALSE;
360 while(i > 0 && order.GetAt(i) != 2) {
361 bSymbol = !order.GetAt(i);
362 i -= 3;
363 }
364 int end = start + count1 ;
365 int n = 0;
366 if(bSymbol) {
367 n = i + 6;
368 } else {
369 n = i + 3;
370 }
371 if(n >= j) {
372 for(int m = start; m < end; m++) {
373 sBuffer += str[m];
374 }
375 } else {
376 i = j;
377 j = n;
378 for(; n <= i; n += 3) {
379 int start = order.GetAt(n - 2);
380 int count1 = order.GetAt(n - 1);
381 int end = start + count1 ;
382 for(int m = start; m < end; m++) {
383 sBuffer += str[m];
384 }
385 }
386 }
387 }
388 }
389 } else {
390 int count = order.GetSize();
391 FX_BOOL bL2R = FALSE;
392 for(int j = 0; j < count; j += 3) {
393 int ret = order.GetAt(j + 2);
394 int start = order.GetAt(j);
395 int count1 = order.GetAt(j + 1);
396 if(ret == 2 || (j == 0 && ret == 0 && !bL2R)) {
397 int i = j + 3;
398 while(bR2L && i < count) {
399 if(order.GetAt(i + 2) == 1) {
400 break;
401 } else {
402 i += 3;
403 }
404 }
405 if(i == 3) {
406 j = -3;
407 bL2R = TRUE;
408 continue;
409 }
410 int end = str.GetLength() - 1;
411 if(i < count) {
412 end = order.GetAt(i) - 1;
413 }
414 j = i - 3;
415 for(int n = end; n >= start; n--) {
416 NormalizeCompositeChar(str[i], sBuffer);
417 }
418 } else {
419 int end = start + count1 ;
420 for(int i = start; i < end; i++) {
421 sBuffer += str[i];
422 }
423 }
424 }
425 }
426 str.Empty();
427 str += sBuffer;
428 BidiChar->Release();
429 }
IsNumber(CFX_WideString & str)430 static FX_BOOL IsNumber(CFX_WideString& str)
431 {
432 for (int i = 0; i < str.GetLength(); i ++) {
433 FX_WCHAR ch = str[i];
434 if ((ch < '0' || ch > '9') && ch != '-' && ch != '+' && ch != '.' && ch != ' ') {
435 return FALSE;
436 }
437 }
438 return TRUE;
439 }
FindColumns()440 void CTextPage::FindColumns()
441 {
442 int i;
443 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
444 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
445 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) {
446 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
447 CTextColumn* pColumn = FindColumn(pTextBox->m_Right);
448 if (pColumn == NULL) {
449 pColumn = new CTextColumn;
450 pColumn->m_Count = 1;
451 pColumn->m_AvgPos = pTextBox->m_Right;
452 pColumn->m_TextPos = -1;
453 m_TextColumns.Add(pColumn);
454 } else {
455 pColumn->m_AvgPos = (pColumn->m_Count * pColumn->m_AvgPos + pTextBox->m_Right) /
456 (pColumn->m_Count + 1);
457 pColumn->m_Count ++;
458 }
459 }
460 }
461 int mincount = m_BaseLines.GetSize() / 4;
462 for (i = 0; i < m_TextColumns.GetSize(); i ++) {
463 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i);
464 if (pTextColumn->m_Count >= mincount) {
465 continue;
466 }
467 delete pTextColumn;
468 m_TextColumns.RemoveAt(i);
469 i --;
470 }
471 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
472 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
473 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) {
474 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
475 if (IsNumber(pTextBox->m_Text)) {
476 pTextBox->m_pColumn = FindColumn(pTextBox->m_Right);
477 }
478 }
479 }
480 }
FindColumn(FX_FLOAT xpos)481 CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos)
482 {
483 for (int i = 0; i < m_TextColumns.GetSize(); i ++) {
484 CTextColumn* pColumn = (CTextColumn*)m_TextColumns.GetAt(i);
485 if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) {
486 return pColumn;
487 }
488 }
489 return NULL;
490 }
BreakSpace(CPDF_TextObject * pTextObj)491 void CTextPage::BreakSpace(CPDF_TextObject* pTextObj)
492 {
493 }
CTextBaseLine()494 CTextBaseLine::CTextBaseLine()
495 {
496 m_Top = -100000;
497 m_Bottom = 100000;
498 m_MaxFontSizeV = 0;
499 }
~CTextBaseLine()500 CTextBaseLine::~CTextBaseLine()
501 {
502 for (int i = 0; i < m_TextList.GetSize(); i ++) {
503 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
504 delete pText;
505 }
506 }
InsertTextBox(FX_FLOAT leftx,FX_FLOAT rightx,FX_FLOAT topy,FX_FLOAT bottomy,FX_FLOAT spacew,FX_FLOAT fontsize_v,const CFX_WideString & text)507 void CTextBaseLine::InsertTextBox(FX_FLOAT leftx, FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy,
508 FX_FLOAT spacew, FX_FLOAT fontsize_v, const CFX_WideString& text)
509 {
510 if (m_Top < topy) {
511 m_Top = topy;
512 }
513 if (m_Bottom > bottomy) {
514 m_Bottom = bottomy;
515 }
516 if (m_MaxFontSizeV < fontsize_v) {
517 m_MaxFontSizeV = fontsize_v;
518 }
519 int i;
520 for (i = 0; i < m_TextList.GetSize(); i ++) {
521 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
522 if (pText->m_Left > leftx) {
523 break;
524 }
525 }
526 CTextBox* pText = new CTextBox;
527 pText->m_Text = text;
528 pText->m_Left = leftx;
529 pText->m_Right = rightx;
530 pText->m_Top = topy;
531 pText->m_Bottom = bottomy;
532 pText->m_SpaceWidth = spacew;
533 pText->m_FontSizeV = fontsize_v;
534 pText->m_pColumn = NULL;
535 m_TextList.InsertAt(i, pText);
536 }
537 FX_BOOL GetIntersection(FX_FLOAT low1, FX_FLOAT high1, FX_FLOAT low2, FX_FLOAT high2,
538 FX_FLOAT& interlow, FX_FLOAT& interhigh);
CanMerge(CTextBaseLine * pOther)539 FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther)
540 {
541 FX_FLOAT inter_top, inter_bottom;
542 if (!GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top,
543 inter_bottom, inter_top)) {
544 return FALSE;
545 }
546 FX_FLOAT inter_h = inter_top - inter_bottom;
547 if (inter_h < (m_Top - m_Bottom) / 2 && inter_h < (pOther->m_Top - pOther->m_Bottom) / 2) {
548 return FALSE;
549 }
550 FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine);
551 for (int i = 0; i < m_TextList.GetSize(); i ++) {
552 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
553 for (int j = 0; j < pOther->m_TextList.GetSize(); j ++) {
554 CTextBox* pOtherText = (CTextBox*)pOther->m_TextList.GetAt(j);
555 FX_FLOAT inter_left, inter_right;
556 if (!GetIntersection(pText->m_Left, pText->m_Right,
557 pOtherText->m_Left, pOtherText->m_Right, inter_left, inter_right)) {
558 continue;
559 }
560 FX_FLOAT inter_w = inter_right - inter_left;
561 if (inter_w < pText->m_SpaceWidth / 2 && inter_w < pOtherText->m_SpaceWidth / 2) {
562 continue;
563 }
564 if (dy >= (pText->m_Bottom - pText->m_Top) / 2 ||
565 dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) {
566 return FALSE;
567 }
568 }
569 }
570 return TRUE;
571 }
Merge(CTextBaseLine * pOther)572 void CTextBaseLine::Merge(CTextBaseLine* pOther)
573 {
574 for (int i = 0; i < pOther->m_TextList.GetSize(); i ++) {
575 CTextBox* pText = (CTextBox*)pOther->m_TextList.GetAt(i);
576 InsertTextBox(pText->m_Left, pText->m_Right, pText->m_Top, pText->m_Bottom,
577 pText->m_SpaceWidth, pText->m_FontSizeV, pText->m_Text);
578 }
579 }
GetWidth(FX_FLOAT & leftx,FX_FLOAT & rightx)580 FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx)
581 {
582 int i;
583 for (i = 0; i < m_TextList.GetSize(); i ++) {
584 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
585 if (pText->m_Text != L" ") {
586 break;
587 }
588 }
589 if (i == m_TextList.GetSize()) {
590 return FALSE;
591 }
592 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
593 leftx = pText->m_Left;
594 for (i = m_TextList.GetSize() - 1; i >= 0; i --) {
595 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
596 if (pText->m_Text != L" ") {
597 break;
598 }
599 }
600 pText = (CTextBox*)m_TextList.GetAt(i);
601 rightx = pText->m_Right;
602 return TRUE;
603 }
MergeBoxes()604 void CTextBaseLine::MergeBoxes()
605 {
606 int i = 0;
607 while (1) {
608 if (i >= m_TextList.GetSize() - 1) {
609 break;
610 }
611 CTextBox* pThisText = (CTextBox*)m_TextList.GetAt(i);
612 CTextBox* pNextText = (CTextBox*)m_TextList.GetAt(i + 1);
613 FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right;
614 FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0) ?
615 pNextText->m_SpaceWidth : pThisText->m_SpaceWidth;
616 if (spacew > 0.0 && dx < spacew * 2) {
617 pThisText->m_Right = pNextText->m_Right;
618 if (dx > spacew * 1.5) {
619 pThisText->m_Text += L" ";
620 } else if (dx > spacew / 3) {
621 pThisText->m_Text += L' ';
622 }
623 pThisText->m_Text += pNextText->m_Text;
624 pThisText->m_SpaceWidth = pNextText->m_SpaceWidth == 0.0 ?
625 spacew : pNextText->m_SpaceWidth;
626 m_TextList.RemoveAt(i + 1);
627 delete pNextText;
628 } else {
629 i ++;
630 }
631 }
632 }
WriteOutput(CFX_WideString & str,FX_FLOAT leftx,FX_FLOAT pagewidth,int iTextWidth)633 void CTextBaseLine::WriteOutput(CFX_WideString& str, FX_FLOAT leftx, FX_FLOAT pagewidth,
634 int iTextWidth)
635 {
636 int lastpos = -1;
637 for (int i = 0; i < m_TextList.GetSize(); i ++) {
638 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
639 int xpos;
640 if (pText->m_pColumn) {
641 xpos = (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pagewidth + 0.5);
642 xpos -= pText->m_Text.GetLength();
643 } else {
644 xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5);
645 }
646 if (xpos <= lastpos) {
647 xpos = lastpos + 1;
648 }
649 for (int j = lastpos + 1; j < xpos; j ++) {
650 str += ' ';
651 }
652 CFX_WideString sSrc(pText->m_Text);
653 NormalizeString(sSrc);
654 str += sSrc;
655 str += ' ';
656 lastpos = xpos + pText->m_Text.GetLength();
657 }
658 }
CountChars(int & count,FX_FLOAT & width,int & minchars)659 void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars)
660 {
661 minchars = 0;
662 for (int i = 0; i < m_TextList.GetSize(); i ++) {
663 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
664 if (pText->m_Right - pText->m_Left < 0.002) {
665 continue;
666 }
667 count += pText->m_Text.GetLength();
668 width += pText->m_Right - pText->m_Left;
669 minchars += pText->m_Text.GetLength() + 1;
670 }
671 }
672 #define PI 3.1415926535897932384626433832795
CheckRotate(CPDF_Page & page,CFX_FloatRect & page_bbox)673 static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox)
674 {
675 int total_count = 0, rotated_count[3] = {0, 0, 0};
676 FX_POSITION pos = page.GetFirstObjectPosition();
677 while (pos) {
678 CPDF_PageObject* pObj = page.GetNextObject(pos);
679 if (pObj->m_Type != PDFPAGE_TEXT) {
680 continue;
681 }
682 total_count ++;
683 CPDF_TextObject* pText = (CPDF_TextObject*)pObj;
684 FX_FLOAT angle = pText->m_TextState.GetBaselineAngle();
685 if (angle == 0.0) {
686 continue;
687 }
688 int degree = (int)(angle * 180 / PI + 0.5);
689 if (degree % 90) {
690 continue;
691 }
692 if (degree < 0) {
693 degree += 360;
694 }
695 int index = degree / 90 % 3 - 1;
696 if (index < 0) {
697 continue;
698 }
699 rotated_count[index] ++;
700 }
701 if (total_count == 0) {
702 return;
703 }
704 CFX_AffineMatrix matrix;
705 if (rotated_count[0] > total_count * 2 / 3) {
706 matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight());
707 } else if (rotated_count[1] > total_count * 2 / 3) {
708 matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight());
709 } else if (rotated_count[2] > total_count * 2 / 3) {
710 matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0);
711 } else {
712 return;
713 }
714 page.Transform(matrix);
715 page_bbox.Transform(&matrix);
716 }
PDF_GetPageText_Unicode(CFX_WideStringArray & lines,CPDF_Document * pDoc,CPDF_Dictionary * pPage,int iMinWidth,FX_DWORD flags)717 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
718 int iMinWidth, FX_DWORD flags)
719 {
720 lines.RemoveAll();
721 if (pPage == NULL) {
722 return;
723 }
724 CPDF_Page page;
725 page.Load(pDoc, pPage);
726 CPDF_ParseOptions options;
727 options.m_bTextOnly = TRUE;
728 options.m_bSeparateForm = FALSE;
729 page.ParseContent(&options);
730 CFX_FloatRect page_bbox = page.GetPageBBox();
731 if (flags & PDF2TXT_AUTO_ROTATE) {
732 CheckRotate(page, page_bbox);
733 }
734 CTextPage texts;
735 texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH;
736 texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN;
737 texts.m_bBreakSpace = TRUE;
738 FX_POSITION pos = page.GetFirstObjectPosition();
739 while (pos) {
740 CPDF_PageObject* pObject = page.GetNextObject(pos);
741 if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) {
742 CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Right, pObject->m_Top);
743 if (!page_bbox.Contains(rect)) {
744 continue;
745 }
746 }
747 texts.ProcessObject(pObject);
748 }
749 texts.WriteOutput(lines, iMinWidth);
750 }
PDF_GetPageText(CFX_ByteStringArray & lines,CPDF_Document * pDoc,CPDF_Dictionary * pPage,int iMinWidth,FX_DWORD flags)751 void PDF_GetPageText(CFX_ByteStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
752 int iMinWidth, FX_DWORD flags)
753 {
754 lines.RemoveAll();
755 CFX_WideStringArray wlines;
756 PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags);
757 for (int i = 0; i < wlines.GetSize(); i ++) {
758 CFX_WideString wstr = wlines[i];
759 CFX_ByteString str;
760 for (int c = 0; c < wstr.GetLength(); c ++) {
761 str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?");
762 }
763 lines.Add(str);
764 }
765 }
766 extern void _PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_PageObjects* pPage, FX_BOOL bUseLF,
767 CFX_PtrArray* pObjArray);
PDF_GetTextStream_Unicode(CFX_WideTextBuf & buffer,CPDF_Document * pDoc,CPDF_Dictionary * pPage,FX_DWORD flags)768 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_Document* pDoc, CPDF_Dictionary* pPage, FX_DWORD flags)
769 {
770 buffer.EstimateSize(0, 10240);
771 CPDF_Page page;
772 page.Load(pDoc, pPage);
773 CPDF_ParseOptions options;
774 options.m_bTextOnly = TRUE;
775 options.m_bSeparateForm = FALSE;
776 page.ParseContent(&options);
777 _PDF_GetTextStream_Unicode(buffer, &page, TRUE, NULL);
778 }
779