1 // Copyright 2017 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdfapi/font/cpdf_cmap.h"
8 
9 #include <memory>
10 #include <utility>
11 #include <vector>
12 
13 #include "core/fpdfapi/cmaps/cmap_int.h"
14 #include "core/fpdfapi/font/cpdf_cmapmanager.h"
15 #include "core/fpdfapi/font/cpdf_cmapparser.h"
16 #include "core/fpdfapi/parser/cpdf_simple_parser.h"
17 
18 namespace {
19 
// A closed interval of byte values, [m_First, m_Last].
struct ByteRange {
  uint8_t m_First;  // Lowest byte value in the interval.
  uint8_t m_Last;  // Highest byte value in the interval (inclusive).
};
24 
// One row of the g_PredefinedCMaps table consulted by
// CPDF_CMap::LoadPredefined().
struct PredefinedCMap {
  // Name compared against the requested CMap name. LoadPredefined() removes
  // the last two characters of the requested name before comparing when that
  // name is longer than two characters.
  const char* m_pName;
  // Copied into CPDF_CMap::m_Charset when this row matches.
  CIDSet m_Charset;
  // Copied into CPDF_CMap::m_Coding when this row matches.
  CIDCoding m_Coding;
  // Copied into CPDF_CMap::m_CodingScheme when this row matches.
  CPDF_CMap::CodingScheme m_CodingScheme;
  // Number of entries of m_LeadingSegs that LoadPredefined() iterates over.
  uint8_t m_LeadingSegCount;
  // Byte ranges flagged in CPDF_CMap::m_MixedTwoByteLeadingBytes. They are
  // only read by LoadPredefined() when m_CodingScheme is MixedTwoBytes.
  ByteRange m_LeadingSegs[2];
};
33 
// Table of predefined CMaps searched linearly by CPDF_CMap::LoadPredefined().
// Each row gives, in order: name, charset, coding, coding scheme, number of
// leading-byte segments, and the segments themselves. Names are stored without
// the trailing two characters that LoadPredefined() removes from requested
// names longer than two characters; the one-character names "H" and "V" are
// therefore compared unchanged. Rows with a segment count of 0 carry an empty
// segment list.
const PredefinedCMap g_PredefinedCMaps[] = {
    // CIDSET_GB1 entries.
    {"GB-EUC",
     CIDSET_GB1,
     CIDCODING_GB,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0xa1, 0xfe}}},
    {"GBpc-EUC",
     CIDSET_GB1,
     CIDCODING_GB,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0xa1, 0xfc}}},
    {"GBK-EUC",
     CIDSET_GB1,
     CIDCODING_GB,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0x81, 0xfe}}},
    {"GBKp-EUC",
     CIDSET_GB1,
     CIDCODING_GB,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0x81, 0xfe}}},
    {"GBK2K-EUC",
     CIDSET_GB1,
     CIDCODING_GB,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0x81, 0xfe}}},
    {"GBK2K",
     CIDSET_GB1,
     CIDCODING_GB,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0x81, 0xfe}}},
    {"UniGB-UCS2", CIDSET_GB1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
    {"UniGB-UTF16", CIDSET_GB1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
    // CIDSET_CNS1 entries.
    {"B5pc",
     CIDSET_CNS1,
     CIDCODING_BIG5,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0xa1, 0xfc}}},
    {"HKscs-B5",
     CIDSET_CNS1,
     CIDCODING_BIG5,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0x88, 0xfe}}},
    {"ETen-B5",
     CIDSET_CNS1,
     CIDCODING_BIG5,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0xa1, 0xfe}}},
    {"ETenms-B5",
     CIDSET_CNS1,
     CIDCODING_BIG5,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0xa1, 0xfe}}},
    {"UniCNS-UCS2", CIDSET_CNS1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
    {"UniCNS-UTF16", CIDSET_CNS1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
    // CIDSET_JAPAN1 entries.
    {"83pv-RKSJ",
     CIDSET_JAPAN1,
     CIDCODING_JIS,
     CPDF_CMap::MixedTwoBytes,
     2,
     {{0x81, 0x9f}, {0xe0, 0xfc}}},
    {"90ms-RKSJ",
     CIDSET_JAPAN1,
     CIDCODING_JIS,
     CPDF_CMap::MixedTwoBytes,
     2,
     {{0x81, 0x9f}, {0xe0, 0xfc}}},
    {"90msp-RKSJ",
     CIDSET_JAPAN1,
     CIDCODING_JIS,
     CPDF_CMap::MixedTwoBytes,
     2,
     {{0x81, 0x9f}, {0xe0, 0xfc}}},
    {"90pv-RKSJ",
     CIDSET_JAPAN1,
     CIDCODING_JIS,
     CPDF_CMap::MixedTwoBytes,
     2,
     {{0x81, 0x9f}, {0xe0, 0xfc}}},
    {"Add-RKSJ",
     CIDSET_JAPAN1,
     CIDCODING_JIS,
     CPDF_CMap::MixedTwoBytes,
     2,
     {{0x81, 0x9f}, {0xe0, 0xfc}}},
    {"EUC",
     CIDSET_JAPAN1,
     CIDCODING_JIS,
     CPDF_CMap::MixedTwoBytes,
     2,
     {{0x8e, 0x8e}, {0xa1, 0xfe}}},
    // "H" and "V" use TwoBytes, so LoadPredefined() does not read their
    // leading segment.
    {"H", CIDSET_JAPAN1, CIDCODING_JIS, CPDF_CMap::TwoBytes, 1, {{0x21, 0x7e}}},
    {"V", CIDSET_JAPAN1, CIDCODING_JIS, CPDF_CMap::TwoBytes, 1, {{0x21, 0x7e}}},
    {"Ext-RKSJ",
     CIDSET_JAPAN1,
     CIDCODING_JIS,
     CPDF_CMap::MixedTwoBytes,
     2,
     {{0x81, 0x9f}, {0xe0, 0xfc}}},
    {"UniJIS-UCS2", CIDSET_JAPAN1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
    {"UniJIS-UCS2-HW",
     CIDSET_JAPAN1,
     CIDCODING_UCS2,
     CPDF_CMap::TwoBytes,
     0,
     {}},
    {"UniJIS-UTF16",
     CIDSET_JAPAN1,
     CIDCODING_UTF16,
     CPDF_CMap::TwoBytes,
     0,
     {}},
    // CIDSET_KOREA1 entries.
    {"KSC-EUC",
     CIDSET_KOREA1,
     CIDCODING_KOREA,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0xa1, 0xfe}}},
    {"KSCms-UHC",
     CIDSET_KOREA1,
     CIDCODING_KOREA,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0x81, 0xfe}}},
    {"KSCms-UHC-HW",
     CIDSET_KOREA1,
     CIDCODING_KOREA,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0x81, 0xfe}}},
    {"KSCpc-EUC",
     CIDSET_KOREA1,
     CIDCODING_KOREA,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0xa1, 0xfd}}},
    {"UniKS-UCS2", CIDSET_KOREA1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
    {"UniKS-UTF16", CIDSET_KOREA1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
};
183 
CheckFourByteCodeRange(uint8_t * codes,size_t size,const std::vector<CPDF_CMap::CodeRange> & ranges)184 int CheckFourByteCodeRange(uint8_t* codes,
185                            size_t size,
186                            const std::vector<CPDF_CMap::CodeRange>& ranges) {
187   for (size_t i = ranges.size(); i > 0; i--) {
188     size_t seg = i - 1;
189     if (ranges[seg].m_CharSize < size)
190       continue;
191     size_t iChar = 0;
192     while (iChar < size) {
193       if (codes[iChar] < ranges[seg].m_Lower[iChar] ||
194           codes[iChar] > ranges[seg].m_Upper[iChar]) {
195         break;
196       }
197       ++iChar;
198     }
199     if (iChar == ranges[seg].m_CharSize)
200       return 2;
201     if (iChar)
202       return (size == ranges[seg].m_CharSize) ? 2 : 1;
203   }
204   return 0;
205 }
206 
GetFourByteCharSizeImpl(uint32_t charcode,const std::vector<CPDF_CMap::CodeRange> & ranges)207 size_t GetFourByteCharSizeImpl(
208     uint32_t charcode,
209     const std::vector<CPDF_CMap::CodeRange>& ranges) {
210   if (ranges.empty())
211     return 1;
212 
213   uint8_t codes[4];
214   codes[0] = codes[1] = 0x00;
215   codes[2] = static_cast<uint8_t>(charcode >> 8 & 0xFF);
216   codes[3] = static_cast<uint8_t>(charcode);
217   for (size_t offset = 0; offset < 4; offset++) {
218     size_t size = 4 - offset;
219     for (size_t j = 0; j < ranges.size(); j++) {
220       size_t iSeg = (ranges.size() - 1) - j;
221       if (ranges[iSeg].m_CharSize < size)
222         continue;
223       size_t iChar = 0;
224       while (iChar < size) {
225         if (codes[offset + iChar] < ranges[iSeg].m_Lower[iChar] ||
226             codes[offset + iChar] > ranges[iSeg].m_Upper[iChar]) {
227           break;
228         }
229         ++iChar;
230       }
231       if (iChar == ranges[iSeg].m_CharSize)
232         return size;
233     }
234   }
235   return 1;
236 }
237 
238 }  // namespace
239 
// Constructs a CMap in its initial state: not loaded, not vertical, charset
// and coding unknown, coding scheme TwoBytes, and no embedded map attached.
CPDF_CMap::CPDF_CMap()
    : m_bLoaded(false),
      m_bVertical(false),
      m_Charset(CIDSET_UNKNOWN),
      m_CodingScheme(TwoBytes),
      m_Coding(CIDCODING_UNKNOWN),
      m_pEmbedMap(nullptr) {}
247 
~CPDF_CMap()248 CPDF_CMap::~CPDF_CMap() {}
249 
LoadPredefined(CPDF_CMapManager * pMgr,const ByteString & bsName,bool bPromptCJK)250 void CPDF_CMap::LoadPredefined(CPDF_CMapManager* pMgr,
251                                const ByteString& bsName,
252                                bool bPromptCJK) {
253   m_PredefinedCMap = bsName;
254   if (m_PredefinedCMap == "Identity-H" || m_PredefinedCMap == "Identity-V") {
255     m_Coding = CIDCODING_CID;
256     m_bVertical = bsName.Last() == 'V';
257     m_bLoaded = true;
258     return;
259   }
260   ByteString cmapid = m_PredefinedCMap;
261   m_bVertical = cmapid.Last() == 'V';
262   if (cmapid.GetLength() > 2) {
263     cmapid = cmapid.Left(cmapid.GetLength() - 2);
264   }
265   const PredefinedCMap* map = nullptr;
266   for (size_t i = 0; i < FX_ArraySize(g_PredefinedCMaps); ++i) {
267     if (cmapid == ByteStringView(g_PredefinedCMaps[i].m_pName)) {
268       map = &g_PredefinedCMaps[i];
269       break;
270     }
271   }
272   if (!map)
273     return;
274 
275   m_Charset = map->m_Charset;
276   m_Coding = map->m_Coding;
277   m_CodingScheme = map->m_CodingScheme;
278   if (m_CodingScheme == MixedTwoBytes) {
279     m_MixedTwoByteLeadingBytes = std::vector<bool>(256);
280     for (uint32_t i = 0; i < map->m_LeadingSegCount; ++i) {
281       const ByteRange& seg = map->m_LeadingSegs[i];
282       for (int b = seg.m_First; b <= seg.m_Last; ++b)
283         m_MixedTwoByteLeadingBytes[b] = true;
284     }
285   }
286   m_pEmbedMap = FPDFAPI_FindEmbeddedCMap(bsName, m_Charset, m_Coding);
287   if (!m_pEmbedMap)
288     return;
289 
290   m_bLoaded = true;
291 }
292 
LoadEmbedded(const uint8_t * pData,uint32_t size)293 void CPDF_CMap::LoadEmbedded(const uint8_t* pData, uint32_t size) {
294   m_DirectCharcodeToCIDTable = std::vector<uint16_t>(65536);
295   CPDF_CMapParser parser(this);
296   CPDF_SimpleParser syntax(pData, size);
297   while (1) {
298     ByteStringView word = syntax.GetWord();
299     if (word.IsEmpty()) {
300       break;
301     }
302     parser.ParseWord(word);
303   }
304   if (m_CodingScheme == MixedFourBytes && parser.HasAdditionalMappings()) {
305     m_AdditionalCharcodeToCIDMappings = parser.TakeAdditionalMappings();
306     std::sort(
307         m_AdditionalCharcodeToCIDMappings.begin(),
308         m_AdditionalCharcodeToCIDMappings.end(),
309         [](const CPDF_CMap::CIDRange& arg1, const CPDF_CMap::CIDRange& arg2) {
310           return arg1.m_EndCode < arg2.m_EndCode;
311         });
312   }
313 }
314 
CIDFromCharCode(uint32_t charcode) const315 uint16_t CPDF_CMap::CIDFromCharCode(uint32_t charcode) const {
316   if (m_Coding == CIDCODING_CID)
317     return static_cast<uint16_t>(charcode);
318 
319   if (m_pEmbedMap)
320     return FPDFAPI_CIDFromCharCode(m_pEmbedMap, charcode);
321 
322   if (m_DirectCharcodeToCIDTable.empty())
323     return static_cast<uint16_t>(charcode);
324 
325   if (charcode < 0x10000)
326     return m_DirectCharcodeToCIDTable[charcode];
327 
328   auto it = std::lower_bound(m_AdditionalCharcodeToCIDMappings.begin(),
329                              m_AdditionalCharcodeToCIDMappings.end(), charcode,
330                              [](const CPDF_CMap::CIDRange& arg, uint32_t val) {
331                                return arg.m_EndCode < val;
332                              });
333   if (it == m_AdditionalCharcodeToCIDMappings.end() ||
334       it->m_StartCode > charcode) {
335     return 0;
336   }
337   return it->m_StartCID + charcode - it->m_StartCode;
338 }
339 
GetNextChar(const char * pString,int nStrLen,int & offset) const340 uint32_t CPDF_CMap::GetNextChar(const char* pString,
341                                 int nStrLen,
342                                 int& offset) const {
343   auto* pBytes = reinterpret_cast<const uint8_t*>(pString);
344   switch (m_CodingScheme) {
345     case OneByte: {
346       return pBytes[offset++];
347     }
348     case TwoBytes: {
349       uint8_t byte1 = pBytes[offset++];
350       return 256 * byte1 + pBytes[offset++];
351     }
352     case MixedTwoBytes: {
353       uint8_t byte1 = pBytes[offset++];
354       if (!m_MixedTwoByteLeadingBytes[byte1])
355         return byte1;
356       return 256 * byte1 + pBytes[offset++];
357     }
358     case MixedFourBytes: {
359       uint8_t codes[4];
360       int char_size = 1;
361       codes[0] = pBytes[offset++];
362       while (1) {
363         int ret = CheckFourByteCodeRange(codes, char_size,
364                                          m_MixedFourByteLeadingRanges);
365         if (ret == 0)
366           return 0;
367         if (ret == 2) {
368           uint32_t charcode = 0;
369           for (int i = 0; i < char_size; i++)
370             charcode = (charcode << 8) + codes[i];
371           return charcode;
372         }
373         if (char_size == 4 || offset == nStrLen)
374           return 0;
375         codes[char_size++] = pBytes[offset++];
376       }
377       break;
378     }
379   }
380   return 0;
381 }
382 
GetCharSize(uint32_t charcode) const383 int CPDF_CMap::GetCharSize(uint32_t charcode) const {
384   switch (m_CodingScheme) {
385     case OneByte:
386       return 1;
387     case TwoBytes:
388       return 2;
389     case MixedTwoBytes:
390       if (charcode < 0x100)
391         return 1;
392       return 2;
393     case MixedFourBytes:
394       if (charcode < 0x100)
395         return 1;
396       if (charcode < 0x10000)
397         return 2;
398       if (charcode < 0x1000000)
399         return 3;
400       return 4;
401   }
402   return 1;
403 }
404 
CountChar(const char * pString,int size) const405 int CPDF_CMap::CountChar(const char* pString, int size) const {
406   switch (m_CodingScheme) {
407     case OneByte:
408       return size;
409     case TwoBytes:
410       return (size + 1) / 2;
411     case MixedTwoBytes: {
412       int count = 0;
413       for (int i = 0; i < size; i++) {
414         count++;
415         if (m_MixedTwoByteLeadingBytes[reinterpret_cast<const uint8_t*>(
416                 pString)[i]]) {
417           i++;
418         }
419       }
420       return count;
421     }
422     case MixedFourBytes: {
423       int count = 0, offset = 0;
424       while (offset < size) {
425         GetNextChar(pString, size, offset);
426         count++;
427       }
428       return count;
429     }
430   }
431   return size;
432 }
433 
AppendChar(char * str,uint32_t charcode) const434 int CPDF_CMap::AppendChar(char* str, uint32_t charcode) const {
435   switch (m_CodingScheme) {
436     case OneByte:
437       str[0] = static_cast<char>(charcode);
438       return 1;
439     case TwoBytes:
440       str[0] = static_cast<char>(charcode / 256);
441       str[1] = static_cast<char>(charcode % 256);
442       return 2;
443     case MixedTwoBytes:
444       if (charcode < 0x100 && !m_MixedTwoByteLeadingBytes[charcode]) {
445         str[0] = static_cast<char>(charcode);
446         return 1;
447       }
448       str[0] = static_cast<char>(charcode >> 8);
449       str[1] = static_cast<char>(charcode);
450       return 2;
451     case MixedFourBytes:
452       if (charcode < 0x100) {
453         int iSize = static_cast<int>(
454             GetFourByteCharSizeImpl(charcode, m_MixedFourByteLeadingRanges));
455         if (iSize == 0)
456           iSize = 1;
457         str[iSize - 1] = static_cast<char>(charcode);
458         if (iSize > 1)
459           memset(str, 0, iSize - 1);
460         return iSize;
461       }
462       if (charcode < 0x10000) {
463         str[0] = static_cast<char>(charcode >> 8);
464         str[1] = static_cast<char>(charcode);
465         return 2;
466       }
467       if (charcode < 0x1000000) {
468         str[0] = static_cast<char>(charcode >> 16);
469         str[1] = static_cast<char>(charcode >> 8);
470         str[2] = static_cast<char>(charcode);
471         return 3;
472       }
473       str[0] = static_cast<char>(charcode >> 24);
474       str[1] = static_cast<char>(charcode >> 16);
475       str[2] = static_cast<char>(charcode >> 8);
476       str[3] = static_cast<char>(charcode);
477       return 4;
478   }
479   return 0;
480 }
481