#!/usr/bin/python -u
#
# Original script modified in November 2003 to take advantage of
# the character-validation range routines, and updated to the
# current Unicode information (Version 4.0.1)
#
# NOTE: there is an 'alias' facility for blocks which are not present in
#       the current release, but are needed for ABI compatibility.  This
#       must be accomplished MANUALLY!  Please see the comments below under
#       'blockAliases'
#
"""Generate xmlunicode.c and include/libxml/xmlunicode.h.

The script reads the two data files named in ``sources`` (a "blocks" file
and a "categories" file) from the current directory, then writes a C
source file and its header containing one membership function per Unicode
block and per category, plus name-based lookup functions.

The code is written so that it runs unchanged on Python 2.6+ and Python 3.
"""
import sys
import string
import time

webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"

#
# blockAliases is a small hack - it is used for mapping block names which
# were used in the 3.1 release, but are missing or changed in the current
# release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
blockAliases = [
    "CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols",
    "Greek:GreekandCoptic",
    "PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," +
    "SupplementaryPrivateUseArea-B",
]

# minTableSize gives the minimum number of ranges which must be present
# before a range table is produced.  If there are less than this
# number, inline comparisons are generated
minTableSize = 8

# Separator between the individual tests of a generated "return(a || b)".
_OR_SEPARATOR = " ||\n           "

(blockfile, catfile) = sources.split()


def _open_or_exit(path, mode, message):
    """Open *path* with *mode*; on failure print *message* and exit(1)."""
    try:
        return open(path, mode)
    except IOError:
        # IOError is what open() raises on Python 2 and is an alias of
        # OSError on Python 3, so this covers both interpreters.
        print(message)
        sys.exit(1)


def _parse_blocks(lines):
    """Reduce the lines of the "blocks" file to a dictionary.

    The result is indexed by block name (with spaces removed); each entry
    is a list of ``(start, end)`` tuples whose members are the hexadecimal
    bounds as strings prefixed with "0x".  Lines starting with '#' and
    blank lines are skipped; lines that cannot be split into
    "start..end; name" are reported and skipped.
    """
    block_names = {}
    for line in lines:
        if line.startswith('#'):
            continue
        line = line.strip()
        if not line:
            continue
        try:
            fields = line.split(';')
            # Exactly two parts are required, otherwise ValueError.
            (start, end) = fields[0].strip().split("..")
            name = fields[1].strip().replace(' ', '')
        except (ValueError, IndexError):
            print("Failed to process line: %s" % line)
            continue
        block_names.setdefault(name, []).append(("0x" + start, "0x" + end))
    return block_names


def _parse_categories(lines):
    """Invert the lines of the "categories" file.

    Returns ``(nbchar, categories)`` where *nbchar* is the number of lines
    successfully processed and *categories* maps a category name to the
    list of code point values carrying it.  Each value is entered both
    under the full name (third ';'-separated field) and under the first
    character of that name (the "general category").

    NOTE(review): every line is recorded as a single code point; if the
    data file encodes a range as a pair of "First"/"Last" lines, the code
    points in between are not expanded -- confirm this is intended.
    """
    categories = {}
    nbchar = 0
    for line in lines:
        if line.startswith('#'):
            continue
        line = line.strip()
        if not line:
            continue
        try:
            fields = line.split(';')
            # int() rejects malformed hexadecimal instead of silently
            # skipping the offending characters.
            value = int(fields[0].strip(), 16)
            name = fields[2]
            if not name:
                # An empty name has no general category and would yield a
                # generated function with an empty suffix.
                raise ValueError("empty category name")
        except (ValueError, IndexError):
            print("Failed to process line: %s" % line)
            continue

        nbchar += 1
        # update entry for "full name"
        categories.setdefault(name, []).append(value)
        # update "general category" name
        categories.setdefault(name[0], []).append(value)
    return (nbchar, categories)


def _collapse_ranges(values):
    """Reduce a list of code points into ``(first, last)`` ranges.

    Consecutive values (each one greater than the previous by exactly 1)
    are merged into one inclusive range; an isolated value becomes
    ``(value, value)``.  The values are taken in the order given, so the
    caller is expected to supply them in ascending order.  An empty input
    yields an empty list.
    """
    ranges = []
    start = prev = None
    for val in values:
        if start is None:
            start = prev = val
        elif val == prev + 1:
            prev = val
        else:
            ranges.append((start, prev))
            start = prev = val
    if start is not None:
        ranges.append((start, prev))
    return ranges


#
# Now process the "blocks" file, reducing it to a dictionary
# indexed by blockname, containing a tuple with the applicable
# block range
#
blocks = _open_or_exit(blockfile, "r", "Missing %s, aborting ..." % blockfile)
try:
    BlockNames = _parse_blocks(blocks)
finally:
    blocks.close()
print("Parsed %d blocks descriptions" % len(BlockNames))

for entry in blockAliases:
    alias = entry.split(':')
    for comp in alias[1].split(','):
        if comp in BlockNames:
            # The old name accumulates the ranges of every new block that
            # replaces it.
            BlockNames.setdefault(alias[0], []).extend(BlockNames[comp])
        else:
            print("Alias %s: %s not in Blocks" % (alias[0], comp))

#
# Next process the Categories file.  This is more complex, since
# the file is in code sequence, and we need to invert it.  We use
# a dictionary with index category-name, with each entry containing
# all the ranges (codepoints) of that category.  Note that category
# names comprise two parts - the general category, and the "subclass"
# within that category.  Therefore, both "general category" (which is
# the first character of the 2-character category-name) and the full
# (2-character) name are entered into this dictionary.
#
data = _open_or_exit(catfile, "r", "Missing %s, aborting ..." % catfile)
try:
    (nbchar, Categories) = _parse_categories(data)
finally:
    # Close the category file itself (the blocks file is already closed).
    data.close()
print("Parsed %d char generating %d categories" % (nbchar, len(Categories)))

#
# The data is now all read.  Time to process it into a more useful form.
#
# reduce the number list into ranges
for cat in list(Categories):
    Categories[cat] = _collapse_ranges(Categories[cat])

#
# Assure all data is in alphabetic order, since we will be doing binary
# searches on the tables.
#
bkeys = sorted(BlockNames)
ckeys = sorted(Categories)

#
# Generate the resulting files
#
header = _open_or_exit("include/libxml/xmlunicode.h", "w",
                       "Failed to open include/libxml/xmlunicode.h")
output = _open_or_exit("xmlunicode.c", "w", "Failed to open xmlunicode.c")

date = time.asctime(time.localtime(time.time()))

header.write(
"""/*
 * Summary: Unicode character APIs
 * Description: API for the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Author: Daniel Veillard
 */

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#include <libxml/xmlversion.h>

#ifdef LIBXML_UNICODE_ENABLED

#ifdef __cplusplus
extern "C" {
#endif

""" % (webpage, date, sources))

output.write(
"""/*
 * xmlunicode.c: this module implements the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Daniel Veillard <veillard@redhat.com>
 */

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
#include <libxml/chvalid.h>

typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted */

typedef struct {
    const char *rangename;
    xmlIntFunc *func;
} xmlUnicodeRange;

typedef struct {
    xmlUnicodeRange *table;
    int numentries;
} xmlUnicodeNameTable;


static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);

static xmlUnicodeRange xmlUnicodeBlocks[] = {
""" % (webpage, date, sources))

# One {"name", function} initializer per block; the function name is the
# block name with '-' removed.
output.write(',\n'.join(
    '  {"%s", xmlUCSIs%s}' % (block, block.replace('-', ''))
    for block in bkeys))
output.write('};\n\n')

output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
output.write(',\n'.join(
    '  {"%s", xmlUCSIsCat%s}' % (name, name) for name in ckeys))
output.write('};\n\n')

#
# For any categories with more than minTableSize ranges we generate
# a range table suitable for xmlCharInRange
#
for name in ckeys:
    ranges = Categories[name]
    if len(ranges) <= minTableSize:
        continue
    numshort = 0
    numlong = 0
    sptr = "NULL"
    lptr = "NULL"
    for (low, high) in ranges:
        if high < 0x10000:
            # Range fits the "short" table.
            if numshort == 0:
                pline = "static const xmlChSRange xml%sS[] = {" % name
                sptr = "xml%sS" % name
            else:
                pline += ", "
            numshort += 1
        else:
            if numlong == 0:
                # First "long" range: terminate the short table (if any)
                # before starting the long one.
                if numshort > 0:
                    output.write(pline + " };\n")
                pline = "static const xmlChLRange xml%sL[] = {" % name
                lptr = "xml%sL" % name
            else:
                pline += ", "
            numlong += 1
        # Wrap the generated initializer once the pending line is long.
        if len(pline) > 60:
            output.write(pline + "\n")
            pline = "    "
        pline += "{%s, %s}" % (hex(low), hex(high))
    output.write(
        pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
        % (name, numshort, numlong, sptr, lptr))


output.write(
"""static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**
 * xmlUnicodeLookup:
 * @tptr: pointer to the name table
 * @tname: name to be found
 *
 * binary table lookup for user-supplied name
 *
 * Returns pointer to range function if found, otherwise NULL
 */
static xmlIntFunc
*xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
    int low, high, mid, cmp;
    xmlUnicodeRange *sptr;

    if ((tptr == NULL) || (tname == NULL)) return(NULL);

    low = 0;
    high = tptr->numentries - 1;
    sptr = tptr->table;
    while (low <= high) {
        mid = (low + high) / 2;
        if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
            return (sptr[mid].func);
        if (cmp < 0)
            high = mid - 1;
        else
            low = mid + 1;
    }
    return (NULL);
}

""" % (len(BlockNames), len(Categories)))

for block in bkeys:
    name = block.replace('-', '')
    header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % name)
    output.write(
        " *\n * Check whether the character is part of %s UCS Block\n"
        % block)
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIs%s(int code) {\n    return(" % name)
    # The bounds are already "0x..." strings taken from the blocks file.
    output.write(_OR_SEPARATOR.join(
        "((code >= %s) && (code <= %s))" % (start, end)
        for (start, end) in BlockNames[block]))
    output.write(");\n}\n\n")

header.write(
    "\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
output.write(
"""/**
 * xmlUCSIsBlock:
 * @code: UCS code point
 * @block: UCS block name
 *
 * Check whether the character is part of the UCS Block
 *
 * Returns 1 if true, 0 if false and -1 on unknown block
 */
int
xmlUCSIsBlock(int code, const char *block) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
    if (func == NULL)
        return (-1);
    return (func(code));
}

""")

for name in ckeys:
    ranges = Categories[name]
    header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % name)
    output.write(
        " *\n * Check whether the character is part of %s UCS Category\n"
        % name)
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
    if len(ranges) > minTableSize:
        # Large categories use the range table generated earlier.
        output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
                     % name)
    else:
        # Small categories are tested with inline comparisons.
        tests = []
        for (begin, end) in ranges:
            if begin == end:
                tests.append("(code == %s)" % hex(begin))
            else:
                tests.append("((code >= %s) && (code <= %s))"
                             % (hex(begin), hex(end)))
        output.write("    return(" + _OR_SEPARATOR.join(tests))
    output.write(");\n}\n\n")

header.write(
    "\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
output.write(
"""/**
 * xmlUCSIsCat:
 * @code: UCS code point
 * @cat: UCS Category name
 *
 * Check whether the character is part of the UCS Category
 *
 * Returns 1 if true, 0 if false and -1 on unknown category
 */
int
xmlUCSIsCat(int code, const char *cat) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
    if (func == NULL)
        return (-1);
    return (func(code));
}

#define bottom_xmlunicode
#include "elfgcchack.h"
#endif /* LIBXML_UNICODE_ENABLED */
""")

header.write("""
#ifdef __cplusplus
}
#endif

#endif /* LIBXML_UNICODE_ENABLED */

#endif /* __XML_UNICODE_H__ */
""")

header.close()
output.close()