1#!/usr/bin/python -u
2#
3# Portions of this script have been (shamelessly) stolen from the
4# prior work of Daniel Veillard (genUnicode.py)
5#
6# I, however, take full credit for any bugs, errors or difficulties :-)
7#
8# William Brack
9# October 2003
10#
11# 18 October 2003
12# Modified to maintain binary compatibility with previous library versions
13# by adding a suffix 'Q' ('quick') to the macro generated for the original,
14# function, and adding generation of a function (with the original name) which
15# instantiates the macro.
16#
17
18import sys
19import string
20import time
21
22#
23# A routine to take a list of yes/no (1, 0) values and turn it
24# into a list of ranges.  This will later be used to determine whether
25# to generate single-byte lookup tables, or inline comparisons
26#
27def makeRange(lst):
28    ret = []
29    pos = 0
30    while pos < len(lst):
31	try:		# index generates exception if not present
32	    s = lst[pos:].index(1)	# look for start of next range
33	except:
34	    break			# if no more, finished
35	pos += s			# pointer to start of possible range
36	try:
37	    e = lst[pos:].index(0)	# look for end of range
38	    e += pos
39	except:				# if no end, set to end of list
40	    e = len(lst)
41	ret.append((pos, e-1))		# append range tuple to list
42	pos = e + 1			# ready to check for next range
43    return ret
44
45sources = "chvalid.def"			# input filename
46
47# minTableSize gives the minimum number of ranges which must be present
48# before a 256-byte lookup table is produced.  If there are less than this
49# number, a macro with inline comparisons is generated
50minTableSize = 6
51
52# dictionary of functions, key=name, element contains char-map and range-list
53Functs = {}
54
55state = 0
56
57try:
58    defines = open("chvalid.def", "r")
59except:
60    print "Missing chvalid.def, aborting ..."
61    sys.exit(1)
62
63#
64# The lines in the .def file have three types:-
65#   name:   Defines a new function block
66#   ur:	    Defines individual or ranges of unicode values
67#   end:    Indicates the end of the function block
68#
69# These lines are processed below.
70#
71for line in defines.readlines():
72    # ignore blank lines, or lines beginning with '#'
73    if line[0] == '#':
74        continue
75    line = string.strip(line)
76    if line == '':
77        continue
78    # split line into space-separated fields, then split on type
79    try:
80        fields = string.split(line, ' ')
81	#
82	# name line:
83	#   validate any previous function block already ended
84	#   validate this function not already defined
85	#   initialize an entry in the function dicitonary
86	#	including a mask table with no values yet defined
87	#
88	if fields[0] == 'name':
89	    name = fields[1]
90	    if state != 0:
91		print "'name' %s found before previous name" \
92		      "completed" % (fields[1])
93		continue
94	    state = 1
95	    if Functs.has_key(name):
96		print "name '%s' already present - may give" \
97		      " wrong results" % (name)
98	    else:
99		# dict entry with two list elements (chdata, rangedata)
100		Functs[name] = [ [], [] ]
101		for v in range(256):
102		    Functs[name][0].append(0)
103	#
104	# end line:
105	#   validate there was a preceding function name line
106	#   set state to show no current function active
107	#
108	elif fields[0] == 'end':
109	    if state == 0:
110		print "'end' found outside of function block"
111		continue
112	    state = 0
113
114	#
115	# ur line:
116	#   validate function has been defined
117	#   process remaining fields on the line, which may be either
118	#	individual unicode values or ranges of values
119	#
120	elif fields[0] == 'ur':
121	    if state != 1:
122		raise ValidationError, "'ur' found outside of 'name' block"
123	    for el in fields[1:]:
124		pos = string.find(el, '..')
125		# pos <=0 means not a range, so must be individual value
126		if pos <= 0:
127		    # cheap handling of hex or decimal values
128		    if el[0:2] == '0x':
129		        value = int(el[2:],16)
130		    elif el[0] == "'":
131			value = ord(el[1])
132		    else:
133			value = int(el)
134		    if ((value < 0) | (value > 0x1fffff)):
135			raise ValidationError, 'Illegal value (%s) in ch for'\
136				' name %s' % (el,name)
137		    # for ur we have only ranges (makes things simpler),
138		    # so convert val to range
139		    currange = (value, value)
140		# pos > 0 means this is a range, so isolate/validate
141		# the interval
142		else:
143		    # split the range into it's first-val, last-val
144		    (first, last) = string.split(el, "..")
145		    # convert values from text into binary
146		    if first[0:2] == '0x':
147			start = int(first[2:],16)
148		    elif first[0] == "'":
149			start = ord(first[1])
150		    else:
151			start = int(first)
152		    if last[0:2] == '0x':
153			end = int(last[2:],16)
154		    elif last[0] == "'":
155			end = ord(last[1])
156		    else:
157			end = int(last)
158		    if (start < 0) | (end > 0x1fffff) | (start > end):
159			raise ValidationError, "Invalid range '%s'" % el
160		    currange = (start, end)
161		# common path - 'currange' has the range, now take care of it
162		# We split on single-byte values vs. multibyte
163		if currange[1] < 0x100:	# single-byte
164		    for ch in range(currange[0],currange[1]+1):
165			# validate that value not previously defined
166			if Functs[name][0][ch]:
167			    msg = "Duplicate ch value '%s' for name '%s'" % (el, name)
168			    raise ValidationError, msg
169			Functs[name][0][ch] = 1
170		else:			# multi-byte
171		    if currange in Functs[name][1]:
172			raise ValidationError, "range already defined in" \
173				" function"
174		    else:
175			Functs[name][1].append(currange)
176
177    except:
178	print "Failed to process line: %s" % (line)
179	raise
180#
181# At this point, the entire definition file has been processed.  Now we
182# enter the output phase, where we generate the two files chvalid.c and'
183# chvalid.h
184#
185# To do this, we first output the 'static' data (heading, fixed
186# definitions, etc.), then output the 'dynamic' data (the results
187# of the above processing), and finally output closing 'static' data
188# (e.g. the subroutine to process the ranges)
189#
190
191#
192# Generate the headings:
193#
194try:
195    header = open("include/libxml/chvalid.h", "w")
196except:
197    print "Failed to open include/libxml/chvalid.h"
198    sys.exit(1)
199
200try:
201    output = open("chvalid.c", "w")
202except:
203    print "Failed to open chvalid.c"
204    sys.exit(1)
205
206date = time.asctime(time.localtime(time.time()))
207
208header.write(
209"""/*
210 * Summary: Unicode character range checking
211 * Description: this module exports interfaces for the character
212 *               range validation APIs
213 *
214 * This file is automatically generated from the cvs source
215 * definition files using the genChRanges.py Python script
216 *
217 * Generation date: %s
218 * Sources: %s
219 * Author: William Brack <wbrack@mmm.com.hk>
220 */
221
222#ifndef __XML_CHVALID_H__
223#define __XML_CHVALID_H__
224
225#include <libxml/xmlversion.h>
226#include <libxml/xmlstring.h>
227
228#ifdef __cplusplus
229extern "C" {
230#endif
231
232/*
233 * Define our typedefs and structures
234 *
235 */
236typedef struct _xmlChSRange xmlChSRange;
237typedef xmlChSRange *xmlChSRangePtr;
238struct _xmlChSRange {
239    unsigned short	low;
240    unsigned short	high;
241};
242
243typedef struct _xmlChLRange xmlChLRange;
244typedef xmlChLRange *xmlChLRangePtr;
245struct _xmlChLRange {
246    unsigned int	low;
247    unsigned int	high;
248};
249
250typedef struct _xmlChRangeGroup xmlChRangeGroup;
251typedef xmlChRangeGroup *xmlChRangeGroupPtr;
252struct _xmlChRangeGroup {
253    int			nbShortRange;
254    int			nbLongRange;
255    const xmlChSRange	*shortRange;	/* points to an array of ranges */
256    const xmlChLRange	*longRange;
257};
258
259/**
260 * Range checking routine
261 */
262XMLPUBFUN int XMLCALL
263		xmlCharInRange(unsigned int val, const xmlChRangeGroup *group);
264
265""" % (date, sources));
266output.write(
267"""/*
268 * chvalid.c:	this module implements the character range
269 *		validation APIs
270 *
271 * This file is automatically generated from the cvs source
272 * definition files using the genChRanges.py Python script
273 *
274 * Generation date: %s
275 * Sources: %s
276 * William Brack <wbrack@mmm.com.hk>
277 */
278
279#define IN_LIBXML
280#include "libxml.h"
281#include <libxml/chvalid.h>
282
283/*
284 * The initial tables ({func_name}_tab) are used to validate whether a
285 * single-byte character is within the specified group.  Each table
286 * contains 256 bytes, with each byte representing one of the 256
287 * possible characters.  If the table byte is set, the character is
288 * allowed.
289 *
290 */
291""" % (date, sources));
292
293#
294# Now output the generated data.
295# We try to produce the best execution times.  Tests have shown that validation
296# with direct table lookup is, when there are a "small" number of valid items,
297# still not as fast as a sequence of inline compares.  So, if the single-byte
298# portion of a range has a "small" number of ranges, we output a macro for inline
299# compares, otherwise we output a 256-byte table and a macro to use it.
300#
301
302fkeys = Functs.keys()	# Dictionary of all defined functions
303fkeys.sort()		# Put some order to our output
304
305for f in fkeys:
306
307# First we convert the specified single-byte values into a group of ranges.
308# If the total number of such ranges is less than minTableSize, we generate
309# an inline macro for direct comparisons; if greater, we generate a lookup
310# table.
311    if max(Functs[f][0]) > 0:	# only check if at least one entry
312        rangeTable = makeRange(Functs[f][0])
313	numRanges = len(rangeTable)
314	if numRanges >= minTableSize:	# table is worthwhile
315	    header.write("XMLPUBVAR const unsigned char %s_tab[256];\n" % f)
316	    header.write("""
317/**
318 * %s_ch:
319 * @c: char to validate
320 *
321 * Automatically generated by genChRanges.py
322 */
323""" % f)
324	    header.write("#define %s_ch(c)\t(%s_tab[(c)])\n" % (f, f))
325
326	    # write the constant data to the code file
327	    output.write("const unsigned char %s_tab[256] = {\n" % f)
328	    pline = "   "
329	    for n in range(255):
330		pline += " 0x%02x," % Functs[f][0][n]
331		if len(pline) > 72:
332		    output.write(pline + "\n")
333		    pline = "   "
334	    output.write(pline + " 0x%02x };\n\n" % Functs[f][0][255])
335
336	else:		# inline check is used
337	    # first another little optimisation - if space is present,
338	    # put it at the front of the list so it is checked first
339	    try:
340		ix = rangeTable.remove((0x20, 0x20))
341		rangeTable.insert(0, (0x20, 0x20))
342	    except:
343		pass
344	    firstFlag = 1
345
346	    header.write("""
347/**
348 * %s_ch:
349 * @c: char to validate
350 *
351 * Automatically generated by genChRanges.py
352 */
353""" % f)
354	    # okay, I'm tired of the messy lineup - let's automate it!
355	    pline = "#define %s_ch(c)" % f
356	    # 'ntab' is number of tabs needed to position to col. 33 from name end
357	    ntab = 4 - (len(pline)) / 8
358	    if ntab < 0:
359		ntab = 0
360	    just = ""
361	    for i in range(ntab):
362		just += "\t"
363	    pline = pline + just + "("
364	    for rg in rangeTable:
365		if not firstFlag:
366		    pline += " || \\\n\t\t\t\t "
367		else:
368		    firstFlag = 0
369		if rg[0] == rg[1]:		# single value - check equal
370		    pline += "((c) == 0x%x)" % rg[0]
371		else:				# value range
372		# since we are doing char, also change range ending in 0xff
373		    if rg[1] != 0xff:
374		        pline += "((0x%x <= (c)) &&" % rg[0]
375		        pline += " ((c) <= 0x%x))" % rg[1]
376		    else:
377		        pline += " (0x%x <= (c))" % rg[0]
378	    pline += ")\n"
379	    header.write(pline)
380
381    header.write("""
382/**
383 * %sQ:
384 * @c: char to validate
385 *
386 * Automatically generated by genChRanges.py
387 */
388""" % f)
389    pline = "#define %sQ(c)" % f
390    ntab = 4 - (len(pline)) / 8
391    if ntab < 0:
392	ntab = 0
393    just = ""
394    for i in range(ntab):
395	just += "\t"
396    header.write(pline + just + "(((c) < 0x100) ? \\\n\t\t\t\t ")
397    if max(Functs[f][0]) > 0:
398	header.write("%s_ch((c)) :" % f)
399    else:
400	header.write("0 :")
401
402    # if no ranges defined, value invalid if >= 0x100
403    numRanges = len(Functs[f][1])
404    if numRanges == 0:
405	header.write(" 0)\n\n")
406    else:
407	if numRanges >= minTableSize:
408	    header.write(" \\\n\t\t\t\t xmlCharInRange((c), &%sGroup))\n\n"  % f)
409	else:		# if < minTableSize, generate inline code
410	    firstFlag = 1
411	    for rg in Functs[f][1]:
412		if not firstFlag:
413		    pline += " || \\\n\t\t\t\t "
414		else:
415		    firstFlag = 0
416		    pline = "\\\n\t\t\t\t("
417		if rg[0] == rg[1]:		# single value - check equal
418		    pline += "((c) == 0x%x)" % rg[0]
419		else:				# value range
420		    pline += "((0x%x <= (c)) &&" % rg[0]
421		    pline += " ((c) <= 0x%x))" % rg[1]
422	    pline += "))\n\n"
423	    header.write(pline)
424
425
426    if len(Functs[f][1]) > 0:
427	header.write("XMLPUBVAR const xmlChRangeGroup %sGroup;\n" % f)
428
429
430#
431# Next we do the unicode ranges
432#
433
434for f in fkeys:
435    if len(Functs[f][1]) > 0:	# only generate if unicode ranges present
436	rangeTable = Functs[f][1]
437	rangeTable.sort()	# ascending tuple sequence
438	numShort = 0
439	numLong  = 0
440	for rg in rangeTable:
441	    if rg[1] < 0x10000:	# if short value
442		if numShort == 0:	# first occurrence
443		    pline = "static const xmlChSRange %s_srng[] = { " % f
444		else:
445		    pline += ", "
446		numShort += 1
447		if len(pline) > 60:
448		    output.write(pline + "\n")
449		    pline = "    "
450		pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
451	    else:		# if long value
452		if numLong == 0:	# first occurrence
453		    if numShort > 0:	# if there were shorts, finish them off
454			output.write(pline + "};\n")
455		    pline = "static const xmlChLRange %s_lrng[] = { " % f
456		else:
457		    pline += ", "
458		numLong += 1
459		if len(pline) > 60:
460		    output.write(pline + "\n")
461		    pline = "    "
462		pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
463	output.write(pline + "};\n")	# finish off last group
464
465	pline = "const xmlChRangeGroup %sGroup =\n\t{%d, %d, " % (f, numShort, numLong)
466	if numShort > 0:
467	    pline += "%s_srng" % f
468	else:
469	    pline += "(xmlChSRangePtr)0"
470	if numLong > 0:
471	    pline += ", %s_lrng" % f
472	else:
473	    pline += ", (xmlChLRangePtr)0"
474
475	output.write(pline + "};\n\n")
476
477output.write(
478"""
479/**
480 * xmlCharInRange:
481 * @val: character to be validated
482 * @rptr: pointer to range to be used to validate
483 *
484 * Does a binary search of the range table to determine if char
485 * is valid
486 *
487 * Returns: true if character valid, false otherwise
488 */
489int
490xmlCharInRange (unsigned int val, const xmlChRangeGroup *rptr) {
491    int low, high, mid;
492    const xmlChSRange *sptr;
493    const xmlChLRange *lptr;
494
495    if (rptr == NULL) return(0);
496    if (val < 0x10000) {	/* is val in 'short' or 'long'  array? */
497	if (rptr->nbShortRange == 0)
498	    return 0;
499	low = 0;
500	high = rptr->nbShortRange - 1;
501	sptr = rptr->shortRange;
502	while (low <= high) {
503	    mid = (low + high) / 2;
504	    if ((unsigned short) val < sptr[mid].low) {
505		high = mid - 1;
506	    } else {
507	        if ((unsigned short) val > sptr[mid].high) {
508		    low = mid + 1;
509		} else {
510		    return 1;
511		}
512	    }
513	}
514    } else {
515	if (rptr->nbLongRange == 0) {
516	    return 0;
517	}
518	low = 0;
519	high = rptr->nbLongRange - 1;
520	lptr = rptr->longRange;
521	while (low <= high) {
522	    mid = (low + high) / 2;
523	    if (val < lptr[mid].low) {
524		high = mid - 1;
525	    } else {
526	        if (val > lptr[mid].high) {
527		    low = mid + 1;
528		} else {
529		    return 1;
530		}
531	    }
532	}
533    }
534    return 0;
535}
536
537""");
538
539#
540# finally, generate the ABI compatibility functions
541#
542for f in fkeys:
543    output.write("""
544/**
545 * %s:
546 * @ch:  character to validate
547 *
548 * This function is DEPRECATED.
549""" % f);
550    if max(Functs[f][0]) > 0:
551        output.write(" * Use %s_ch or %sQ instead" % (f, f))
552    else:
553        output.write(" * Use %sQ instead" % f)
554    output.write("""
555 *
556 * Returns true if argument valid, false otherwise
557 */
558""")
559    output.write("int\n%s(unsigned int ch) {\n    return(%sQ(ch));\n}\n\n" % (f,f))
560    header.write("XMLPUBFUN int XMLCALL\n\t\t%s(unsigned int ch);\n" % f);
561#
562# Run complete - write trailers and close the output files
563#
564
565header.write("""
566#ifdef __cplusplus
567}
568#endif
569#endif /* __XML_CHVALID_H__ */
570""")
571
572header.close()
573
574output.write("""#define bottom_chvalid
575#include "elfgcchack.h"
576""")
577output.close()
578
579