1#!/usr/bin/python -u
2#
3# imports the API description and fills up a database with
4# name relevance to modules, functions or web pages
5#
6# Operation needed:
7# =================
8#
9# install mysqld, the python wrappers for mysql and libxml2, start mysqld
10# Change the root passwd of mysql:
11#    mysqladmin -u root password new_password
12# Create the new database xmlsoft
13#    mysqladmin -p create xmlsoft
# Create a database user 'veillard' and give him password access
15# change veillard and abcde with the right user name and passwd
16#    mysql -p
17#    password:
18#    mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
19#           IDENTIFIED BY 'abcde' WITH GRANT OPTION;
20#
21# As the user check the access:
22#    mysql -p xmlsoft
23#    Enter password:
24#    Welcome to the MySQL monitor....
25#    mysql> use xmlsoft
26#    Database changed
27#    mysql> quit
28#    Bye
29#
# Then run the script in the doc subdir, it will create the symbols and
# word tables and populate them with information extracted from
# the libxml2-api.xml API description, and make them accessible read-only
# by nobody@localhost, the user expected to be Apache's one
34#
35# On the Apache configuration, make sure you have php support enabled
36#
37
38import MySQLdb
39import libxml2
40import sys
41import string
42import os
43
44#
45# We are not interested in parsing errors here
46#
def callback(ctx, str):
    """Error handler given to libxml2; every reported message is dropped.

    ctx -- the value supplied when the handler was registered (None here)
    str -- the error text passed in by the library (ignored)
    """
    return None


# Register the do-nothing handler: parsing errors are of no interest here.
libxml2.registerErrorHandler(callback, None)
50
51#
52# The dictionary of tables required and the SQL command needed
53# to create them
54#
# Maps each required table name to the CREATE TABLE statement that
# createTable() executes for it; checkTables() walks the keys.
TABLES={
  # symbols: one row per symbol name (unique) with its module, type and
  # description; written by updateSymbol().
  "symbols" : """CREATE TABLE symbols (
           name varchar(255) BINARY NOT NULL,
	   module varchar(255) BINARY NOT NULL,
           type varchar(25) NOT NULL,
	   descr varchar(255),
	   UNIQUE KEY name (name),
	   KEY module (module))""",
  # words: relevance of a word for a symbol, unique per (name, symbol);
  # written by updateWord().
  "words" : """CREATE TABLE words (
           name varchar(50) BINARY NOT NULL,
	   symbol varchar(255) BINARY NOT NULL,
           relevance int,
	   KEY name (name),
	   KEY symbol (symbol),
	   UNIQUE KEY ID (name, symbol))""",
  # wordsHTML: relevance of a word for a page resource, with the section
  # and id recorded alongside; written by updateWordHTML().
  "wordsHTML" : """CREATE TABLE wordsHTML (
           name varchar(50) BINARY NOT NULL,
	   resource varchar(255) BINARY NOT NULL,
	   section varchar(255),
	   id varchar(50),
           relevance int,
	   KEY name (name),
	   KEY resource (resource),
	   UNIQUE KEY ref (name, resource))""",
  # wordsArchive: relevance of a word for an archive entry ID; written by
  # updateWordArchive().
  "wordsArchive" : """CREATE TABLE wordsArchive (
           name varchar(50) BINARY NOT NULL,
	   ID int(11) NOT NULL,
           relevance int,
	   KEY name (name),
	   UNIQUE KEY ref (name, ID))""",
  # pages: title of each page resource; written by addPage().
  "pages" : """CREATE TABLE pages (
           resource varchar(255) BINARY NOT NULL,
	   title varchar(255) BINARY NOT NULL,
	   UNIQUE KEY name (resource))""",
  # archives: auto-numbered archive entries (resource + title); written by
  # addXMLMsgArchive() and looked up by checkXMLMsgArchive().
  "archives" : """CREATE TABLE archives (
           ID int(11) NOT NULL auto_increment,
           resource varchar(255) BINARY NOT NULL,
	   title varchar(255) BINARY NOT NULL,
	   UNIQUE KEY id (ID,resource(255)),
	   INDEX (ID),
	   INDEX (resource))""",
  # Queries / AllQueries: Value + Count rows.  Nothing in the visible part
  # of this file fills them; checkTables() grants nobody@localhost
  # INSERT/UPDATE on Queries -- presumably the web front end records
  # searches there (TODO confirm).
  "Queries" : """CREATE TABLE Queries (
           ID int(11) NOT NULL auto_increment,
	   Value varchar(50) NOT NULL,
	   Count int(11) NOT NULL,
	   UNIQUE KEY id (ID,Value(35)),
	   INDEX (ID))""",
  "AllQueries" : """CREATE TABLE AllQueries (
           ID int(11) NOT NULL auto_increment,
	   Value varchar(50) NOT NULL,
	   Count int(11) NOT NULL,
	   UNIQUE KEY id (ID,Value(35)),
	   INDEX (ID))""",
}
109
110#
111# The XML API description file to parse
112#
API="libxml2-api.xml"
# Module-wide MySQL connection; stays None until openMySQL() assigns it.
DB=None
115
116#########################################################################
117#									#
118#                  MySQL database interfaces				#
119#									#
120#########################################################################
def createTable(db, name):
    """Drop table `name` if it exists and recreate it from TABLES.

    db   -- an open database connection (must provide cursor())
    name -- key of the TABLES dictionary naming the table to create

    Returns the value of the CREATE TABLE execute() call, or -1 when an
    argument is missing or the creation fails (including an unknown name).
    """
    if db is None or name is None:
        return -1
    cursor = db.cursor()

    ret = cursor.execute("DROP TABLE IF EXISTS %s" % (name))
    if ret == 1:
        print("Removed table %s" % (name))
    print("Creating table %s" % (name))
    try:
        ret = cursor.execute(TABLES[name])
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still abort.
        print("Failed to create table %s" % (name))
        return -1
    return ret
140
def checkTables(db, verbose = 1):
    """Make sure every table listed in TABLES exists and is readable.

    Missing tables are created with createTable(); a table whose row count
    cannot be read is sent through "repair table" and counted again.
    Finally read access is granted to nobody@localhost.

    db      -- an open database connection
    verbose -- when true, print the table and record counts

    Returns 0, or -1 when db is None.
    """
    if db is None:
        return -1
    cursor = db.cursor()
    nbtables = cursor.execute("show tables")
    if verbose:
        print("Found %d tables" % (nbtables))
    existing = {}
    for row in cursor.fetchall():
        existing[row[0]] = {}

    for table in TABLES.keys():
        if table not in existing:
            print("table %s missing" % (table))
            createTable(db, table)
        try:
            cursor.execute("SELECT count(*) from %s" % table)
            row = cursor.fetchone()
            if verbose:
                print("Table %s contains %d records" % (table, row[0]))
        except Exception:
            print("Troubles with table %s : repairing" % (table))
            ret = cursor.execute("repair table %s" % table)
            print("repairing returned %d" % (ret))
            cursor.execute("SELECT count(*) from %s" % table)
            row = cursor.fetchone()
            print("Table %s contains %d records" % (table, row[0]))
    if verbose:
        print("checkTables finished")

    # make sure apache can access the tables read-only; this stays
    # best-effort, but a failure is now reported instead of hidden
    try:
        cursor.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
        cursor.execute("GRANT INSERT,SELECT,UPDATE  ON xmlsoft.Queries TO nobody@localhost")
    except Exception:
        print("Could not grant access to nobody@localhost: %s" %
              (sys.exc_info()[1],))
    return 0
184
def openMySQL(db="xmlsoft", passwd=None, verbose = 1):
    """Connect to MySQL, store the connection in DB and check the tables.

    db      -- database name
    passwd  -- password; when None it is read from the MySQL_PASS
               environment variable, and the script exits with status 1
               if that variable is not set
    verbose -- passed on to checkTables()

    Returns the result of checkTables(), or -1 if no connection object
    was obtained.
    """
    global DB

    if passwd is None:
        passwd = os.environ.get("MySQL_PASS")
        if passwd is None:
            print("No password available, set environment MySQL_PASS")
            sys.exit(1)

    DB = MySQLdb.connect(passwd=passwd, db=db)
    if DB is None:
        return -1
    return checkTables(DB, verbose)
200
def updateWord(name, symbol, relevance):
    """Store the relevance of word `name` for `symbol` in the words table.

    An INSERT is tried first; if it fails (typically because the
    (name, symbol) pair already exists) the row is UPDATEd instead.
    Values are passed as query parameters so the driver escapes them.

    Returns the execute() result, or -1 on missing arguments, missing
    connection or failure of both statements.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or symbol is None:
        return -1

    cursor = DB.cursor()
    try:
        ret = cursor.execute(
            "INSERT INTO words (name, symbol, relevance) VALUES (%s, %s, %s)",
            (name, symbol, relevance))
    except Exception:
        try:
            ret = cursor.execute(
                "UPDATE words SET relevance = %s where name = %s and symbol = %s",
                (relevance, name, symbol))
        except Exception:
            print("Update word (%s, %s, %s) failed command" % (name, symbol, relevance))
            print("UPDATE words SET relevance = %s where name = '%s' and symbol = '%s'" % (relevance, name, symbol))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
230
def updateSymbol(name, module, type, desc):
    """Record symbol `name` (module, type, description) in the symbols table.

    The symbol name is first indexed as a word for itself with relevance
    50.  The description is reduced to its first sentence (text before the
    first '.'), with quotes replaced by spaces, and cut to 99 characters;
    an unusable description becomes "".  INSERT is tried first, UPDATE on
    failure.  Values are passed as query parameters.

    Returns the execute() result, or -1 on missing arguments, missing
    connection or failure of both statements.
    """
    updateWord(name, name, 50)
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or module is None or type is None:
        return -1

    try:
        desc = desc.replace("'", " ").split(".")[0][0:99]
    except (AttributeError, TypeError):
        # desc was None or not a string
        desc = ""

    cursor = DB.cursor()
    try:
        ret = cursor.execute(
            "INSERT INTO symbols (name, module, type, descr) VALUES (%s, %s, %s, %s)",
            (name, module, type, desc))
    except Exception:
        try:
            ret = cursor.execute(
                "UPDATE symbols SET module=%s, type=%s, descr=%s where name=%s",
                (module, type, desc, name))
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, module, type))
            print("""UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" % (module, type, desc, name))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
271
def addFunction(name, module, desc = ""):
    """Register `name` from `module` as a symbol of type 'function'.

    Returns whatever updateSymbol() returns.
    """
    kind = 'function'
    return updateSymbol(name, module, kind, desc)
274
def addMacro(name, module, desc = ""):
    """Register `name` from `module` as a symbol of type 'macro'.

    Returns whatever updateSymbol() returns.
    """
    kind = 'macro'
    return updateSymbol(name, module, kind, desc)
277
def addEnum(name, module, desc = ""):
    """Register `name` from `module` as a symbol of type 'enum'.

    Returns whatever updateSymbol() returns.
    """
    kind = 'enum'
    return updateSymbol(name, module, kind, desc)
280
def addStruct(name, module, desc = ""):
    """Register `name` from `module` as a symbol of type 'struct'.

    Returns whatever updateSymbol() returns.
    """
    kind = 'struct'
    return updateSymbol(name, module, kind, desc)
283
def addConst(name, module, desc = ""):
    """Register `name` from `module` as a symbol of type 'const'.

    Returns whatever updateSymbol() returns.
    """
    kind = 'const'
    return updateSymbol(name, module, kind, desc)
286
def addType(name, module, desc = ""):
    """Register `name` from `module` as a symbol of type 'type'.

    Returns whatever updateSymbol() returns.
    """
    kind = 'type'
    return updateSymbol(name, module, kind, desc)
289
def addFunctype(name, module, desc = ""):
    """Register `name` from `module` as a symbol of type 'functype'.

    Returns whatever updateSymbol() returns.
    """
    kind = 'functype'
    return updateSymbol(name, module, kind, desc)
292
def addPage(resource, title):
    """Record the title of page `resource` in the pages table.

    INSERT is tried first, UPDATE of the title on failure.  Values are
    passed as query parameters, so titles containing quotes are stored
    correctly.

    Returns the execute() result, or -1 on missing resource, missing
    connection or failure of both statements.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if resource is None:
        return -1

    cursor = DB.cursor()
    try:
        ret = cursor.execute(
            "INSERT INTO pages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except Exception:
        try:
            ret = cursor.execute(
                "UPDATE pages SET title=%s WHERE resource=%s",
                (title, resource))
        except Exception:
            # The original message formatted undefined names here and so
            # raised NameError instead of reporting the failure.
            print("Update page (%s, %s) failed command" % (resource, title))
            print("""UPDATE pages SET title='%s' WHERE resource='%s'""" % (title, resource))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
320
def updateWordHTML(name, resource, desc, id, relevance):
    """Store the relevance of word `name` for page `resource` in wordsHTML.

    desc -- section text; quotes are replaced by spaces and the text is cut
            to 99 characters, None or an unusable value becomes ""
    id   -- anchor id within the page, None becomes ""

    INSERT is tried first, UPDATE on failure.  Values are passed as query
    parameters.

    Returns the execute() result, or -1 on missing arguments, missing
    connection or failure of both statements.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or resource is None:
        return -1
    if id is None:
        id = ""
    if desc is None:
        desc = ""
    else:
        try:
            desc = desc.replace("'", " ")[0:99]
        except (AttributeError, TypeError):
            desc = ""

    cursor = DB.cursor()
    try:
        ret = cursor.execute(
            "INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES (%s, %s, %s, %s, %s)",
            (name, resource, desc, id, relevance))
    except Exception:
        try:
            ret = cursor.execute(
                "UPDATE wordsHTML SET section=%s, id=%s, relevance=%s where name=%s and resource=%s",
                (desc, id, relevance, name, resource))
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, resource, relevance))
            print("""UPDATE wordsHTML SET section='%s', id='%s', relevance='%s' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
360
def checkXMLMsgArchive(url):
    """Look up the archives.ID recorded for `url`.

    The URL is passed as a query parameter rather than pasted into the
    SQL text.

    Returns the ID, or -1 when the URL is unknown, missing, the
    connection is unavailable or the query fails.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if url is None:
        return -1

    cursor = DB.cursor()
    try:
        cursor.execute("SELECT ID FROM archives WHERE resource=%s", (url,))
        row = cursor.fetchone()
    except Exception:
        return -1
    if row is None:
        return -1

    return row[0]
382
def addXMLMsgArchive(url, title):
    """Insert `url` into the archives table and return its new ID.

    title -- message title; None becomes "", otherwise quotes are replaced
             by spaces and the text is cut to 99 characters

    Values are passed as query parameters.

    Returns the integer ID, or -1 on missing URL, missing connection,
    query failure or when the ID cannot be read back.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if url is None:
        return -1
    if title is None:
        title = ""
    else:
        title = title.replace("'", " ")[0:99]

    cursor = DB.cursor()
    cmd = "INSERT INTO archives (resource, title) VALUES (%s, %s)"
    try:
        cursor.execute(cmd, (url, title))
        cmd = "SELECT ID FROM archives WHERE resource=%s"
        cursor.execute(cmd, (url,))
        row = cursor.fetchone()
    except Exception:
        print("addXMLMsgArchive failed command: %s [%s]" % (cmd, url))
        return -1
    if row is None:
        print("addXMLMsgArchive failed to get the ID: %s" % (url))
        return -1

    return int(row[0])
413
def updateWordArchive(name, id, relevance):
    """Store the relevance of word `name` for archive entry `id`.

    INSERT into wordsArchive is tried first, UPDATE on failure.  Values
    are passed as query parameters.

    Returns the execute() result, or -1 on missing arguments, missing
    connection or failure of both statements.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or id is None:
        return -1

    cursor = DB.cursor()
    try:
        ret = cursor.execute(
            "INSERT INTO wordsArchive (name, id, relevance) VALUES (%s, %s, %s)",
            (name, id, relevance))
    except Exception:
        try:
            ret = cursor.execute(
                "UPDATE wordsArchive SET relevance=%s where name=%s and ID=%s",
                (relevance, name, id))
        except Exception:
            print("Update word archive (%s, %s, %s) failed command" % (name, id, relevance))
            print("""UPDATE wordsArchive SET relevance='%s' where name='%s' and ID='%s'""" % (relevance, name, id))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
443
444#########################################################################
445#									#
446#                  Word dictionary and analysis routines		#
447#									#
448#########################################################################
449
450#
# the top 100 English words, minus those shorter than 3 letters, plus our own set
452#
# Words that are never indexed.  Only membership is tested by the addWord*
# functions; every key maps to 0.
dropWords = dict.fromkeys((
    'the', 'this', 'can', 'man', 'had', 'him', 'only',
    'and', 'not', 'been', 'other', 'even', 'are', 'was',
    'new', 'most', 'but', 'when', 'some', 'made', 'from',
    'who', 'could', 'after', 'that', 'will', 'time', 'also',
    'have', 'more', 'these', 'did', 'two', 'many',
    'they', 'may', 'before', 'for', 'which', 'out', 'then',
    'must', 'one', 'through', 'with', 'you', 'said',
    'first', 'back', 'were', 'what', 'any', 'years', 'his',
    'her', 'where', 'all', 'its', 'now', 'much', 'she',
    'about', 'such', 'your', 'there', 'into', 'like',
    'would', 'than', 'our', 'well', 'their', 'them', 'over',
    'down',
    'net', 'www', 'bad', 'Okay', 'bin', 'cur',
), 0)
468
# In-memory indexes filled while parsing.
# word -> {(module, symbol): relevance}, or None once addWord() dropped the word
wordsDict = {}
# word -> {resource: (relevance, id, section)}, see addWordHTML()
wordsDictHTML = {}
# word -> {archive ID: relevance}, see addWordArchive()
wordsDictArchive = {}
472
def cleanupWordsString(str):
    """Return `str` with punctuation and line breaks turned into spaces.

    Each of the characters listed below is replaced by one space, so the
    result can be split on whitespace to obtain words.
    """
    separators = (
        ".", "!", "?", ",", "'", '"', ";", "(", ")", "{", "}", "<", ">",
        "=", "/", "*", ":", "#", "\\", "\n", "\r", "\xc2", "\xa0",
    )
    for mark in separators:
        str = str.replace(mark, " ")
    return str
498
def cleanupDescrString(str):
    """Return `str` as a one-line description with single spaces.

    Quotes, line breaks and the "\\xc2"/"\\xa0" characters are replaced by
    spaces, then runs of whitespace are collapsed to one space.

    The original joined the characters of the string itself instead of the
    split word list, which put a space between every character.
    """
    for mark in ("'", "\n", "\r", "\xc2", "\xa0"):
        str = str.replace(mark, " ")
    return " ".join(str.split())
508
def splitIdentifier(str):
    """Split an identifier into lower-case word fragments.

    A fragment starts at a letter, continues over any following upper-case
    letters and then over any following lower-case letters.  Digits right
    after a fragment are dropped; any other non-letter is skipped.

    Returns the list of fragments in order of appearance.
    """
    pieces = []
    pos = 0
    size = len(str)
    while pos < size:
        piece = str[pos].lower()
        pos = pos + 1
        if piece < 'a' or piece > 'z':
            continue
        while pos < size and 'A' <= str[pos] <= 'Z':
            piece = piece + str[pos].lower()
            pos = pos + 1
        while pos < size and 'a' <= str[pos] <= 'z':
            piece = piece + str[pos]
            pos = pos + 1
        while pos < size and '0' <= str[pos] <= '9':
            pos = pos + 1
        pieces.append(piece)
    return pieces
526
def addWord(word, module, symbol, relevance):
    """Add `relevance` for `word` on (module, symbol) in wordsDict.

    Returns -1 for a missing argument or a word shorter than 3 characters,
    0 when the word is ignored (listed in dropWords, first character code
    above 0x80, or already dropped), otherwise the accumulated relevance
    now stored for the pair.

    A word whose entry already holds more than 500 references is dropped:
    its entry is set to None and it is ignored from then on.
    """
    if word is None or len(word) < 3:
        return -1
    if module is None or symbol is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    key = (module, symbol)
    if word in wordsDict:
        refs = wordsDict[word]
        if refs is None:
            return 0
        if len(refs) > 500:
            wordsDict[word] = None
            return 0
        # explicit membership test instead of a bare except around the sum
        if key in refs:
            relevance = relevance + refs[key]
    else:
        refs = wordsDict[word] = {}
    refs[key] = relevance
    return relevance
554
def addString(str, module, symbol, relevance):
    """Index every word of `str` for (module, symbol) with `relevance`.

    The text is cleaned with cleanupWordsString(), split on whitespace and
    each word longer than 2 characters is passed to addWord().

    The original ignored `relevance` and always used 5, so callers asking
    for a different weight were silently overridden.

    Returns the sum of the addWord() results, or -1 when `str` is None or
    shorter than 3 characters.
    """
    if str is None or len(str) < 3:
        return -1
    total = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            total = total + addWord(word, module, symbol, relevance)

    return total
566
def addWordHTML(word, resource, id, section, relevance):
    """Add `relevance` for `word` on page `resource` in wordsDictHTML.

    The section text is normalised with cleanupDescrString().  When the
    word is already recorded for the resource, the stored id and section
    are kept (unless they are None) and the relevance is accumulated.

    Returns -1 for a missing argument or a word shorter than 3 characters,
    0 when the word is ignored, otherwise the relevance now stored.
    """
    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    section = cleanupDescrString(section)

    if word in wordsDictHTML:
        refs = wordsDictHTML[word]
        if refs is None:
            print("skipped %s" % (word))
            return 0
        # explicit membership test instead of a bare except around the lookup
        if resource in refs:
            (old_relevance, old_id, old_section) = refs[resource]
            if old_id is not None:
                id = old_id
            if old_section is not None:
                section = old_section
            relevance = relevance + old_relevance
    else:
        refs = wordsDictHTML[word] = {}
    refs[resource] = (relevance, id, section)
    return relevance
600
def addStringHTML(str, resource, id, section, relevance):
    """Index every word of `str` for page `resource`.

    The text is cleaned with cleanupWordsString(), split on whitespace and
    each word longer than 2 characters is passed to addWordHTML().
    Failures are reported and, as in addStringArchive(), no longer
    subtracted from the total.

    Returns the sum of the successful addWordHTML() results, or -1 when
    `str` is None or shorter than 3 characters.
    """
    if str is None or len(str) < 3:
        return -1
    total = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            r = addWordHTML(word, resource, id, section, relevance)
        except Exception:
            print("addWordHTML failed: %s %s %s" % (word, resource, relevance))
            print("%s %s" % sys.exc_info()[:2])
            continue
        if r < 0:
            print("addWordHTML failed: %s %s" % (word, resource))
        else:
            total = total + r

    return total
619
def addWordArchive(word, id, relevance):
    """Add `relevance` for `word` on archive entry `id` in wordsDictArchive.

    Returns -1 for a missing word/id (id of -1 counts as missing) or a
    word shorter than 3 characters, 0 when the word is ignored, otherwise
    the accumulated relevance now stored for the pair.
    """
    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word in wordsDictArchive:
        refs = wordsDictArchive[word]
        if refs is None:
            print("skipped %s" % (word))
            return 0
        # explicit membership test instead of a bare except around the lookup
        if id in refs:
            relevance = relevance + refs[id]
    else:
        refs = wordsDictArchive[word] = {}
    refs[id] = relevance
    return relevance
647
def addStringArchive(str, id, relevance):
    """Index every word of `str` for archive entry `id`.

    The text is cleaned with cleanupWordsString(), split on whitespace and
    each word longer than 2 characters is passed to addWordArchive().
    Failures are reported and left out of the total.

    Returns the sum of the successful addWordArchive() results, or -1 when
    `str` is None or shorter than 3 characters.
    """
    if str is None or len(str) < 3:
        return -1
    total = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            r = addWordArchive(word, id, relevance)
        except Exception:
            print("addWordArchive failed: %s %s %s" % (word, id, relevance))
            print("%s %s" % sys.exc_info()[:2])
            continue
        if r < 0:
            print("addWordArchive failed: %s %s" % (word, id))
        else:
            total = total + r
    return total
667
668#########################################################################
669#									#
670#                  XML API description analysis				#
671#									#
672#########################################################################
673
def loadAPI(filename):
    """Parse `filename` with libxml2 and return the resulting document."""
    parsed = libxml2.parseFile(filename)
    print("loaded %s" % (filename))
    return parsed
678
def foundExport(file, symbol):
    """Register exported `symbol` of module `file` as a function.

    The symbol is recorded with addFunction() and each fragment of its
    name is indexed with relevance 10.

    Returns 1, or 0 when either argument is None.
    """
    if file is None or symbol is None:
        return 0
    addFunction(symbol, file)
    for fragment in splitIdentifier(symbol):
        addWord(fragment, file, symbol, 10)
    return 1
689
def analyzeAPIFile(top):
    """Process the <exports> children of one <file> element.

    top -- the <file> node; its "name" attribute is the module name

    Text nodes are skipped, each <exports> child is passed to
    foundExport(), anything else is reported.  The report used to raise
    TypeError because its format string had two placeholders and only one
    argument.

    Returns the number of exports registered.
    """
    count = 0
    name = top.prop("name")
    node = top.children
    while node is not None:
        if node.type != 'text':
            if node.name == "exports":
                count = count + foundExport(name, node.prop("symbol"))
            else:
                print("unexpected element %s in API doc <file name='%s'>" %
                      (node.name, name))
        node = node.next
    return count
704
def analyzeAPIFiles(top):
    """Process every <file> child of the <files> element `top`.

    Text nodes are skipped, <file> children go to analyzeAPIFile(),
    anything else is reported.

    Returns the sum of the analyzeAPIFile() results.
    """
    total = 0
    node = top.children
    while node is not None:
        if node.type != 'text':
            if node.name == "file":
                total = total + analyzeAPIFile(node)
            else:
                print("unexpected element %s in API doc <files>" % (node.name))
        node = node.next
    return total
719
def analyzeAPIEnum(top):
    """Register the enum described by node `top`.

    The "file" and "name" attributes give the module and the symbol; the
    symbol is recorded with addEnum() and each fragment of its name is
    indexed with relevance 10.

    Returns 1, or 0 when either attribute is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    enum_name = top.prop("name")
    if enum_name is None:
        return 0

    addEnum(enum_name, module)
    for fragment in splitIdentifier(enum_name):
        addWord(fragment, module, enum_name, 10)

    return 1
734
def analyzeAPIConst(top):
    """Register the constant described by node `top`.

    The "file" and "name" attributes give the module and the symbol; the
    symbol is recorded with addConst() and each fragment of its name is
    indexed with relevance 10.

    Returns 1, or 0 when either attribute is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    const_name = top.prop("name")
    if const_name is None:
        return 0

    addConst(const_name, module)
    for fragment in splitIdentifier(const_name):
        addWord(fragment, module, const_name, 10)

    return 1
749
def analyzeAPIType(top):
    """Register the typedef described by node `top`.

    The "file" and "name" attributes give the module and the symbol; the
    symbol is recorded with addType() and each fragment of its name is
    indexed with relevance 10.

    Returns 1, or 0 when either attribute is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    type_name = top.prop("name")
    if type_name is None:
        return 0

    addType(type_name, module)
    for fragment in splitIdentifier(type_name):
        addWord(fragment, module, type_name, 10)
    return 1
763
def analyzeAPIFunctype(top):
    """Register the function type described by node `top`.

    The "file" and "name" attributes give the module and the symbol; the
    symbol is recorded with addFunctype() and each fragment of its name is
    indexed with relevance 10.

    Returns 1, or 0 when either attribute is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    functype_name = top.prop("name")
    if functype_name is None:
        return 0

    addFunctype(functype_name, module)
    for fragment in splitIdentifier(functype_name):
        addWord(fragment, module, functype_name, 10)
    return 1
777
def analyzeAPIStruct(top):
    """Register the struct described by node `top`.

    The "file" and "name" attributes give the module and the symbol; the
    symbol is recorded with addStruct() and each fragment of its name is
    indexed with relevance 10.  When an "info" attribute is present, its
    words longer than 2 characters are indexed with relevance 5.

    Returns 1, or 0 when "file" or "name" is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    struct_name = top.prop("name")
    if struct_name is None:
        return 0

    addStruct(struct_name, module)
    for fragment in splitIdentifier(struct_name):
        addWord(fragment, module, struct_name, 10)

    info = top.prop("info")
    if info is None:
        return 1
    for word in info.replace("'", " ").strip().split():
        if len(word) > 2:
            addWord(word, module, struct_name, 5)
    return 1
800
def analyzeAPIMacro(top):
    """Register the macro described by node `top`.

    The "file" and "name" attributes give the module and the symbol
    (quotes in the name become spaces, then it is stripped).  The content
    of the first <info> child, if any, is used as description.  Name
    fragments are indexed with relevance 10, description words longer than
    2 characters with relevance 5.

    Returns 1 when a description was found, 0 when "file" or "name" is
    missing or when the macro has no <info> (it is still recorded then).
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    # look for the first non-text child called <info>
    info = None
    child = top.children
    while child is not None:
        if child.type != 'text' and child.name == "info":
            info = child.content
            break
        child = child.next

    for fragment in splitIdentifier(symbol):
        addWord(fragment, module, symbol, 10)

    if info is None:
        addMacro(symbol, module)
        print("Macro %s description has no <info>" % (symbol))
        return 0

    info = info.replace("'", " ").strip()
    addMacro(symbol, module, info)
    for word in info.split():
        if len(word) > 2:
            addWord(word, module, symbol, 5)
    return 1
839
def analyzeAPIFunction(top):
    """Register the function described by node `top`.

    The "file" and "name" attributes give the module and the symbol
    (quotes in the name become spaces, then it is stripped).  Children are
    handled as follows:
      <info>   -- its content becomes the description (the last one wins)
      <return> -- its "info" attribute is indexed via addString(.., 7)
      <arg>    -- its "info" attribute is indexed via addString(.., 5) and
                  its "name" attribute via addWord(.., 7)
    Afterwards the function is recorded with addFunction(), the
    description is indexed via addString(.., 5) and the name fragments
    with relevance 10.

    Returns 1, or 0 when "file" or "name" is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    info = None
    child = top.children
    while child is not None:
        if child.type != 'text':
            tag = child.name
            if tag == "info":
                info = child.content
            elif tag == "return":
                rinfo = child.prop("info")
                if rinfo is not None:
                    rinfo = rinfo.replace("'", " ").strip()
                    addString(rinfo, module, symbol, 7)
            elif tag == "arg":
                ainfo = child.prop("info")
                if ainfo is not None:
                    ainfo = ainfo.replace("'", " ").strip()
                    addString(ainfo, module, symbol, 5)
                argname = child.prop("name")
                if argname is not None:
                    argname = argname.replace("'", " ").strip()
                    addWord(argname, module, symbol, 7)
        child = child.next

    if info is None:
        print("Function %s description has no <info>" % (symbol))
        addFunction(symbol, module, "")
    else:
        info = info.replace("'", " ").strip()
        addFunction(symbol, module, info)
        addString(info, module, symbol, 5)

    for fragment in splitIdentifier(symbol):
        addWord(fragment, module, symbol, 10)

    return 1
890
def analyzeAPISymbols(top):
    """Dispatch every child of the <symbols> element to its analyzer.

    Text nodes are skipped; macro, function, const, typedef, struct, enum
    and functype children go to the matching analyzeAPI*() function and
    anything else is reported.  The report now names <symbols>; it used to
    say <files>, copied from analyzeAPIFiles().

    Returns the sum of the analyzer results.
    """
    count = 0
    node = top.children

    while node is not None:
        if node.type != 'text':
            tag = node.name
            if tag == "macro":
                count = count + analyzeAPIMacro(node)
            elif tag == "function":
                count = count + analyzeAPIFunction(node)
            elif tag == "const":
                count = count + analyzeAPIConst(node)
            elif tag == "typedef":
                count = count + analyzeAPIType(node)
            elif tag == "struct":
                count = count + analyzeAPIStruct(node)
            elif tag == "enum":
                count = count + analyzeAPIEnum(node)
            elif tag == "functype":
                count = count + analyzeAPIFunctype(node)
            else:
                print("unexpected element %s in API doc <symbols>" % (tag))
        node = node.next
    return count
917
def analyzeAPI(doc):
    """Walk the API description document and index its <symbols>.

    The root element must be <api>.  <files> children are deliberately
    ignored, <symbols> children go to analyzeAPISymbols(), anything else
    is reported.

    Returns the number of symbols counted, or -1 when `doc` is None or the
    root element is not <api>.
    """
    if doc is None:
        return -1
    root = doc.getRootElement()
    if root.name != "api":
        print("Unexpected root name")
        return -1

    total = 0
    node = root.children
    while node is not None:
        if node.type != 'text':
            if node.name == "files":
                # the <files> section is not analyzed
                pass
            elif node.name == "symbols":
                total = total + analyzeAPISymbols(node)
            else:
                print("unexpected element %s in API doc" % (node.name))
        node = node.next
    return total
940
941#########################################################################
942#									#
943#                  Web pages parsing and analysis			#
944#									#
945#########################################################################
946
947import glob
948
def analyzeHTMLText(doc, resource, p, section, id):
    """Index the content of text node `p` for page `resource`.

    The content is passed to addStringHTML() with relevance 5.  `doc` is
    not used.

    Returns the addStringHTML() result, or -1 when reading or indexing the
    content fails.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still abort.
        return -1
957
def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the content of paragraph node `p` for page `resource`.

    The content is passed to addStringHTML() with relevance 5.  `doc` is
    not used.

    Returns the addStringHTML() result, or -1 when reading or indexing the
    content fails.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still abort.
        return -1
966
def analyzeHTMLPre(doc, resource, p, section, id):
    """Index the content of preformatted node `p` for page `resource`.

    The content is passed to addStringHTML() with relevance 5.  `doc` is
    not used.

    Returns the addStringHTML() result, or -1 when reading or indexing the
    content fails.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still abort.
        return -1
975
# NOTE(review): this five-argument analyzeHTML is rebound by the
# two-argument analyzeHTML(doc, resource) defined right after it, so once
# the module is loaded this version can no longer be reached by name.
# Its body is the same as analyzeHTMLText/Para/Pre: index the content of
# node p for the page with relevance 5 and return the word score, or -1 on
# any error.  Candidate for removal -- confirm nothing relies on it.
def analyzeHTML(doc, resource, p, section, id):
    words = 0
    try:
	content = p.content
	words = words + addStringHTML(content, resource, id, section, 5)
    except:
        return -1
    return words
984
def analyzeHTML(doc, resource):
    """Index the text of the parsed HTML page `doc` under `resource`.

    The page title (//head/title, or "Page <resource>" when missing) is
    recorded with addPage().  Headings h1/h2/h3 set the current section
    and anchor id (their "id" or "name" attribute); every text node is
    indexed under the current section.

    The XPath context is now released with xpathFreeContext() once the
    page has been processed; the libxml2 bindings do not free it on their
    own.

    Returns the number of nodes handed to the analyzeHTML*() helpers.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        try:
            title = ctxt.xpathEval("//head/title")[0].content
        except Exception:
            title = "Page %s" % (resource)
        addPage(resource, title)
        try:
            items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
            section = title
            id = ""
            for item in items:
                if item.name in ('h1', 'h2', 'h3'):
                    section = item.content
                    if item.prop("id"):
                        id = item.prop("id")
                    elif item.prop("name"):
                        id = item.prop("name")
                elif item.type == 'text':
                    analyzeHTMLText(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'p':
                    analyzeHTMLPara(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'pre':
                    analyzeHTMLPre(doc, resource, item, section, id)
                    para = para + 1
                else:
                    print("Page %s, unexpected %s element" % (resource, item.name))
        except Exception:
            print("Page %s: problem analyzing" % (resource))
            print("%s %s" % sys.exc_info()[:2])
    finally:
        ctxt.xpathFreeContext()

    return para
1021
def analyzeHTMLPages():
    """Parse and index the HTML pages of the current directory.

    Every *.html and tutorial/*.html file is handled, except files whose
    name starts with "API" and xml.html.  A page is parsed as XML first and
    as HTML when that fails; a page that cannot be parsed either way is
    reported and skipped instead of aborting the run.  Each document is
    released with freeDoc() after analysis.

    Returns the number of pages analyzed.
    """
    ret = 0
    HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
    for html in HTMLfiles:
        if html.startswith("API") or html == "xml.html":
            continue
        try:
            doc = libxml2.parseFile(html)
        except Exception:
            try:
                doc = libxml2.htmlParseFile(html, None)
            except Exception:
                print("could not parse %s" % (html))
                continue
        try:
            try:
                res = analyzeHTML(doc, html)
                print("Parsed %s : %d paragraphs" % (html, res))
                ret = ret + 1
            except Exception:
                print("could not parse %s" % (html))
        finally:
            doc.freeDoc()
    return ret
1041
1042#########################################################################
1043#									#
1044#                  Mail archives parsing and analysis			#
1045#									#
1046#########################################################################
1047
1048import time
1049
def getXMLDateArchive(t = None):
    """Return the URL of the by-date index of the xml mailing list archive.

    t -- time in seconds since the epoch; the current time when None

    The URL contains the UTC year and the full month name as produced by
    time.strftime("%B").
    """
    if t is None:
        t = time.time()
    stamp = time.gmtime(t)
    return "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (
        stamp[0], time.strftime("%B", stamp))
1058
def scanXMLMsgArchive(url, title, force = 0):
    """Download one archived message and index its title and text.

    url   -- address of the message page
    title -- message title, indexed with relevance 20
    force -- when 0, a message already present in the archives table is
             skipped

    The text found under <pre> elements is indexed with relevance 5.  The
    parsed document and its XPath context are now released once the
    message has been indexed.

    Returns 1 when the message was indexed, 0 otherwise.
    """
    if url is None or title is None:
        return 0

    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0

    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    print("Loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return 0

    try:
        addStringArchive(title, ID, 20)
        ctxt = doc.xpathNewContext()
        try:
            for text in ctxt.xpathEval("//pre//text()"):
                addStringArchive(text.content, ID, 5)
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return 1
1088
def scanXMLDateArchive(t = None, force = 0):
    """Index the messages listed on one monthly archive index page.

    t     -- time selecting the month (see getXMLDateArchive())
    force -- passed on to scanXMLMsgArchive()

    wordsDictArchive is reset first.  Every link whose href starts with
    "msg" is scanned; a leading "Re: " and then a leading "[xml] " are
    removed from the link text to obtain the title.  A failure on one
    message is now reported instead of being silently ignored, and the
    index document and its XPath context are released at the end.

    Returns the number of messages newly indexed, or -1 when the index
    page cannot be parsed.
    """
    global wordsDictArchive

    wordsDictArchive = {}

    url = getXMLDateArchive(t)
    print("loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return -1

    newmsg = 0
    try:
        ctxt = doc.xpathNewContext()
        try:
            for anchor in ctxt.xpathEval("//a[@href]"):
                href = anchor.prop("href")
                if href is None or href[0:3] != "msg":
                    continue
                try:
                    msg = libxml2.buildURI(href, url)
                    title = anchor.content
                    if title is not None and title[0:4] == 'Re: ':
                        title = title[4:]
                    if title is not None and title[0:6] == '[xml] ':
                        title = title[6:]
                    newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
                except Exception:
                    # keep going with the next message, but say what failed
                    print("Failed to scan message %s: %s" %
                          (href, sys.exc_info()[1]))
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return newmsg
1126
1127
1128#########################################################################
1129#									#
1130#          Main code: open the DB, the API XML and analyze it		#
1131#									#
1132#########################################################################
def analyzeArchives(t = None, force = 0):
    """Index one month of the mail archive and store the associations.

    t and force are passed unchanged to scanXMLDateArchive().  Afterwards
    every (word, id, relevance) triple held in the global wordsDictArchive
    is handed to updateWordArchive(); words whose entry is None are
    ignored.  Progress is reported on standard output.
    """
    global wordsDictArchive

    ret = scanXMLDateArchive(t, force)
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))

    count = 0
    for word, refs in wordsDictArchive.items():
        if refs is None:
            continue
        for ref, relevance in refs.items():
            updateWordArchive(word, ref, relevance)
            count = count + 1

    # These associations come from the mail archive, not from HTML pages
    print("Found %d associations in archive pages" % (count))
1152
def analyzeHTMLTop():
    """Index the HTML pages and store the resulting word associations.

    Runs analyzeHTMLPages(), then walks the global wordsDictHTML and
    passes each word/resource entry to updateWordHTML().  Words whose
    entry is None are ignored.  Progress is reported on standard output.
    """
    global wordsDictHTML

    pages = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), pages))

    found = 0
    for word, refs in wordsDictHTML.items():
        if refs is None:
            continue
        for resource, entry in refs.items():
            # each entry is a (relevance, id, section) triple
            relevance, ident, section = entry
            updateWordHTML(word, resource, section, ident, relevance)
            found = found + 1

    print("Found %d associations in HTML pages" % (found))
1172
def analyzeAPITop():
    """Parse the API description and store its word associations.

    Loads the document named by the global API with loadAPI(), runs
    analyzeAPI() on it and frees it.  Any failure in that phase is
    reported and terminates the process with exit status 1.

    Afterwards every entry of the global wordsDict is handed to
    updateWord(); words whose entry is None are counted as skipped.
    """
    global wordsDict
    global API

    try:
        doc = loadAPI(API)
        ret = analyzeAPI(doc)
        print("Analyzed %d blocs" % (ret))
        doc.freeDoc()
    except Exception:
        print("Failed to parse and analyze %s" % (API))
        # sys.exc_info() replaces the deprecated sys.exc_type/exc_value
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))
    i = 0
    skipped = 0
    for word, refs in wordsDict.items():
        if refs is None:
            skipped = skipped + 1
            continue
        # keys are (module, symbol) pairs; only the symbol is stored
        for (module, symbol), relevance in refs.items():
            updateWord(word, symbol, relevance)
            i = i + 1

    print("Found %d associations, skipped %d words" % (i, skipped))
1200
def usage():
    """Print the command-line synopsis and exit with status 1."""
    synopsis = "Usage index.py [--force] [--archive]  [--archive-year year] [--archive-month month] [--API] [--docs]"
    print(synopsis)
    sys.exit(1)
1204
def _indexArchiveMonth(spec, force):
    """Index the mail archive of the month named by spec, a string of the
    form YEAR-MonthName such as "2003-October".  Failures are reported
    on standard output and do not abort the run."""
    try:
        T = time.strptime(spec, "%Y-%B")
        # Move 10 days into the month, presumably so that the UTC
        # conversion done when the archive URL is built stays in the
        # requested month.
        t = time.mktime(T) + 3600 * 24 * 10
        analyzeArchives(t, force)
    except Exception:
        print("Failed to index month archive:")
        print("%s %s" % sys.exc_info()[:2])

def main():
    """Command-line entry point.

    Opens the database, then processes the arguments from left to right:
    --force only affects the actions that follow it, --archive indexes
    the current month, --archive-year YEAR each month of that year,
    --archive-month YEAR-MonthName a single month, --API the API
    description and --docs the HTML pages.  An unknown option, an option
    lacking its value or an empty command line prints the usage and exits
    with status 1, as does a failure to open the database.
    """
    try:
        openMySQL()
    except Exception:
        print("Failed to open the database")
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    args = sys.argv[1:]
    if not args:
        usage()

    months = ("January", "February", "March", "April", "May",
              "June", "July", "August", "September", "October",
              "November", "December")
    force = 0
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg == '--archive-year' or arg == '--archive-month':
            i = i + 1
            if i >= len(args):
                # the option was given without its value
                usage()
            if arg == '--archive-year':
                for month in months:
                    _indexArchiveMonth("%s-%s" % (args[i], month), force)
            else:
                _indexArchiveMonth(args[i], force)
        elif arg == '--API':
            analyzeAPITop()
        elif arg == '--docs':
            analyzeHTMLTop()
        else:
            usage()
        i = i + 1
1256
# Run the indexer only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
1259