1"""Parse a Python module and describe its classes and methods.
2
3Parse enough of a Python file to recognize imports and class and
4method definitions, and to find out the superclasses of a class.
5
6The interface consists of a single function:
7        readmodule_ex(module [, path])
8where module is the name of a Python module, and path is an optional
9list of directories where the module is to be searched.  If present,
10path is prepended to the system search path sys.path.  The return
11value is a dictionary.  The keys of the dictionary are the names of
12the classes defined in the module (including classes that are defined
13via the from XXX import YYY construct).  The values are class
14instances of the class Class defined here.  One special key/value pair
15is present for packages: the key '__path__' has a list as its value
16which contains the package search path.
17
18A class is described by the class Class in this module.  Instances
19of this class have the following instance variables:
20        module -- the module name
21        name -- the name of the class
22        super -- a list of super classes (Class instances)
23        methods -- a dictionary of methods
24        file -- the file in which the class was defined
25        lineno -- the line in the file on which the class statement occurred
26The dictionary of methods uses the method names as keys and the line
27numbers on which the method was defined as values.
28If the name of a super class is not recognized, the corresponding
29entry in the list of super classes is not a class instance but a
30string giving the name of the super class.  Since import statements
31are recognized and imported modules are scanned as well, this
32shouldn't happen often.
33
34A function is described by the class Function in this module.
35Instances of this class have the following instance variables:
36        module -- the module name
        name -- the name of the function
        file -- the file in which the function was defined
        lineno -- the line in the file on which the def statement occurred
40"""
41
42import sys
43import imp
44import tokenize
45from token import NAME, DEDENT, OP
46from operator import itemgetter
47
# Public names of this module (what "import *" exposes).
__all__ = ["readmodule", "readmodule_ex", "Class", "Function"]

# Maps a full dotted module name to the dictionary that _readmodule()
# built for it, so each module is only parsed once.
_modules = {}                           # cache of modules we've seen
51
52# each Python class is represented by an instance of this class
class Class:
    '''Record describing one Python class found in a scanned module.

    Attributes:
        module  -- name of the module the class belongs to
        name    -- name of the class
        super   -- list of super classes; an empty list when None was given
        methods -- dictionary mapping method names to line numbers
        file    -- file in which the class was defined
        lineno  -- line number of the class statement
    '''
    def __init__(self, module, name, super, file, lineno):
        self.module, self.name = module, name
        # A missing base list becomes a fresh empty list; a list that
        # was passed in is stored as-is (not copied).
        if super is not None:
            self.super = super
        else:
            self.super = []
        self.methods = {}
        self.file, self.lineno = file, lineno

    def _addmethod(self, name, lineno):
        '''Record that method NAME is defined on line LINENO.

        A later definition with the same name replaces the earlier one.
        '''
        methods = self.methods
        methods[name] = lineno
67
class Function:
    '''Record describing one top-level Python function.

    Attributes:
        module -- name of the module the function belongs to
        name   -- name of the function
        file   -- file in which the function was defined
        lineno -- line number of the def statement
    '''
    def __init__(self, module, name, file, lineno):
        self.module, self.name = module, name
        self.file, self.lineno = file, lineno
75
def readmodule(module, path=None):
    '''Backwards compatible interface.

    Scan MODULE exactly like readmodule_ex() does, but return a new
    dictionary holding only the entries whose value is a Class
    instance; every other entry is left out.'''

    if not path:
        path = []
    everything = _readmodule(module, path)
    classes = {}
    for name in everything:
        obj = everything[name]
        if not isinstance(obj, Class):
            continue
        classes[name] = obj
    return classes
87
def readmodule_ex(module, path=None):
    '''Read a module file and return a dictionary describing it.

    Search for MODULE in PATH and sys.path, read and parse the
    module, and return the dictionary built by _readmodule(): one
    entry per class and top-level function found, keyed by name.
    A missing or empty PATH means no extra directories are searched.
    '''
    if not path:
        path = []
    return _readmodule(module, path)
96
def _readmodule(module, path, inpackage=None):
    '''Do the hard work for readmodule[_ex].

    If INPACKAGE is given, it must be the dotted name of the package in
    which we are searching for a submodule, and then PATH must be the
    package search path; otherwise, we are searching for a top-level
    module, and PATH is combined with sys.path.

    Return a dictionary mapping names to Class and Function instances.
    For a package the dictionary also has a '__path__' entry whose
    value is a list holding the package directory.  The dictionary is
    stored in the _modules cache under the full dotted module name, and
    a cached dictionary is returned as-is on later calls.

    Raise ImportError when a dotted name goes through a module that is
    not a package; errors raised by imp.find_module() while locating
    the module are propagated unchanged.
    '''
    # Compute the full module name (prepending inpackage if set)
    if inpackage is not None:
        fullmodule = "%s.%s" % (inpackage, module)
    else:
        fullmodule = module

    # Check in the cache
    if fullmodule in _modules:
        return _modules[fullmodule]

    # Initialize the dictionary for this module's contents
    contents = {}

    # Check if it is a built-in module; we don't do much for these
    if module in sys.builtin_module_names and inpackage is None:
        _modules[module] = contents
        return contents

    # Check for a dotted module name: read the parent package first and
    # then look for the last component on the parent's search path.
    i = module.rfind('.')
    if i >= 0:
        package = module[:i]
        submodule = module[i+1:]
        parent = _readmodule(package, path, inpackage)
        if inpackage is not None:
            package = "%s.%s" % (inpackage, package)
        if '__path__' not in parent:
            # The parent is a plain module (or built-in), not a package;
            # report that as an import failure instead of a KeyError.
            raise ImportError('No package named %s' % package)
        return _readmodule(submodule, parent['__path__'], package)

    # Search the path for the module
    if inpackage is not None:
        f, fname, (_s, _m, ty) = imp.find_module(module, path)
    else:
        f, fname, (_s, _m, ty) = imp.find_module(module, path + sys.path)
    if ty == imp.PKG_DIRECTORY:
        # For a package, the code to scan lives in its __init__ module.
        contents['__path__'] = [fname]
        path = [fname] + path
        f, fname, (_s, _m, ty) = imp.find_module('__init__', [fname])
    # Register the (still empty) dictionary before parsing, so that a
    # recursive request for this same module hits the cache.
    _modules[fullmodule] = contents
    if ty != imp.PY_SOURCE:
        # not Python source, can't do anything with this module
        f.close()
        return contents

    stack = [] # stack of (class, indent) pairs

    g = tokenize.generate_tokens(f.readline)
    try:
        try:
            for tokentype, token, start, _end, _line in g:
                if tokentype == DEDENT:
                    lineno, thisindent = start
                    # close nested classes and defs
                    while stack and stack[-1][1] >= thisindent:
                        del stack[-1]
                elif token == 'def':
                    lineno, thisindent = start
                    # close previous nested classes and defs
                    while stack and stack[-1][1] >= thisindent:
                        del stack[-1]
                    tokentype, meth_name = g.next()[0:2]
                    if tokentype != NAME:
                        continue # Syntax error
                    if stack:
                        cur_class = stack[-1][0]
                        if isinstance(cur_class, Class):
                            # it's a method
                            cur_class._addmethod(meth_name, lineno)
                        # else it's a nested def
                    else:
                        # it's a function
                        contents[meth_name] = Function(fullmodule, meth_name,
                                                       fname, lineno)
                    stack.append((None, thisindent)) # Marker for nested fns
                elif token == 'class':
                    lineno, thisindent = start
                    # close previous nested classes and defs
                    while stack and stack[-1][1] >= thisindent:
                        del stack[-1]
                    tokentype, class_name = g.next()[0:2]
                    if tokentype != NAME:
                        continue # Syntax error
                    # parse what follows the class name
                    token = g.next()[1]
                    inherit = None
                    if token == '(':
                        # there's a list of superclasses
                        inherit = _read_superclasses(g, contents)
                    cur_class = Class(fullmodule, class_name, inherit,
                                      fname, lineno)
                    # only classes outside any class or def are recorded
                    if not stack:
                        contents[class_name] = cur_class
                    stack.append((cur_class, thisindent))
                elif token == 'import' and start[1] == 0:
                    modules = _getnamelist(g)
                    for mod, _mod2 in modules:
                        try:
                            # Recursively read the imported module
                            if inpackage is None:
                                _readmodule(mod, path)
                            else:
                                try:
                                    _readmodule(mod, path, inpackage)
                                except ImportError:
                                    _readmodule(mod, [])
                        except Exception:
                            # If we can't find or parse the imported
                            # module, too bad -- don't die here.
                            # (KeyboardInterrupt and SystemExit are
                            # deliberately not swallowed.)
                            pass
                elif token == 'from' and start[1] == 0:
                    mod, token = _getname(g)
                    if not mod or token != "import":
                        continue
                    names = _getnamelist(g)
                    try:
                        # Recursively read the imported module
                        d = _readmodule(mod, path, inpackage)
                    except Exception:
                        # If we can't find or parse the imported module,
                        # too bad -- don't die here.
                        continue
                    # add any classes that were defined in the imported
                    # module to our name space if they were mentioned in
                    # the list
                    for n, n2 in names:
                        if n in d:
                            contents[n2 or n] = d[n]
                        elif n == '*':
                            # don't add names that start with _
                            for key in d:
                                if key[0] != '_':
                                    contents[key] = d[key]
        except StopIteration:
            # Raised by an explicit g.next() when the token stream ends
            # in the middle of a construct; keep what was collected.
            pass
    finally:
        # Close the source file even when tokenizing fails with an
        # error other than StopIteration.
        f.close()
    return contents

def _read_superclasses(g, contents):
    '''Helper to parse the superclass list of a class statement.

    G is the token generator, positioned just after the opening '('.
    Tokens are consumed up to and including the matching ')'.  Return
    a list with one entry per superclass: the Class object when the
    name is found in CONTENTS or, for a dotted name, in the cached
    dictionary of the module named by its next-to-last component;
    otherwise the name as a string.  StopIteration from G propagates
    to the caller.
    '''
    names = [] # List of superclasses
    level = 1 # parenthesis nesting depth
    base = [] # Tokens making up current superclass
    while True:
        tokentype, token = g.next()[0:2]
        if token in (')', ',') and level == 1:
            n = "".join(base)
            if n in contents:
                # we know this super class
                n = contents[n]
            else:
                c = n.split('.')
                if len(c) > 1:
                    # super class is of the form module.class:
                    # look in module for class
                    m = c[-2]
                    c = c[-1]
                    if m in _modules:
                        d = _modules[m]
                        if c in d:
                            n = d[c]
            names.append(n)
            base = []
        if token == '(':
            level += 1
        elif token == ')':
            level -= 1
            if level == 0:
                break
        elif token == ',' and level == 1:
            pass
        # only use NAME and OP (== dot) tokens for type name
        elif tokentype in (NAME, OP) and level == 1:
            base.append(token)
        # expressions in the base list are not supported
    return names
276
def _getnamelist(g):
    # Helper to read a comma-separated list of dotted names, each with
    # an optional 'as' clause, from token generator g.  Return a list
    # of pairs (name, alias) where alias is the 'as' name, or None if
    # there is no 'as' clause.  Reading stops at the first item that
    # is not a dotted name, or when an item is not followed by a comma.
    pairs = []
    while True:
        name, token = _getname(g)
        if not name:
            return pairs
        alias = None
        if token == 'as':
            alias, token = _getname(g)
        pairs.append((name, alias))
        # skip ahead to the separating comma or to the end of the line
        while not (token == "," or "\n" in token):
            token = g.next()[1]
        if token != ",":
            return pairs
296
def _getname(g):
    # Helper to read one dotted name from token generator g.  Return a
    # pair (name, token): name is the dotted name (a lone '*' counts as
    # a name), or None if the first token cannot start one, and token
    # is the text of the last token read, i.e. the one that ended the
    # name.
    tokentype, token = g.next()[0:2]
    if not (tokentype == NAME or token == '*'):
        return (None, token)
    pieces = [token]
    while True:
        # a name continues only with '.' followed by another NAME
        tokentype, token = g.next()[0:2]
        if token != '.':
            return (".".join(pieces), token)
        tokentype, token = g.next()[0:2]
        if tokentype != NAME:
            return (".".join(pieces), token)
        pieces.append(token)
315
316def _main():
317    # Main program for testing.
318    import os
319    mod = sys.argv[1]
320    if os.path.exists(mod):
321        path = [os.path.dirname(mod)]
322        mod = os.path.basename(mod)
323        if mod.lower().endswith(".py"):
324            mod = mod[:-3]
325    else:
326        path = []
327    dict = readmodule_ex(mod, path)
328    objs = dict.values()
329    objs.sort(lambda a, b: cmp(getattr(a, 'lineno', 0),
330                               getattr(b, 'lineno', 0)))
331    for obj in objs:
332        if isinstance(obj, Class):
333            print "class", obj.name, obj.super, obj.lineno
334            methods = sorted(obj.methods.iteritems(), key=itemgetter(1))
335            for name, lineno in methods:
336                if name != "__path__":
337                    print "  def", name, lineno
338        elif isinstance(obj, Function):
339            print "def", obj.name, obj.lineno
340
# When run as a script, execute the test driver on sys.argv[1].
if __name__ == "__main__":
    _main()
343