1"""Parse a Python module and describe its classes and methods.
2
3Parse enough of a Python file to recognize imports and class and
4method definitions, and to find out the superclasses of a class.
5
6The interface consists of a single function:
7        readmodule_ex(module [, path])
8where module is the name of a Python module, and path is an optional
9list of directories where the module is to be searched.  If present,
10path is prepended to the system search path sys.path.  The return
value is a dictionary.  The keys of the dictionary are the names of
the classes and top-level functions defined in the module (including
names that are brought in via the from XXX import YYY construct).  The
values are instances of the classes Class and Function defined here.
One special key/value pair
15is present for packages: the key '__path__' has a list as its value
16which contains the package search path.
17
18A class is described by the class Class in this module.  Instances
19of this class have the following instance variables:
20        module -- the module name
21        name -- the name of the class
22        super -- a list of super classes (Class instances)
23        methods -- a dictionary of methods
24        file -- the file in which the class was defined
25        lineno -- the line in the file on which the class statement occurred
26The dictionary of methods uses the method names as keys and the line
27numbers on which the method was defined as values.
28If the name of a super class is not recognized, the corresponding
29entry in the list of super classes is not a class instance but a
30string giving the name of the super class.  Since import statements
31are recognized and imported modules are scanned as well, this
32shouldn't happen often.
33
34A function is described by the class Function in this module.
35Instances of this class have the following instance variables:
36        module -- the module name
        name -- the name of the function
        file -- the file in which the function was defined
        lineno -- the line in the file on which the def statement occurred
40"""
41
42import sys
43import imp
44import tokenize
45from token import NAME, DEDENT, OP
46from operator import itemgetter
47
# Names exported by "from <this module> import *".
__all__ = ["readmodule", "readmodule_ex", "Class", "Function"]

# Maps a full dotted module name to the dictionary that _readmodule()
# built for it; consulted first so each module is scanned only once.
_modules = {}                           # cache of modules we've seen
51
52# each Python class is represented by an instance of this class
class Class:
    '''Description of one class statement found while scanning a module.

    Attributes:
        module  -- name of the module the class was found in
        name    -- name of the class
        super   -- list of base classes as passed by the caller
                   (an empty list when None was passed)
        methods -- dictionary mapping method name to line number
        file    -- file in which the class was defined
        lineno  -- line number of the class statement
    '''
    def __init__(self, module, name, super, file, lineno):
        self.module = module
        self.name = name
        # None means "no base list was given"; each instance then gets
        # its own fresh list.
        self.super = [] if super is None else super
        self.methods = dict()
        self.file = file
        self.lineno = lineno

    def _addmethod(self, name, lineno):
        '''Record that method NAME is defined on line LINENO.

        A later definition with the same name replaces the earlier one.
        '''
        self.methods.update({name: lineno})
67
class Function:
    '''Description of one top-level def statement found in a module.

    Attributes:
        module -- name of the module the function was found in
        name   -- name of the function
        file   -- file in which the function was defined
        lineno -- line number of the def statement
    '''
    def __init__(self, module, name, file, lineno):
        self.module, self.name = module, name
        self.file, self.lineno = file, lineno
75
def readmodule(module, path=None):
    '''Backwards compatible interface.

    Scan MODULE the same way readmodule_ex() does, but return a new
    dictionary holding only the entries whose value is a Class
    instance; everything else found by the scan is left out.
    '''
    found = _readmodule(module, path or [])
    return {name: obj for name, obj in found.items()
            if isinstance(obj, Class)}
87
def readmodule_ex(module, path=None):
    '''Read a module file and return a dictionary describing it.

    MODULE is looked up in PATH followed by sys.path, then read and
    parsed.  The result maps each class name to a Class instance and
    each top-level function name to a Function instance; for a
    package it also has a '__path__' entry holding the package
    search path.
    '''
    search_path = path or []
    return _readmodule(module, search_path)
96
def _readmodule(module, path, inpackage=None):
    '''Do the hard work for readmodule[_ex].

    If INPACKAGE is given, it must be the dotted name of the package in
    which we are searching for a submodule, and then PATH must be the
    package search path; otherwise, we are searching for a top-level
    module, and PATH is combined with sys.path.

    Return the dictionary of Class and Function objects (plus a
    '__path__' entry for packages) describing the module.  The same
    dictionary object is stored in the _modules cache and returned
    again on later calls for the same full module name.

    Raise ImportError if the parent of a dotted name is not a package;
    errors raised by imp.find_module() are not caught here either.
    '''
    # Compute the full module name (prepending inpackage if set)
    if inpackage is not None:
        fullmodule = "%s.%s" % (inpackage, module)
    else:
        fullmodule = module

    # Check in the cache
    if fullmodule in _modules:
        return _modules[fullmodule]

    # Initialize the dictionary for this module's contents
    contents = {}

    # Check if it is a built-in module; we don't do much for these
    if module in sys.builtin_module_names and inpackage is None:
        _modules[module] = contents
        return contents

    # Check for a dotted module name: read the parent package first,
    # then look for the last component on the parent's __path__.
    i = module.rfind('.')
    if i >= 0:
        package = module[:i]
        submodule = module[i+1:]
        parent = _readmodule(package, path, inpackage)
        if inpackage is not None:
            package = "%s.%s" % (inpackage, package)
        if '__path__' not in parent:
            raise ImportError('No package named {}'.format(package))
        return _readmodule(submodule, parent['__path__'], package)

    # Search the path for the module
    f = None
    if inpackage is not None:
        f, fname, (_s, _m, ty) = imp.find_module(module, path)
    else:
        f, fname, (_s, _m, ty) = imp.find_module(module, path + sys.path)
    if ty == imp.PKG_DIRECTORY:
        # A package: remember its search path and scan its __init__.
        contents['__path__'] = [fname]
        path = [fname] + path
        f, fname, (_s, _m, ty) = imp.find_module('__init__', [fname])
    # Register in the cache before parsing so that recursive calls for
    # the same module (import cycles) hit the cache check above.
    _modules[fullmodule] = contents
    if ty != imp.PY_SOURCE:
        # not Python source, can't do anything with this module;
        # there may be no open file at all in that case.
        if f is not None:
            f.close()
        return contents

    stack = [] # stack of (class, indent) pairs

    g = tokenize.generate_tokens(f.readline)
    try:
        for tokentype, token, start, _end, _line in g:
            if tokentype == DEDENT:
                lineno, thisindent = start
                # close nested classes and defs
                while stack and stack[-1][1] >= thisindent:
                    del stack[-1]
            elif token == 'def':
                lineno, thisindent = start
                # close previous nested classes and defs
                while stack and stack[-1][1] >= thisindent:
                    del stack[-1]
                tokentype, meth_name, start = next(g)[0:3]
                if tokentype != NAME:
                    continue # Syntax error
                if stack:
                    cur_class = stack[-1][0]
                    if isinstance(cur_class, Class):
                        # it's a method
                        cur_class._addmethod(meth_name, lineno)
                    # else it's a nested def
                else:
                    # it's a function
                    contents[meth_name] = Function(fullmodule, meth_name,
                                                   fname, lineno)
                stack.append((None, thisindent)) # Marker for nested fns
            elif token == 'class':
                lineno, thisindent = start
                # close previous nested classes and defs
                while stack and stack[-1][1] >= thisindent:
                    del stack[-1]
                tokentype, class_name, start = next(g)[0:3]
                if tokentype != NAME:
                    continue # Syntax error
                # parse what follows the class name
                tokentype, token, start = next(g)[0:3]
                inherit = None
                if token == '(':
                    names = [] # List of superclasses
                    # there's a list of superclasses
                    level = 1
                    base_tokens = [] # Tokens making up current superclass
                    while True:
                        tokentype, token, start = next(g)[0:3]
                        if token in (')', ',') and level == 1:
                            n = "".join(base_tokens)
                            if n in contents:
                                # we know this super class
                                n = contents[n]
                            else:
                                c = n.split('.')
                                if len(c) > 1:
                                    # super class is of the form
                                    # module.class: look in module for
                                    # class.  Only the component just
                                    # before the class name is used as
                                    # the key into the module cache.
                                    m = c[-2]
                                    c = c[-1]
                                    if m in _modules:
                                        d = _modules[m]
                                        if c in d:
                                            n = d[c]
                            names.append(n)
                            base_tokens = []
                        if token == '(':
                            level += 1
                        elif token == ')':
                            level -= 1
                            if level == 0:
                                break
                        elif token == ',' and level == 1:
                            pass
                        # only use NAME and OP (== dot) tokens for type name
                        elif tokentype in (NAME, OP) and level == 1:
                            base_tokens.append(token)
                        # expressions in the base list are not supported
                    inherit = names
                cur_class = Class(fullmodule, class_name, inherit,
                                  fname, lineno)
                # only top-level classes are entered in the result
                if not stack:
                    contents[class_name] = cur_class
                stack.append((cur_class, thisindent))
            elif token == 'import' and start[1] == 0:
                modules = _getnamelist(g)
                for mod, _mod2 in modules:
                    try:
                        # Recursively read the imported module
                        if inpackage is None:
                            _readmodule(mod, path)
                        else:
                            # try relative to the package first, then
                            # fall back to a top-level lookup
                            try:
                                _readmodule(mod, path, inpackage)
                            except ImportError:
                                _readmodule(mod, [])
                    except Exception:
                        # If we can't find or parse the imported module,
                        # too bad -- don't die here.  KeyboardInterrupt
                        # and SystemExit are deliberately not swallowed.
                        pass
            elif token == 'from' and start[1] == 0:
                mod, token = _getname(g)
                if not mod or token != "import":
                    continue
                names = _getnamelist(g)
                try:
                    # Recursively read the imported module
                    d = _readmodule(mod, path, inpackage)
                except Exception:
                    # If we can't find or parse the imported module,
                    # too bad -- don't die here.
                    continue
                # add any classes that were defined in the imported module
                # to our name space if they were mentioned in the list
                for n, n2 in names:
                    if n in d:
                        contents[n2 or n] = d[n]
                    elif n == '*':
                        # don't add names that start with _
                        for public in d:
                            if public[0] != '_':
                                contents[public] = d[public]
    except StopIteration:
        # one of the explicit next(g) calls ran off the end of the file
        pass
    finally:
        # close the source file even if tokenizing raised an error
        f.close()

    return contents
278
def _getnamelist(g):
    '''Helper to get a comma-separated list of dotted names plus 'as' clauses.

    G is the token generator, positioned just after 'import'.  Return a
    list of pairs (name, name2) where name2 is the 'as' name, or None
    if there is no 'as' clause.  Parsing stops at the first item that
    does not start with a name (or '*'), or at the end of the line.
    '''
    names = []
    while True:
        name, token = _getname(g)
        if not name:
            break
        if token == 'as':
            name2, token = _getname(g)
        else:
            name2 = None
        names.append((name, name2))
        # Skip anything we do not understand up to the next comma or
        # the end of the line.
        try:
            while token != "," and "\n" not in token:
                token = next(g)[1]
        except StopIteration:
            # The token stream ended before a newline was seen; keep
            # the names collected so far instead of losing them.
            break
        if token != ",":
            break
    return names
298
def _getname(g):
    '''Helper to get a dotted name from the token generator G.

    Return a pair (name, token) where name is the dotted name, or None
    if the first token is neither a NAME nor '*', and token is the
    last token read, i.e. the one that ended the name.
    '''
    parts = []
    # next(g) rather than g.next(): the builtin works for generators on
    # Python 2.6+ as well as on Python 3.
    tokentype, token = next(g)[0:2]
    if tokentype != NAME and token != '*':
        return (None, token)
    parts.append(token)
    while True:
        tokentype, token = next(g)[0:2]
        if token != '.':
            break
        # a dot must be followed by another name component
        tokentype, token = next(g)[0:2]
        if tokentype != NAME:
            break
        parts.append(token)
    return (".".join(parts), token)
317
def _main():
    '''Main program for testing.

    Takes one command-line argument, either a module name or the path
    of an existing file, and prints the classes (with their methods)
    and top-level functions found in it, ordered by line number.
    '''
    import os
    mod = sys.argv[1]
    if os.path.exists(mod):
        # A file path was given: search its directory and strip ".py".
        path = [os.path.dirname(mod)]
        mod = os.path.basename(mod)
        if mod.lower().endswith(".py"):
            mod = mod[:-3]
    else:
        path = []
    tree = readmodule_ex(mod, path)
    # Entries without a lineno attribute (such as the '__path__' list)
    # sort first.  A key function is used instead of a cmp function so
    # this also runs where list.sort() no longer accepts one.
    objs = sorted(tree.values(), key=lambda obj: getattr(obj, 'lineno', 0))
    for obj in objs:
        if isinstance(obj, Class):
            # Single pre-formatted strings print identically whether
            # print is a statement or a function.
            print("class %s %s %s" % (obj.name, obj.super, obj.lineno))
            methods = sorted(obj.methods.items(), key=itemgetter(1))
            for name, lineno in methods:
                if name != "__path__":
                    print("  def %s %s" % (name, lineno))
        elif isinstance(obj, Function):
            print("def %s %s" % (obj.name, obj.lineno))
342
# Run the test driver only when this file is executed as a script.
if __name__ == "__main__":
    _main()
345