#!/usr/bin/env python3

"""\
List Python source files.

There are three functions to check whether a file is a Python source, listed
here with increasing complexity:

- has_python_ext() checks whether a file name ends in '.py[w]'.
- looks_like_python() checks whether the file is not binary and either has
  the '.py[w]' extension or the first line contains the word 'python'.
- can_be_compiled() checks whether the file can be compiled by compile().

The file also must be of appropriate size - not bigger than a megabyte.

walk_python_files() recursively lists all Python files under the given directories.
"""
__author__ = "Oleg Broytmann, Georg Brandl"

# Public API: the names exported by a star-import of this module.
__all__ = ["has_python_ext", "looks_like_python", "can_be_compiled", "walk_python_files"]


import os
import re
import stat

# Matches any ASCII control byte other than tab, LF, VT, FF and CR
# (0x09-0x0D); looks_like_python() treats a first line containing such a
# byte as a sign that the file is binary.
binary_re = re.compile(br'[\x00-\x08\x0E-\x1F\x7F]')

# Set to True to make print_debug() emit its diagnostics.
debug = False


def print_debug(msg):
    """Print *msg*, but only while the module-level ``debug`` flag is true."""
    if not debug:
        return
    print(msg)


33568973181aa523bbcf7f827b3a2eb2affd96ea67Georg Brandldef _open(fullpath):
34568973181aa523bbcf7f827b3a2eb2affd96ea67Georg Brandl    try:
35568973181aa523bbcf7f827b3a2eb2affd96ea67Georg Brandl        size = os.stat(fullpath).st_size
36b940e113bf90ff71b0ef57414ea2beea9d2a4bc0Guido van Rossum    except OSError as err: # Permission denied - ignore the file
37568973181aa523bbcf7f827b3a2eb2affd96ea67Georg Brandl        print_debug("%s: permission denied: %s" % (fullpath, err))
38568973181aa523bbcf7f827b3a2eb2affd96ea67Georg Brandl        return None
39568973181aa523bbcf7f827b3a2eb2affd96ea67Georg Brandl
40568973181aa523bbcf7f827b3a2eb2affd96ea67Georg Brandl    if size > 1024*1024: # too big
41568973181aa523bbcf7f827b3a2eb2affd96ea67Georg Brandl        print_debug("%s: the file is too big: %d bytes" % (fullpath, size))
42568973181aa523bbcf7f827b3a2eb2affd96ea67Georg Brandl        return None
43568973181aa523bbcf7f827b3a2eb2affd96ea67Georg Brandl
44568973181aa523bbcf7f827b3a2eb2affd96ea67Georg Brandl    try:
4598516a6930acdd39f74c609817cbf19be6c4a7dfVictor Stinner        return open(fullpath, "rb")
46b940e113bf90ff71b0ef57414ea2beea9d2a4bc0Guido van Rossum    except IOError as err: # Access denied, or a special file - ignore it
47568973181aa523bbcf7f827b3a2eb2affd96ea67Georg Brandl        print_debug("%s: access denied: %s" % (fullpath, err))
48568973181aa523bbcf7f827b3a2eb2affd96ea67Georg Brandl        return None
49568973181aa523bbcf7f827b3a2eb2affd96ea67Georg Brandl
def has_python_ext(fullpath):
    """Return True if the name *fullpath* ends in '.py' or '.pyw'."""
    return fullpath.endswith((".py", ".pyw"))

def looks_like_python(fullpath):
    """Cheaply guess whether *fullpath* is a Python source file.

    Only the first line of the file is read.  The answer is False when
    _open() refuses the file or when that line contains a byte matched by
    binary_re.  Otherwise it is True if the name has a '.py'/'.pyw'
    extension or the first line contains b"python".
    """
    handle = _open(fullpath)
    if handle is None:
        return False

    with handle:
        first_line = handle.readline()

    if binary_re.search(first_line) is not None:
        # file appears to be binary
        print_debug("%s: appears to be binary" % fullpath)
        return False

    # Either an ordinary Python file name, or a disguised Python script
    # (e.g. CGI) whose first line mentions python.
    return has_python_ext(fullpath) or b"python" in first_line

def can_be_compiled(fullpath):
    """Return True if the contents of *fullpath* compile as a Python module.

    The file is obtained through _open(), so a file it refuses yields False.
    The contents are only passed to compile() in "exec" mode, never run; any
    Exception raised by compile() is reported via print_debug() and yields
    False.
    """
    source_file = _open(fullpath)
    if source_file is None:
        return False

    with source_file:
        source = source_file.read()

    try:
        compile(source, fullpath, "exec")
    except Exception as err:
        print_debug("%s: cannot compile: %s" % (fullpath, err))
        return False
    else:
        return True


def walk_python_files(paths, is_python=looks_like_python, exclude_dirs=None):
    """\
    Recursively yield all Python source files below the given paths.

    paths: a list of files and/or directories to be checked.
    is_python: a function that takes a file name and checks whether it is a
               Python source file
    exclude_dirs: a list of directory base names that should be excluded in
                  the search
    """
    excluded = [] if exclude_dirs is None else exclude_dirs

    for path in paths:
        print_debug("testing: %s" % path)

        if os.path.isfile(path):
            # A plain file given directly: just apply the predicate.
            if is_python(path):
                yield path
            continue

        if not os.path.isdir(path):
            print_debug("    unknown type")
            continue

        print_debug("    it is a directory")
        for dirpath, dirnames, filenames in os.walk(path):
            # Prune dirnames in place so that os.walk() does not descend
            # into the excluded directories.
            for unwanted in excluded:
                try:
                    dirnames.remove(unwanted)
                except ValueError:
                    pass  # not present at this level
            for filename in filenames:
                fullpath = os.path.join(dirpath, filename)
                print_debug("testing: %s" % fullpath)
                if is_python(fullpath):
                    yield fullpath


if __name__ == "__main__":
    # Two simple examples/tests: list the current directory first with the
    # looks_like_python() check (the default of walk_python_files()) and
    # then with the can_be_compiled() check, separated by a dashed line.
    for position, predicate in enumerate((looks_like_python, can_be_compiled)):
        if position:
            print("----------")
        for fullpath in walk_python_files(['.'], is_python=predicate):
            print(fullpath)
131