1# (c) 2005 Ian Bicking and contributors; written for Paste (http://pythonpaste.org)
2# Licensed under the MIT license: http://www.opensource.org/licenses/mit-license.php
3"""
4WSGI applications that parse the URL and dispatch to on-disk resources
5"""
6
7import os
8import six
9import sys
10import imp
11import mimetypes
12try:
13    import pkg_resources
14except ImportError:
15    pkg_resources = None
16from paste import request
17from paste import fileapp
18from paste.util import import_string
19from paste import httpexceptions
20from .httpheaders import ETAG
21from paste.util import converters
22
class NoDefault(object):
    """Marker meaning "no value was supplied".

    The class object itself is used as the sentinel and is compared by
    identity (``value is NoDefault``); it is never instantiated here.
    """
25
# Names exported by ``from <this module> import *``.
__all__ = ['URLParser', 'StaticURLParser', 'PkgResourcesParser']
27
class URLParser(object):

    """
    WSGI middleware

    Application dispatching, based on URL.  An instance of `URLParser` is
    an application that loads and delegates to other applications.  It
    looks for files in its directory that match the first part of
    PATH_INFO; these may have an extension, but are not required to have
    one, in which case the available files are searched to find the
    appropriate file.  If it is ambiguous, a 404 is returned and an error
    logged.

    By default there is a constructor for .py files that loads the module,
    and looks for an attribute ``application``, which is a ready
    application object, or an attribute that matches the module name,
    which is a factory for building applications, and is called with no
    arguments.

    URLParser will also look in __init__.py for special overrides.
    These overrides are:

    ``urlparser_hook(environ)``
        This can modify the environment.  Its return value is ignored,
        and it cannot be used to change the response in any way.  You
        *can* use this, for example, to manipulate SCRIPT_NAME/PATH_INFO
        (try to keep them consistent with the original URL -- but
        consuming PATH_INFO and moving that to SCRIPT_NAME is ok).

    ``urlparser_wrap(environ, start_response, app)``:
        After URLParser finds the application, it calls this function
        (if present).  If this function doesn't call
        ``app(environ, start_response)`` then the application won't be
        called at all!  This can be used to allocate resources (with
        ``try:finally:``) or otherwise filter the output of the
        application.

    ``not_found_hook(environ, start_response)``:
        If no file can be found (*in this directory*) to match the
        request, then this WSGI application will be called.  You can
        use this to change the URL and pass the request back to
        URLParser again, or on to some other application.  This
        doesn't catch all ``404 Not Found`` responses, just missing
        files.

    ``application(environ, start_response)``:
        This basically overrides URLParser completely, and the given
        application is used for all requests.  ``urlparser_wrap`` and
        ``urlparser_hook`` are still called, but the filesystem isn't
        searched in any way.
    """

    # Cache of the parsers created by get_parser(), keyed by
    # (directory, base_python_name).  It lives on the class, so it is
    # shared by every instance.
    parsers_by_directory = {}

    # This is lazily initialized (on the first request, see __call__)
    init_module = NoDefault

    # Constructors registered through register_constructor(); every
    # instance starts from a copy of this mapping.
    global_constructors = {}

    def __init__(self, global_conf,
                 directory, base_python_name,
                 index_names=NoDefault,
                 hide_extensions=NoDefault,
                 ignore_extensions=NoDefault,
                 constructors=None,
                 **constructor_conf):
        """
        Create a URLParser object that looks at `directory`.
        `base_python_name` is the package that this directory
        represents, thus any Python modules in this directory will
        be given names under this package.

        `global_conf` is deprecated: a true value triggers a
        ``DeprecationWarning`` and is only consulted for the defaults
        of `index_names`, `hide_extensions` and `ignore_extensions`
        when those are not given.

        `constructors` is a mapping of extension to constructor that
        is layered over the globally registered constructors for this
        instance only.  Every other keyword argument must be named
        ``constructor .ext``; a string value is resolved with
        ``import_string.eval_import``.  Any other keyword raises
        ``ValueError``.
        """
        if global_conf:
            import warnings
            warnings.warn(
                'The global_conf argument to URLParser is deprecated; '
                'either pass in None or {}, or use make_url_parser',
                DeprecationWarning)
        else:
            global_conf = {}
        if os.path.sep != '/':
            directory = directory.replace(os.path.sep, '/')
        self.directory = directory
        self.base_python_name = base_python_name
        # This logic here should be deprecated since it is in
        # make_url_parser
        if index_names is NoDefault:
            index_names = global_conf.get(
                'index_names', ('index', 'Index', 'main', 'Main'))
        self.index_names = converters.aslist(index_names)
        if hide_extensions is NoDefault:
            hide_extensions = global_conf.get(
                'hide_extensions', ('.pyc', '.bak', '.py~', '.pyo'))
        self.hide_extensions = converters.aslist(hide_extensions)
        if ignore_extensions is NoDefault:
            ignore_extensions = global_conf.get(
                'ignore_extensions', ())
        self.ignore_extensions = converters.aslist(ignore_extensions)
        self.constructors = self.global_constructors.copy()
        if constructors:
            self.constructors.update(constructors)
        # @@: Should we also check the global options for constructors?
        for name, value in constructor_conf.items():
            if not name.startswith('constructor '):
                raise ValueError(
                    "Only extra configuration keys allowed are "
                    "'constructor .ext = import_expr'; you gave %r "
                    "(=%r)" % (name, value))
            ext = name[len('constructor '):].strip()
            # six.string_types covers str/unicode on Python 2 and str on
            # Python 3 (the bare name ``unicode`` does not exist there).
            if isinstance(value, six.string_types):
                value = import_string.eval_import(value)
            self.constructors[ext] = value

    def __call__(self, environ, start_response):
        """
        WSGI entry point: find the application matching the next
        PATH_INFO segment and delegate to it, honouring the hooks that
        the directory's ``__init__.py`` may define.
        """
        environ['paste.urlparser.base_python_name'] = self.base_python_name
        if self.init_module is NoDefault:
            self.init_module = self.find_init_module(environ)
        path_info = environ.get('PATH_INFO', '')
        if not path_info:
            return self.add_slash(environ, start_response)
        if (self.init_module
            and getattr(self.init_module, 'urlparser_hook', None)):
            self.init_module.urlparser_hook(environ)
        # find_application() rewrites both keys; remember them so that
        # not_found_hook sees the request as it arrived.
        orig_path_info = environ['PATH_INFO']
        # SCRIPT_NAME may legitimately be absent when it is empty.
        orig_script_name = environ.get('SCRIPT_NAME', '')
        application, filename = self.find_application(environ)
        if not application:
            if (self.init_module
                and getattr(self.init_module, 'not_found_hook', None)
                and environ.get('paste.urlparser.not_found_parser') is not self):
                not_found_hook = self.init_module.not_found_hook
                # Marks this parser so the hook is not re-entered if it
                # hands the request back to us.
                environ['paste.urlparser.not_found_parser'] = self
                environ['PATH_INFO'] = orig_path_info
                environ['SCRIPT_NAME'] = orig_script_name
                return not_found_hook(environ, start_response)
            if filename is None:
                name, rest_of_path = request.path_info_split(environ['PATH_INFO'])
                if not name:
                    name = 'one of %s' % ', '.join(
                        self.index_names or
                        ['(no index_names defined)'])

                return self.not_found(
                    environ, start_response,
                    'Tried to load %s from directory %s'
                    % (name, self.directory))
            else:
                environ['wsgi.errors'].write(
                    'Found resource %s, but could not construct application\n'
                    % filename)
                return self.not_found(
                    environ, start_response,
                    'Tried to load %s from directory %s'
                    % (filename, self.directory))
        if (self.init_module
            and getattr(self.init_module, 'urlparser_wrap', None)):
            return self.init_module.urlparser_wrap(
                environ, start_response, application)
        else:
            return application(environ, start_response)

    def find_application(self, environ):
        """
        Return ``(application, filename)`` for the request.

        ``application`` is None when nothing could be found or built;
        ``filename`` is None when no file matched (or when the
        ``application`` override from ``__init__.py`` was used).  The
        first PATH_INFO segment is moved onto SCRIPT_NAME as a side
        effect.
        """
        script_name = environ.get('SCRIPT_NAME', '')
        if (self.init_module
            and getattr(self.init_module, 'application', None)
            and environ.get('paste.urlparser.init_application') != script_name):
            # Recorded so the override is used only once per SCRIPT_NAME.
            environ['paste.urlparser.init_application'] = script_name
            return self.init_module.application, None
        name, rest_of_path = request.path_info_split(environ['PATH_INFO'])
        environ['PATH_INFO'] = rest_of_path
        if name is not None:
            environ['SCRIPT_NAME'] = script_name + '/' + name
        if not name:
            for index_name in self.index_names:
                filename = self.find_file(environ, index_name)
                if filename:
                    break
            else:
                # None of the index files found
                filename = None
        else:
            filename = self.find_file(environ, name)
        if filename is None:
            return None, filename
        else:
            return self.get_application(environ, filename), filename

    def not_found(self, environ, start_response, debug_message=None):
        """Respond with 404, attaching `debug_message` as a comment."""
        exc = httpexceptions.HTTPNotFound(
            'The resource at %s could not be found'
            % request.construct_url(environ),
            comment=debug_message)
        return exc.wsgi_application(environ, start_response)

    def add_slash(self, environ, start_response):
        """
        This happens when you try to get to a directory
        without a trailing /
        """
        url = request.construct_url(environ, with_query_string=False)
        url += '/'
        if environ.get('QUERY_STRING'):
            url += '?' + environ['QUERY_STRING']
        exc = httpexceptions.HTTPMovedPermanently(
            'The resource has moved to %s - you should be redirected '
            'automatically.' % url,
            headers=[('location', url)])
        return exc.wsgi_application(environ, start_response)

    def find_file(self, environ, base_filename):
        """
        Return the full path of the entry in ``self.directory`` that
        matches `base_filename`, or None.

        An entry matches when its whole name equals `base_filename`, or
        when its name without extension does and the extension is not
        in ``ignore_extensions``.  Entries with an extension in
        ``hide_extensions`` never match.  When several entries match,
        the exact-name match wins; without one the URL is ambiguous,
        which is logged to ``wsgi.errors`` and reported as None.
        """
        possible = []
        exact_match = None
        for filename in os.listdir(self.directory):
            base, ext = os.path.splitext(filename)
            if ext in self.hide_extensions or not base:
                continue
            full_filename = os.path.join(self.directory, filename)
            if filename == base_filename:
                exact_match = full_filename
                possible.append(full_filename)
                continue
            if ext in self.ignore_extensions:
                continue
            if base == base_filename:
                possible.append(full_filename)
        if not possible:
            return None
        if len(possible) > 1:
            # If there is an exact match, this isn't 'ambiguous'
            # per se; it might mean foo.gif and foo.gif.back for
            # instance.  (The exact match is tracked explicitly rather
            # than relying on whatever entry os.listdir() returned last.)
            if exact_match is not None:
                return exact_match
            environ['wsgi.errors'].write(
                'Ambiguous URL: %s; matches files %s\n'
                % (request.construct_url(environ),
                   ', '.join(possible)))
            return None
        return possible[0]

    def get_application(self, environ, filename):
        """
        Build the application for `filename` using the constructor
        registered for its extension (``'dir'`` for directories),
        falling back to the ``'*'`` constructor.  Returns None when no
        constructor applies or the constructor itself returns None.
        """
        if os.path.isdir(filename):
            kind = 'dir'
        else:
            kind = os.path.splitext(filename)[1]
        constructor = self.constructors.get(kind, self.constructors.get('*'))
        if constructor is None:
            return None
        return constructor(self, environ, filename)

    @classmethod
    def register_constructor(cls, extension, constructor):
        """
        Register a function as a constructor.  Registered constructors
        apply to all instances of `URLParser`.

        The extension should have a leading ``.``, or the special
        extensions ``dir`` (for directories) and ``*`` (a catch-all).

        `constructor` must be a callable that takes three arguments:
        the parser, ``environ`` and ``filename``, and returns a WSGI
        application (or None).

        Registering an extension twice fails the assertion below.
        """
        d = cls.global_constructors
        assert extension not in d, (
            "A constructor already exists for the extension %r (%r) "
            "when attemption to register constructor %r"
            % (extension, d[extension], constructor))
        d[extension] = constructor

    def get_parser(self, directory, base_python_name):
        """
        Get a parser for the given directory, or create one if
        necessary.  This way parsers can be cached and reused.

        # @@: settings are inherited from the first caller
        """
        key = (directory, base_python_name)
        try:
            return self.parsers_by_directory[key]
        except KeyError:
            parser = self.__class__(
                {},
                directory, base_python_name,
                index_names=self.index_names,
                hide_extensions=self.hide_extensions,
                ignore_extensions=self.ignore_extensions,
                constructors=self.constructors)
            self.parsers_by_directory[key] = parser
            return parser

    def find_init_module(self, environ):
        """
        Load and return the directory's ``__init__.py`` module, or
        None when the directory has no such file.
        """
        filename = os.path.join(self.directory, '__init__.py')
        if not os.path.exists(filename):
            return None
        return load_module(environ, filename)

    def __repr__(self):
        return '<%s directory=%r; module=%s at %s>' % (
            self.__class__.__name__,
            self.directory,
            self.base_python_name,
            hex(abs(id(self))))
341
def make_directory(parser, environ, filename):
    """
    Constructor for directories: return the parser responsible for
    the sub-directory `filename`.

    The sub-directory's package name is its base name, prefixed by the
    current ``paste.urlparser.base_python_name`` when that is non-empty.
    """
    parent_package = environ['paste.urlparser.base_python_name']
    dir_name = os.path.basename(filename)
    if parent_package:
        package_name = parent_package + "." + dir_name
    else:
        package_name = dir_name
    return parser.get_parser(filename, package_name)
349
# Directories ('dir') are handled by make_directory.
URLParser.register_constructor('dir', make_directory)
351
def make_unknown(parser, environ, filename):
    """
    Catch-all constructor: wrap `filename` in a ``fileapp.FileApp``.
    `parser` and `environ` are accepted for the constructor signature
    but not used.
    """
    file_application = fileapp.FileApp(filename)
    return file_application
354
# '*' is the fallback used when no constructor is registered for a
# file's extension.
URLParser.register_constructor('*', make_unknown)
356
def load_module(environ, filename):
    """
    Load the Python file `filename` as a module and return it.

    The module name is the file's base name without extension, placed
    under ``environ['paste.urlparser.base_python_name']`` when that is
    non-empty.  Problems are reported to ``environ['wsgi.errors']``.
    """
    package = environ['paste.urlparser.base_python_name']
    stem = os.path.splitext(os.path.basename(filename))[0]
    dotted_name = package + '.' + stem if package else stem
    return load_module_from_name(
        environ, filename, dotted_name, environ['wsgi.errors'])
364
def load_module_from_name(environ, filename, module_name, errors):
    """
    Import the file `filename` under the dotted name `module_name`
    and return the module.

    A module already present in ``sys.modules`` is returned as is.
    Otherwise the file's directory gets an ``__init__.py`` if it has
    none, parent packages are loaded first (recursively), and the
    module is loaded with ``imp``.

    `errors` is a file-like object (``wsgi.errors``); when the
    ``__init__.py`` cannot be created a message is written to it and
    None is returned.  `environ` is only passed along to the recursive
    call.
    """
    if module_name in sys.modules:
        return sys.modules[module_name]
    directory = os.path.dirname(filename)
    init_filename = os.path.join(directory, '__init__.py')
    if not os.path.exists(init_filename):
        try:
            f = open(init_filename, 'w')
        except (OSError, IOError) as e:
            errors.write(
                'Cannot write __init__.py file into directory %s (%s)\n'
                % (directory, e))
            return None
        # The context manager closes the file even if the write fails.
        with f:
            f.write('#\n')
    if '.' in module_name:
        parent_name, base_name = module_name.rsplit('.', 1)
        # Called for its side effect: the parent package must be in
        # sys.modules before the child is loaded under a dotted name.
        load_module_from_name(environ, directory, parent_name, errors)
        # Importing the parent may itself have imported this module;
        # reuse it instead of executing the file a second time.
        if module_name in sys.modules:
            return sys.modules[module_name]
    else:
        base_name = module_name
    # NOTE(review): ``imp`` is deprecated in Python 3 and was removed in
    # 3.12; kept here because the file still targets Python 2 via six.
    fp = None
    try:
        fp, pathname, stuff = imp.find_module(base_name, [directory])
        module = imp.load_module(module_name, fp, pathname, stuff)
    finally:
        if fp is not None:
            fp.close()
    return module
398
def make_py(parser, environ, filename):
    """
    Constructor for ``.py`` files: load the module and return the
    WSGI application it provides, or None.

    A true ``application`` attribute is preferred (its
    ``wsgi_application`` attribute, when present, is used instead).
    Otherwise an attribute named like the module is looked up: its
    ``wsgi_application`` is returned if it has one, else it is called
    with no arguments.  If neither exists, a message is written to
    ``environ['wsgi.errors']``.
    """
    module = load_module(environ, filename)
    if not module:
        return None
    app = getattr(module, 'application', None)
    if app:
        return getattr(app, 'wsgi_application', app)
    base_name = module.__name__.split('.')[-1]
    if not hasattr(module, base_name):
        environ['wsgi.errors'].write(
            "Cound not find application or %s in %s\n"
            % (base_name, module))
        return None
    factory = getattr(module, base_name)
    if hasattr(factory, 'wsgi_application'):
        return factory.wsgi_application
    # @@: Old behavior; should probably be deprecated eventually:
    return factory()
417
# Files ending in '.py' are turned into applications by make_py.
URLParser.register_constructor('.py', make_py)
419
class StaticURLParser(object):
    """
    Like ``URLParser`` but only serves static files.

    ``directory``:
      the directory this instance serves
    ``root_directory``:
      the directory no request may escape from; defaults to
      ``directory``
    ``cache_max_age``:
      integer specifies Cache-Control max_age in seconds
    """
    # @@: Should URLParser subclass from this?

    def __init__(self, directory, root_directory=None,
                 cache_max_age=None):
        self.directory = self.normpath(directory)
        self.root_directory = self.normpath(root_directory or directory)
        self.cache_max_age = cache_max_age

    @staticmethod
    def normpath(path):
        """Return `path` as an absolute, case-normalized path."""
        return os.path.normcase(os.path.abspath(path))

    def _is_within_root(self, full):
        """
        Tell whether the normalized path `full` is the root directory
        or lies below it.

        The comparison is done on a path-component boundary, so a
        sibling such as ``/srv/www-private`` is not mistaken for being
        inside ``/srv/www``.
        """
        root = self.root_directory
        if full == root:
            return True
        # rstrip() keeps a filesystem root ('/') from becoming '//'.
        return full.startswith(root.rstrip(os.path.sep) + os.path.sep)

    def __call__(self, environ, start_response):
        """
        WSGI entry point: serve the file named by the next PATH_INFO
        segment (``index.html`` for ``/``), descending into
        sub-directories with a new instance of this class.
        """
        path_info = environ.get('PATH_INFO', '')
        if not path_info:
            return self.add_slash(environ, start_response)
        if path_info == '/':
            # @@: This should obviously be configurable
            filename = 'index.html'
        else:
            filename = request.path_info_pop(environ)
        full = self.normpath(os.path.join(self.directory, filename))
        if not self._is_within_root(full):
            # Out of bounds
            return self.not_found(environ, start_response)
        if not os.path.exists(full):
            return self.not_found(environ, start_response)
        if os.path.isdir(full):
            # @@: Cache?
            child = self.__class__(full, root_directory=self.root_directory,
                                   cache_max_age=self.cache_max_age)
            return child(environ, start_response)
        if environ.get('PATH_INFO') and environ.get('PATH_INFO') != '/':
            return self.error_extra_path(environ, start_response)
        if_none_match = environ.get('HTTP_IF_NONE_MATCH')
        if if_none_match:
            mytime = os.stat(full).st_mtime
            if str(mytime) == if_none_match:
                headers = []
                ## FIXME: probably should be
                ## ETAG.update(headers, '"%s"' % mytime)
                ETAG.update(headers, mytime)
                start_response('304 Not Modified', headers)
                # WSGI bodies are byte strings (b'' is '' on Python 2).
                return [b'']  # empty body

        fa = self.make_app(full)
        if self.cache_max_age:
            fa.cache_control(max_age=self.cache_max_age)
        return fa(environ, start_response)

    def make_app(self, filename):
        """Return the WSGI application that serves `filename`."""
        return fileapp.FileApp(filename)

    def add_slash(self, environ, start_response):
        """
        This happens when you try to get to a directory
        without a trailing /
        """
        url = request.construct_url(environ, with_query_string=False)
        url += '/'
        if environ.get('QUERY_STRING'):
            url += '?' + environ['QUERY_STRING']
        exc = httpexceptions.HTTPMovedPermanently(
            'The resource has moved to %s - you should be redirected '
            'automatically.' % url,
            headers=[('location', url)])
        return exc.wsgi_application(environ, start_response)

    def not_found(self, environ, start_response, debug_message=None):
        """Respond with 404; the comment records where we looked."""
        exc = httpexceptions.HTTPNotFound(
            'The resource at %s could not be found'
            % request.construct_url(environ),
            comment='SCRIPT_NAME=%r; PATH_INFO=%r; looking in %r; debug: %s'
            % (environ.get('SCRIPT_NAME'), environ.get('PATH_INFO'),
               self.directory, debug_message or '(none)'))
        return exc.wsgi_application(environ, start_response)

    def error_extra_path(self, environ, start_response):
        """Respond with 404 when path segments follow a file name."""
        exc = httpexceptions.HTTPNotFound(
            'The trailing path %r is not allowed' % environ['PATH_INFO'])
        return exc.wsgi_application(environ, start_response)

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.directory)
511
def make_static(global_conf, document_root, cache_max_age=None):
    """
    Build a ``StaticURLParser`` serving the directory `document_root`.

    `cache_max_age`, when given, is converted with ``int()`` and used
    as the Cache-Control max_age in seconds.  `global_conf` is accepted
    but not used.
    """
    max_age = None if cache_max_age is None else int(cache_max_age)
    return StaticURLParser(document_root, cache_max_age=max_age)
523
class PkgResourcesParser(StaticURLParser):
    """
    Static file parser that serves resources out of an egg through
    ``pkg_resources`` instead of reading a directory.

    ``egg_or_spec``:
      a distribution object, or a string that is resolved with
      ``pkg_resources.get_distribution``
    ``resource_name``:
      the resource path inside the egg that this instance serves
    ``manager``:
      resource manager handed to ``get_resource_stream``; a new
      ``pkg_resources.ResourceManager`` by default
    ``root_resource``:
      the resource path no request may escape from; defaults to
      ``resource_name``
    """

    def __init__(self, egg_or_spec, resource_name, manager=None, root_resource=None):
        # The base class initializer is deliberately not called: it
        # only sets up filesystem attributes that are unused here.
        if pkg_resources is None:
            raise NotImplementedError("This class requires pkg_resources.")
        if isinstance(egg_or_spec, (six.binary_type, six.text_type)):
            self.egg = pkg_resources.get_distribution(egg_or_spec)
        else:
            self.egg = egg_or_spec
        self.resource_name = resource_name
        if manager is None:
            manager = pkg_resources.ResourceManager()
        self.manager = manager
        if root_resource is None:
            root_resource = resource_name
        self.root_resource = os.path.normpath(root_resource)

    def __repr__(self):
        return '<%s for %s:%r>' % (
            self.__class__.__name__,
            self.egg.project_name,
            self.resource_name)

    def _is_within_root(self, resource):
        """
        Tell whether the normalized `resource` is ``root_resource``
        or lies below it (compared on a path-component boundary).
        """
        root = self.root_resource
        if root is None:
            return True
        sep = os.path.sep
        # `resource` went through normcase(); do the same to the root
        # so the comparison also works on case-insensitive platforms.
        root = os.path.normcase(root)
        if root == os.path.curdir:
            # An empty resource_name normalizes to '.', the top of the
            # egg: everything is inside except paths that climb out.
            return not (resource == os.path.pardir
                        or resource.startswith(os.path.pardir + sep))
        if resource == root:
            return True
        return resource.startswith(root.rstrip(sep) + sep)

    def __call__(self, environ, start_response):
        """
        WSGI entry point: serve the resource named by the next
        PATH_INFO segment (``index.html`` for ``/``), descending into
        resource directories with a new instance of this class.
        """
        path_info = environ.get('PATH_INFO', '')
        if not path_info:
            return self.add_slash(environ, start_response)
        if path_info == '/':
            # @@: This should obviously be configurable
            filename = 'index.html'
        else:
            filename = request.path_info_pop(environ)
        if self.resource_name:
            raw_resource = self.resource_name + '/' + filename
        else:
            # Serving from the top of the egg: avoid building an
            # absolute-looking '/name' resource path.
            raw_resource = filename
        resource = os.path.normcase(os.path.normpath(raw_resource))
        if not self._is_within_root(resource):
            # Out of bounds
            return self.not_found(environ, start_response)
        if not self.egg.has_resource(resource):
            return self.not_found(environ, start_response)
        if self.egg.resource_isdir(resource):
            # @@: Cache?
            child_root = self.root_resource or self.resource_name
            child = self.__class__(self.egg, resource, self.manager,
                                   root_resource=child_root)
            return child(environ, start_response)
        if environ.get('PATH_INFO') and environ.get('PATH_INFO') != '/':
            return self.error_extra_path(environ, start_response)

        content_type, _encoding = mimetypes.guess_type(resource)
        if not content_type:
            content_type = 'application/octet-stream'
        # @@: I don't know what to do with the encoding.
        try:
            stream = self.egg.get_resource_stream(self.manager, resource)
        except (IOError, OSError) as e:
            exc = httpexceptions.HTTPForbidden(
                'You are not permitted to view this file (%s)' % e)
            return exc.wsgi_application(environ, start_response)
        start_response('200 OK',
                       [('content-type', content_type)])
        return fileapp._FileIter(stream)

    def not_found(self, environ, start_response, debug_message=None):
        """Respond with 404; the comment records where we looked."""
        exc = httpexceptions.HTTPNotFound(
            'The resource at %s could not be found'
            % request.construct_url(environ),
            comment='SCRIPT_NAME=%r; PATH_INFO=%r; looking in egg:%s#%r; debug: %s'
            % (environ.get('SCRIPT_NAME'), environ.get('PATH_INFO'),
               self.egg, self.resource_name, debug_message or '(none)'))
        return exc.wsgi_application(environ, start_response)
594
def make_pkg_resources(global_conf, egg, resource_name=''):
    """
    A static file parser that loads data from an egg using
    ``pkg_resources``.  Takes a configuration value ``egg``, which is
    an egg spec, and a base ``resource_name`` (default empty string)
    which is the path in the egg that this starts at.

    Raises ``NotImplementedError`` when ``pkg_resources`` could not be
    imported.  `global_conf` is accepted but not used.
    """
    if pkg_resources is not None:
        return PkgResourcesParser(egg, resource_name)
    raise NotImplementedError("This function requires pkg_resources.")
605
def make_url_parser(global_conf, directory, base_python_name,
                    index_names=None, hide_extensions=None,
                    ignore_extensions=None,
                    **constructor_conf):
    """
    Create a URLParser application that looks in ``directory``, which
    should be the directory for the Python package named in
    ``base_python_name``.  ``index_names`` are used when viewing the
    directory (like ``'index'`` for ``'index.html'``).
    ``hide_extensions`` are extensions that are not viewable (like
    ``'.pyc'``) and ``ignore_extensions`` are viewable but only if an
    explicit extension is given.

    Options left as None are read from ``global_conf`` (which may be
    None or empty) and otherwise fall back to the same defaults that
    ``URLParser`` itself uses.  Extra keyword arguments must be of the
    form ``constructor .ext`` and are passed on to ``URLParser``.
    """
    if global_conf is None:
        global_conf = {}

    if index_names is None:
        index_names = global_conf.get(
            'index_names', ('index', 'Index', 'main', 'Main'))
    index_names = converters.aslist(index_names)

    if hide_extensions is None:
        # Extensions are compared against os.path.splitext() results,
        # so each entry needs its leading dot.
        hide_extensions = global_conf.get(
            'hide_extensions', ('.pyc', '.bak', '.py~', '.pyo'))
    hide_extensions = converters.aslist(hide_extensions)

    if ignore_extensions is None:
        ignore_extensions = global_conf.get(
            'ignore_extensions', ())
    ignore_extensions = converters.aslist(ignore_extensions)
    # There's no real way to set constructors currently...

    return URLParser({}, directory, base_python_name,
                     index_names=index_names,
                     hide_extensions=hide_extensions,
                     ignore_extensions=ignore_extensions,
                     **constructor_conf)
640