1#
2# ElementTree
3# $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
4#
5# limited xpath support for element trees
6#
7# history:
8# 2003-05-23 fl   created
9# 2003-05-28 fl   added support for // etc
10# 2003-08-27 fl   fixed parsing of periods in element names
11# 2007-09-10 fl   new selection engine
12# 2007-09-12 fl   fixed parent selector
13# 2007-09-13 fl   added iterfind; changed findall to return a list
14# 2007-11-30 fl   added namespaces support
15# 2009-10-30 fl   added child element value filter
16#
17# Copyright (c) 2003-2009 by Fredrik Lundh.  All rights reserved.
18#
19# fredrik@pythonware.com
20# http://www.pythonware.com
21#
22# --------------------------------------------------------------------
23# The ElementTree toolkit is
24#
25# Copyright (c) 1999-2009 by Fredrik Lundh
26#
27# By obtaining, using, and/or copying this software and/or its
28# associated documentation, you agree that you have read, understood,
29# and will comply with the following terms and conditions:
30#
31# Permission to use, copy, modify, and distribute this software and
32# its associated documentation for any purpose and without fee is
33# hereby granted, provided that the above copyright notice appears in
34# all copies, and that both that copyright notice and this permission
35# notice appear in supporting documentation, and that the name of
36# Secret Labs AB or the author not be used in advertising or publicity
37# pertaining to distribution of the software without specific, written
38# prior permission.
39#
40# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
41# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
42# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
43# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
44# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
45# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
46# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
47# OF THIS SOFTWARE.
48# --------------------------------------------------------------------
49
50# Licensed to PSF under a Contributor Agreement.
51# See http://www.python.org/psf/license for licensing details.
52
53##
54# Implementation module for XPath support.  There's usually no reason
55# to import this module directly; the <b>ElementTree</b> does this for
56# you, if needed.
57##
58
59import re
60
61xpath_tokenizer_re = re.compile(
62    "("
63    "'[^']*'|\"[^\"]*\"|"
64    "::|"
65    "//?|"
66    "\.\.|"
67    "\(\)|"
68    "[/.*:\[\]\(\)@=])|"
69    "((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
70    "\s+"
71    )
72
73def xpath_tokenizer(pattern, namespaces=None):
74    for token in xpath_tokenizer_re.findall(pattern):
75        tag = token[1]
76        if tag and tag[0] != "{" and ":" in tag:
77            try:
78                prefix, uri = tag.split(":", 1)
79                if not namespaces:
80                    raise KeyError
81                yield token[0], "{%s}%s" % (namespaces[prefix], uri)
82            except KeyError:
83                raise SyntaxError("prefix %r not found in prefix map" % prefix)
84        else:
85            yield token
86
87def get_parent_map(context):
88    parent_map = context.parent_map
89    if parent_map is None:
90        context.parent_map = parent_map = {}
91        for p in context.root.iter():
92            for e in p:
93                parent_map[e] = p
94    return parent_map
95
96def prepare_child(next, token):
97    tag = token[1]
98    def select(context, result):
99        for elem in result:
100            for e in elem:
101                if e.tag == tag:
102                    yield e
103    return select
104
105def prepare_star(next, token):
106    def select(context, result):
107        for elem in result:
108            for e in elem:
109                yield e
110    return select
111
112def prepare_self(next, token):
113    def select(context, result):
114        for elem in result:
115            yield elem
116    return select
117
118def prepare_descendant(next, token):
119    token = next()
120    if token[0] == "*":
121        tag = "*"
122    elif not token[0]:
123        tag = token[1]
124    else:
125        raise SyntaxError("invalid descendant")
126    def select(context, result):
127        for elem in result:
128            for e in elem.iter(tag):
129                if e is not elem:
130                    yield e
131    return select
132
133def prepare_parent(next, token):
134    def select(context, result):
135        # FIXME: raise error if .. is applied at toplevel?
136        parent_map = get_parent_map(context)
137        result_map = {}
138        for elem in result:
139            if elem in parent_map:
140                parent = parent_map[elem]
141                if parent not in result_map:
142                    result_map[parent] = None
143                    yield parent
144    return select
145
146def prepare_predicate(next, token):
147    # FIXME: replace with real parser!!! refs:
148    # http://effbot.org/zone/simple-iterator-parser.htm
149    # http://javascript.crockford.com/tdop/tdop.html
150    signature = []
151    predicate = []
152    while 1:
153        token = next()
154        if token[0] == "]":
155            break
156        if token[0] and token[0][:1] in "'\"":
157            token = "'", token[0][1:-1]
158        signature.append(token[0] or "-")
159        predicate.append(token[1])
160    signature = "".join(signature)
161    # use signature to determine predicate type
162    if signature == "@-":
163        # [@attribute] predicate
164        key = predicate[1]
165        def select(context, result):
166            for elem in result:
167                if elem.get(key) is not None:
168                    yield elem
169        return select
170    if signature == "@-='":
171        # [@attribute='value']
172        key = predicate[1]
173        value = predicate[-1]
174        def select(context, result):
175            for elem in result:
176                if elem.get(key) == value:
177                    yield elem
178        return select
179    if signature == "-" and not re.match("\d+$", predicate[0]):
180        # [tag]
181        tag = predicate[0]
182        def select(context, result):
183            for elem in result:
184                if elem.find(tag) is not None:
185                    yield elem
186        return select
187    if signature == "-='" and not re.match("\d+$", predicate[0]):
188        # [tag='value']
189        tag = predicate[0]
190        value = predicate[-1]
191        def select(context, result):
192            for elem in result:
193                for e in elem.findall(tag):
194                    if "".join(e.itertext()) == value:
195                        yield elem
196                        break
197        return select
198    if signature == "-" or signature == "-()" or signature == "-()-":
199        # [index] or [last()] or [last()-index]
200        if signature == "-":
201            index = int(predicate[0]) - 1
202        else:
203            if predicate[0] != "last":
204                raise SyntaxError("unsupported function")
205            if signature == "-()-":
206                try:
207                    index = int(predicate[2]) - 1
208                except ValueError:
209                    raise SyntaxError("unsupported expression")
210            else:
211                index = -1
212        def select(context, result):
213            parent_map = get_parent_map(context)
214            for elem in result:
215                try:
216                    parent = parent_map[elem]
217                    # FIXME: what if the selector is "*" ?
218                    elems = list(parent.findall(elem.tag))
219                    if elems[index] is elem:
220                        yield elem
221                except (IndexError, KeyError):
222                    pass
223        return select
224    raise SyntaxError("invalid predicate")
225
226ops = {
227    "": prepare_child,
228    "*": prepare_star,
229    ".": prepare_self,
230    "..": prepare_parent,
231    "//": prepare_descendant,
232    "[": prepare_predicate,
233    }
234
235_cache = {}
236
237class _SelectorContext:
238    parent_map = None
239    def __init__(self, root):
240        self.root = root
241
242# --------------------------------------------------------------------
243
244##
245# Generate all matching objects.
246
247def iterfind(elem, path, namespaces=None):
248    # compile selector pattern
249    if path[-1:] == "/":
250        path = path + "*" # implicit all (FIXME: keep this?)
251    try:
252        selector = _cache[path]
253    except KeyError:
254        if len(_cache) > 100:
255            _cache.clear()
256        if path[:1] == "/":
257            raise SyntaxError("cannot use absolute path on element")
258        next = iter(xpath_tokenizer(path, namespaces)).next
259        token = next()
260        selector = []
261        while 1:
262            try:
263                selector.append(ops[token[0]](next, token))
264            except StopIteration:
265                raise SyntaxError("invalid path")
266            try:
267                token = next()
268                if token[0] == "/":
269                    token = next()
270            except StopIteration:
271                break
272        _cache[path] = selector
273    # execute selector pattern
274    result = [elem]
275    context = _SelectorContext(elem)
276    for select in selector:
277        result = select(context, result)
278    return result
279
280##
281# Find first matching object.
282
283def find(elem, path, namespaces=None):
284    try:
285        return iterfind(elem, path, namespaces).next()
286    except StopIteration:
287        return None
288
289##
290# Find all matching objects.
291
292def findall(elem, path, namespaces=None):
293    return list(iterfind(elem, path, namespaces))
294
295##
296# Find text for first matching object.
297
298def findtext(elem, path, default=None, namespaces=None):
299    try:
300        elem = iterfind(elem, path, namespaces).next()
301        return elem.text or ""
302    except StopIteration:
303        return default
304