| # |
| # ElementTree |
| # $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $ |
| # |
| # limited xpath support for element trees |
| # |
| # history: |
| # 2003-05-23 fl created |
| # 2003-05-28 fl added support for // etc |
| # 2003-08-27 fl fixed parsing of periods in element names |
| # 2007-09-10 fl new selection engine |
| # 2007-09-12 fl fixed parent selector |
| # 2007-09-13 fl added iterfind; changed findall to return a list |
| # 2007-11-30 fl added namespaces support |
| # 2009-10-30 fl added child element value filter |
| # |
| # Copyright (c) 2003-2009 by Fredrik Lundh. All rights reserved. |
| # |
| # [email protected] |
| # http://www.pythonware.com |
| # |
| # -------------------------------------------------------------------- |
| # The ElementTree toolkit is |
| # |
| # Copyright (c) 1999-2009 by Fredrik Lundh |
| # |
| # By obtaining, using, and/or copying this software and/or its |
| # associated documentation, you agree that you have read, understood, |
| # and will comply with the following terms and conditions: |
| # |
| # Permission to use, copy, modify, and distribute this software and |
| # its associated documentation for any purpose and without fee is |
| # hereby granted, provided that the above copyright notice appears in |
| # all copies, and that both that copyright notice and this permission |
| # notice appear in supporting documentation, and that the name of |
| # Secret Labs AB or the author not be used in advertising or publicity |
| # pertaining to distribution of the software without specific, written |
| # prior permission. |
| # |
| # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD |
| # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- |
| # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR |
| # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY |
| # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, |
| # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS |
| # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE |
| # OF THIS SOFTWARE. |
| # -------------------------------------------------------------------- |
| |
| # Licensed to PSF under a Contributor Agreement. |
| # See http://www.python.org/psf/license for licensing details. |
| |
| ## |
| # Implementation module for XPath support. There's usually no reason |
| # to import this module directly; the <b>ElementTree</b> does this for |
| # you, if needed. |
| ## |
| |
| import re |
| |
| xpath_tokenizer_re = re.compile( |
| r"(" |
| r"'[^']*'|\"[^\"]*\"|" |
| r"::|" |
| r"//?|" |
| r"\.\.|" |
| r"\(\)|" |
| r"[/.*:\[\]\(\)@=])|" |
| r"((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|" |
| r"\s+" |
| ) |
| |
| def xpath_tokenizer(pattern, namespaces=None): |
| default_namespace = namespaces.get('') if namespaces else None |
| parsing_attribute = False |
| for token in xpath_tokenizer_re.findall(pattern): |
| ttype, tag = token |
| if tag and tag[0] != "{": |
| if ":" in tag: |
| prefix, uri = tag.split(":", 1) |
| try: |
| if not namespaces: |
| raise KeyError |
| yield ttype, "{%s}%s" % (namespaces[prefix], uri) |
| except KeyError: |
| raise SyntaxError("prefix %r not found in prefix map" % prefix) from None |
| elif default_namespace and not parsing_attribute: |
| yield ttype, "{%s}%s" % (default_namespace, tag) |
| else: |
| yield token |
| parsing_attribute = False |
| else: |
| yield token |
| parsing_attribute = ttype == '@' |
| |
| |
| def get_parent_map(context): |
| parent_map = context.parent_map |
| if parent_map is None: |
| context.parent_map = parent_map = {} |
| for p in context.root.iter(): |
| for e in p: |
| parent_map[e] = p |
| return parent_map |
| |
| |
| def _is_wildcard_tag(tag): |
| return tag[:3] == '{*}' or tag[-2:] == '}*' |
| |
| |
| def _prepare_tag(tag): |
| _isinstance, _str = isinstance, str |
| if tag == '{*}*': |
| # Same as '*', but no comments or processing instructions. |
| # It can be a surprise that '*' includes those, but there is no |
| # justification for '{*}*' doing the same. |
| def select(context, result): |
| for elem in result: |
| if _isinstance(elem.tag, _str): |
| yield elem |
| elif tag == '{}*': |
| # Any tag that is not in a namespace. |
| def select(context, result): |
| for elem in result: |
| el_tag = elem.tag |
| if _isinstance(el_tag, _str) and el_tag[0] != '{': |
| yield elem |
| elif tag[:3] == '{*}': |
| # The tag in any (or no) namespace. |
| suffix = tag[2:] # '}name' |
| no_ns = slice(-len(suffix), None) |
| tag = tag[3:] |
| def select(context, result): |
| for elem in result: |
| el_tag = elem.tag |
| if el_tag == tag or _isinstance(el_tag, _str) and el_tag[no_ns] == suffix: |
| yield elem |
| elif tag[-2:] == '}*': |
| # Any tag in the given namespace. |
| ns = tag[:-1] |
| ns_only = slice(None, len(ns)) |
| def select(context, result): |
| for elem in result: |
| el_tag = elem.tag |
| if _isinstance(el_tag, _str) and el_tag[ns_only] == ns: |
| yield elem |
| else: |
| raise RuntimeError(f"internal parser error, got {tag}") |
| return select |
| |
| |
| def prepare_child(next, token): |
| tag = token[1] |
| if _is_wildcard_tag(tag): |
| select_tag = _prepare_tag(tag) |
| def select(context, result): |
| def select_child(result): |
| for elem in result: |
| yield from elem |
| return select_tag(context, select_child(result)) |
| else: |
| if tag[:2] == '{}': |
| tag = tag[2:] # '{}tag' == 'tag' |
| def select(context, result): |
| for elem in result: |
| for e in elem: |
| if e.tag == tag: |
| yield e |
| return select |
| |
| def prepare_star(next, token): |
| def select(context, result): |
| for elem in result: |
| yield from elem |
| return select |
| |
| def prepare_self(next, token): |
| def select(context, result): |
| yield from result |
| return select |
| |
| def prepare_descendant(next, token): |
| try: |
| token = next() |
| except StopIteration: |
| return |
| if token[0] == "*": |
| tag = "*" |
| elif not token[0]: |
| tag = token[1] |
| else: |
| raise SyntaxError("invalid descendant") |
| |
| if _is_wildcard_tag(tag): |
| select_tag = _prepare_tag(tag) |
| def select(context, result): |
| def select_child(result): |
| for elem in result: |
| for e in elem.iter(): |
| if e is not elem: |
| yield e |
| return select_tag(context, select_child(result)) |
| else: |
| if tag[:2] == '{}': |
| tag = tag[2:] # '{}tag' == 'tag' |
| def select(context, result): |
| for elem in result: |
| for e in elem.iter(tag): |
| if e is not elem: |
| yield e |
| return select |
| |
| def prepare_parent(next, token): |
| def select(context, result): |
| # FIXME: raise error if .. is applied at toplevel? |
| parent_map = get_parent_map(context) |
| result_map = {} |
| for elem in result: |
| if elem in parent_map: |
| parent = parent_map[elem] |
| if parent not in result_map: |
| result_map[parent] = None |
| yield parent |
| return select |
| |
| def prepare_predicate(next, token): |
| # FIXME: replace with real parser!!! refs: |
| # http://effbot.org/zone/simple-iterator-parser.htm |
| # http://javascript.crockford.com/tdop/tdop.html |
| signature = [] |
| predicate = [] |
| while 1: |
| try: |
| token = next() |
| except StopIteration: |
| return |
| if token[0] == "]": |
| break |
| if token == ('', ''): |
| # ignore whitespace |
| continue |
| if token[0] and token[0][:1] in "'\"": |
| token = "'", token[0][1:-1] |
| signature.append(token[0] or "-") |
| predicate.append(token[1]) |
| signature = "".join(signature) |
| # use signature to determine predicate type |
| if signature == "@-": |
| # [@attribute] predicate |
| key = predicate[1] |
| def select(context, result): |
| for elem in result: |
| if elem.get(key) is not None: |
| yield elem |
| return select |
| if signature == "@-='": |
| # [@attribute='value'] |
| key = predicate[1] |
| value = predicate[-1] |
| def select(context, result): |
| for elem in result: |
| if elem.get(key) == value: |
| yield elem |
| return select |
| if signature == "-" and not re.match(r"\-?\d+$", predicate[0]): |
| # [tag] |
| tag = predicate[0] |
| def select(context, result): |
| for elem in result: |
| if elem.find(tag) is not None: |
| yield elem |
| return select |
| if signature == ".='" or (signature == "-='" and not re.match(r"\-?\d+$", predicate[0])): |
| # [.='value'] or [tag='value'] |
| tag = predicate[0] |
| value = predicate[-1] |
| if tag: |
| def select(context, result): |
| for elem in result: |
| for e in elem.findall(tag): |
| if "".join(e.itertext()) == value: |
| yield elem |
| break |
| else: |
| def select(context, result): |
| for elem in result: |
| if "".join(elem.itertext()) == value: |
| yield elem |
| return select |
| if signature == "-" or signature == "-()" or signature == "-()-": |
| # [index] or [last()] or [last()-index] |
| if signature == "-": |
| # [index] |
| index = int(predicate[0]) - 1 |
| if index < 0: |
| raise SyntaxError("XPath position >= 1 expected") |
| else: |
| if predicate[0] != "last": |
| raise SyntaxError("unsupported function") |
| if signature == "-()-": |
| try: |
| index = int(predicate[2]) - 1 |
| except ValueError: |
| raise SyntaxError("unsupported expression") |
| if index > -2: |
| raise SyntaxError("XPath offset from last() must be negative") |
| else: |
| index = -1 |
| def select(context, result): |
| parent_map = get_parent_map(context) |
| for elem in result: |
| try: |
| parent = parent_map[elem] |
| # FIXME: what if the selector is "*" ? |
| elems = list(parent.findall(elem.tag)) |
| if elems[index] is elem: |
| yield elem |
| except (IndexError, KeyError): |
| pass |
| return select |
| raise SyntaxError("invalid predicate") |
| |
| ops = { |
| "": prepare_child, |
| "*": prepare_star, |
| ".": prepare_self, |
| "..": prepare_parent, |
| "//": prepare_descendant, |
| "[": prepare_predicate, |
| } |
| |
| _cache = {} |
| |
| class _SelectorContext: |
| parent_map = None |
| def __init__(self, root): |
| self.root = root |
| |
| # -------------------------------------------------------------------- |
| |
| ## |
| # Generate all matching objects. |
| |
| def iterfind(elem, path, namespaces=None): |
| # compile selector pattern |
| if path[-1:] == "/": |
| path = path + "*" # implicit all (FIXME: keep this?) |
| |
| cache_key = (path,) |
| if namespaces: |
| cache_key += tuple(sorted(namespaces.items())) |
| |
| try: |
| selector = _cache[cache_key] |
| except KeyError: |
| if len(_cache) > 100: |
| _cache.clear() |
| if path[:1] == "/": |
| raise SyntaxError("cannot use absolute path on element") |
| next = iter(xpath_tokenizer(path, namespaces)).__next__ |
| try: |
| token = next() |
| except StopIteration: |
| return |
| selector = [] |
| while 1: |
| try: |
| selector.append(ops[token[0]](next, token)) |
| except StopIteration: |
| raise SyntaxError("invalid path") from None |
| try: |
| token = next() |
| if token[0] == "/": |
| token = next() |
| except StopIteration: |
| break |
| _cache[cache_key] = selector |
| # execute selector pattern |
| result = [elem] |
| context = _SelectorContext(elem) |
| for select in selector: |
| result = select(context, result) |
| return result |
| |
| ## |
| # Find first matching object. |
| |
| def find(elem, path, namespaces=None): |
| return next(iterfind(elem, path, namespaces), None) |
| |
| ## |
| # Find all matching objects. |
| |
| def findall(elem, path, namespaces=None): |
| return list(iterfind(elem, path, namespaces)) |
| |
| ## |
| # Find text for first matching object. |
| |
| def findtext(elem, path, default=None, namespaces=None): |
| try: |
| elem = next(iterfind(elem, path, namespaces)) |
| return elem.text or "" |
| except StopIteration: |
| return default |