Source code for lxml.html.html5parser

"""
An interface to html5lib that mimics the lxml.html interface.
"""
import sys
import string

from html5lib import HTMLParser as _HTMLParser
from html5lib.treebuilders.etree_lxml import TreeBuilder
from lxml import etree
from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag

# python3 compatibility
try:
    _strings = basestring
except NameError:
    _strings = (bytes, str)
try:
    from urllib2 import urlopen
except ImportError:
    from urllib.request import urlopen
try:
    from urlparse import urlparse
except ImportError:
    from urllib.parse import urlparse


[docs]class HTMLParser(_HTMLParser):
    """An html5lib HTML parser with lxml as tree."""

    def __init__(self, strict=False, **kwargs):
        _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)


try:
    from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
    pass
else:
    class XHTMLParser(_XHTMLParser):
        """An html5lib XHTML Parser with lxml as tree."""

        def __init__(self, strict=False, **kwargs):
            _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)

    xhtml_parser = XHTMLParser()


[docs]def _find_tag(tree, tag):
    elem = tree.find(tag)
    if elem is not None:
        return elem
    return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))


[docs]def document_fromstring(html, guess_charset=None, parser=None):
    """
    Parse a whole document into a string.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    options = {}
    if guess_charset is None and isinstance(html, bytes):
        # html5lib does not accept useChardet as an argument, if it
        # detected the html argument would produce unicode objects.
        guess_charset = True
    if guess_charset is not None:
        options['useChardet'] = guess_charset
    return parser.parse(html, **options).getroot()


[docs]def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=None, parser=None):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.  If no_leading_text is true,
    then it will be an error if there is leading text, and it will always be
    a list of only elements.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    options = {}
    if guess_charset is None and isinstance(html, bytes):
        # html5lib does not accept useChardet as an argument, if it
        # detected the html argument would produce unicode objects.
        guess_charset = False
    if guess_charset is not None:
        options['useChardet'] = guess_charset
    children = parser.parseFragment(html, 'div', **options)
    if children and isinstance(children[0], _strings):
        if no_leading_text:
            if children[0].strip():
                raise etree.ParserError('There is leading text: %r' %
                                        children[0])
            del children[0]
    return children


[docs]def fragment_fromstring(html, create_parent=False,
                        guess_charset=None, parser=None):
    """Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If 'create_parent' is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In
    this case, leading or trailing text is allowed.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    accept_leading_text = bool(create_parent)

    elements = fragments_fromstring(
        html, guess_charset=guess_charset, parser=parser,
        no_leading_text=not accept_leading_text)

    if create_parent:
        if not isinstance(create_parent, _strings):
            create_parent = 'div'
        new_root = Element(create_parent)
        if elements:
            if isinstance(elements[0], _strings):
                new_root.text = elements[0]
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        raise etree.ParserError('Multiple elements found')
    result = elements[0]
    if result.tail and result.tail.strip():
        raise etree.ParserError('Element followed by text: %r' % result.tail)
    result.tail = None
    return result


[docs]def fromstring(html, guess_charset=None, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    'base_url' will set the document's base_url attribute (and the tree's
    docinfo.URL)

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # document starts with doctype or <html>, full document!
    start = html[:50]
    if isinstance(start, bytes):
        # Allow text comparison in python3.
        # Decode as ascii, that also covers latin-1 and utf-8 for the
        # characters we need.
        start = start.decode('ascii', 'replace')

    start = start.lstrip().lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        return doc

    head = _find_tag(doc, 'head')

    # if the head is not empty we have a full document
    if len(head):
        return doc

    body = _find_tag(doc, 'body')

    # The body has just one element, so it was probably a single
    # element passed in
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body


[docs]def parse(filename_url_or_file, guess_charset=None, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).
    """
    if parser is None:
        parser = html_parser
    if not isinstance(filename_url_or_file, _strings):
        fp = filename_url_or_file
        if guess_charset is None:
            # assume that file-like objects return Unicode more often than bytes
            guess_charset = False
    elif _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
        if guess_charset is None:
            # assume that URLs return bytes
            guess_charset = True
    else:
        fp = open(filename_url_or_file, 'rb')
        if guess_charset is None:
            guess_charset = True

    options = {}
    # html5lib does not accept useChardet as an argument, if it
    # detected the html argument would produce unicode objects.
    if guess_charset:
        options['useChardet'] = guess_charset
    return parser.parse(fp, **options)


[docs]def _looks_like_url(str):
    scheme = urlparse(str)[0]
    if not scheme:
        return False
    elif (sys.platform == 'win32' and
            scheme in string.ascii_letters
            and len(scheme) == 1):
        # looks like a 'normal' absolute path
        return False
    else:
        return True


html_parser = HTMLParser()