1 """
2 An interface to html5lib that mimics the lxml.html interface.
3 """
4
5 import urllib
6
7 from html5lib import HTMLParser as _HTMLParser
8 from html5lib.treebuilders.etree_lxml import TreeBuilder
9
10 from lxml import etree
11 from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE
12
13
14 try:
15 _strings = basestring
16 except NameError:
17 _strings = (bytes, str)
18
19
21 """An html5lib HTML parser with lxml as tree."""
22
23 - def __init__(self, strict=False, **kwargs):
25
26
27 try:
28 from html5lib import XHTMLParser as _XHTMLParser
29 except ImportError:
30 pass
31 else:
33 """An html5lib XHTML Parser with lxml as tree."""
34
35 - def __init__(self, strict=False, **kwargs):
37
38 xhtml_parser = XHTMLParser()
39
40
46
47
57
58
61 """Parses several HTML elements, returning a list of elements.
62
63 The first item in the list may be a string. If no_leading_text is true,
64 then it will be an error if there is leading text, and it will always be
65 a list of only elements.
66
67 If `guess_charset` is `True` and the text was not unicode but a
68 bytestring, the `chardet` library will perform charset guessing on the
69 string.
70 """
71 if not isinstance(html, _strings):
72 raise TypeError('string required')
73
74 if parser is None:
75 parser = html_parser
76
77 children = parser.parseFragment(html, 'div', useChardet=guess_charset)
78 if children and isinstance(children[0], _strings):
79 if no_leading_text:
80 if children[0].strip():
81 raise etree.ParserError('There is leading text: %r' %
82 children[0])
83 del children[0]
84 return children
85
86
89 """Parses a single HTML element; it is an error if there is more than
90 one element, or if anything but whitespace precedes or follows the
91 element.
92
93 If create_parent is true (or is a tag name) then a parent node
94 will be created to encapsulate the HTML in a single element. In
95 this case, leading or trailing text is allowed.
96 """
97 if not isinstance(html, _strings):
98 raise TypeError('string required')
99
100 accept_leading_text = bool(create_parent)
101
102 elements = fragments_fromstring(
103 html, guess_charset=guess_charset, parser=parser,
104 no_leading_text=not accept_leading_text, **kw)
105
106 if create_parent:
107 if not isinstance(create_parent, basestring):
108 create_parent = 'div'
109 new_root = Element(create_parent)
110 if elements:
111 if isinstance(elements[0], basestring):
112 new_root.text = elements[0]
113 del elements[0]
114 new_root.extend(elements)
115 return new_root
116
117 if not elements:
118 raise etree.ParserError('No elements found')
119 if len(elements) > 1:
120 raise etree.ParserError('Multiple elements found')
121 result = elements[0]
122 if result.tail and result.tail.strip():
123 raise etree.ParserError('Element followed by text: %r' % result.tail)
124 result.tail = None
125 return result
126
127
def fromstring(html, guess_charset=True, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    The input is first parsed with ``document_fromstring``.  The result is
    then narrowed down:

    * if the input starts with ``<html`` or ``<!doctype`` (ignoring case
      and leading whitespace), or the parsed ``head`` has children, the
      whole document is returned;
    * if the ``body`` holds exactly one element with only whitespace
      around it, that element is returned;
    * otherwise the ``body`` element itself is returned, renamed to
      ``div`` when it contains block-level tags and to ``span`` when not.

    `html` must be a byte or text string, otherwise ``TypeError`` is
    raised.  `guess_charset` and `parser` are passed through to
    ``document_fromstring``.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # Only the first 50 characters are inspected to decide whether the
    # caller handed us a complete document.
    start = html[:50]
    if isinstance(start, bytes):
        # On Python 3, bytes.startswith() rejects text prefixes.  The
        # markers below are pure ASCII, so a lossy ASCII decode is enough
        # to compare them.
        start = start.decode('ascii', 'replace')
    start = start.lstrip().lower()
    if start.startswith(('<html', '<!doctype')):
        return doc

    head = _find_tag(doc, 'head')

    # A non-empty head means the input carried document-level content.
    if len(head):
        return doc

    body = _find_tag(doc, 'body')

    # Exactly one child with nothing but whitespace before and after it:
    # the input was most likely that single element.
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        return body[0]

    # Several nodes or loose text: reuse the body as a container, but
    # rename it so the result is an ordinary div/span rather than <body>.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
168
169
def parse(filename_url_or_file, guess_charset=True, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    A string argument is opened by this function: with ``urlopen`` when it
    carries a URL scheme, as a local binary file otherwise.  A stream
    opened here is closed again once parsing has finished.  Any non-string
    argument is handed to the parser unchanged and is left open for the
    caller.

    `guess_charset` is forwarded to the parser as ``useChardet``.  When
    `parser` is None the module-level ``html_parser`` is used.
    """
    if parser is None:
        parser = html_parser

    if not isinstance(filename_url_or_file, _strings):
        # Already a stream: the caller owns it, so it is not closed here.
        return parser.parse(filename_url_or_file, useChardet=guess_charset)

    # Imported locally so this works on both Python 3 and Python 2 without
    # touching the module-level imports.
    try:
        from urllib.request import urlopen
        from urllib.parse import urlparse
    except ImportError:  # Python 2
        from urllib import urlopen
        from urlparse import urlparse

    # A one-letter "scheme" is treated as a Windows drive letter
    # (``C:\page.html``), i.e. a local path rather than a URL.
    scheme = urlparse(filename_url_or_file)[0]
    if len(scheme) > 1:
        fp = urlopen(filename_url_or_file)
    else:
        fp = open(filename_url_or_file, 'rb')
    try:
        return parser.parse(fp, useChardet=guess_charset)
    finally:
        fp.close()
182
183
# Shared default parser instance; the module-level functions fall back to it
# whenever they are called with ``parser=None``.
html_parser = HTMLParser()
185