1 """
2 An interface to html5lib that mimics the lxml.html interface.
3 """
4 import sys
5 import string
6
7 from html5lib import HTMLParser as _HTMLParser
8 from html5lib.treebuilders.etree_lxml import TreeBuilder
9 from lxml import etree
10 from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
11
12
13 try:
14 _strings = basestring
15 except NameError:
16 _strings = (bytes, str)
17 try:
18 from urllib2 import urlopen
19 except ImportError:
20 from urllib.request import urlopen
21 try:
22 from urlparse import urlparse
23 except ImportError:
24 from urllib.parse import urlparse
25
26
28 """An html5lib HTML parser with lxml as tree."""
29
30 - def __init__(self, strict=False, **kwargs):
32
33
34 try:
35 from html5lib import XHTMLParser as _XHTMLParser
36 except ImportError:
37 pass
38 else:
40 """An html5lib XHTML Parser with lxml as tree."""
41
42 - def __init__(self, strict=False, **kwargs):
44
45 xhtml_parser = XHTMLParser()
46
47
53
54
def document_fromstring(html, guess_charset=None, parser=None):
    """
    Parse a string holding a whole document and return its root element.

    `html` must be a byte string or a Unicode string; any other type
    raises ``TypeError``.  `parser` defaults to the module-level
    `html_parser`.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    options = {}
    if guess_charset is None and isinstance(html, bytes):
        # Byte input and no explicit preference: default to guessing.
        guess_charset = True
    if guess_charset is not None:
        # `useChardet` is forwarded only once a value has been decided,
        # either by the caller or by the byte-string default above.
        options['useChardet'] = guess_charset
    return parser.parse(html, **options).getroot()
77
78
def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=None, parser=None):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.  If no_leading_text is true,
    then it will be an error if there is leading text, and it will always be
    a list of only elements.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.  Unlike whole-document parsing, byte input
    does not switch guessing on by default here.

    Raises ``TypeError`` if `html` is not a string and
    ``etree.ParserError`` if `no_leading_text` is true and the fragment
    starts with non-whitespace text.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    options = {}
    if guess_charset is None and isinstance(html, bytes):
        # Byte input and no explicit preference: guessing stays off.
        guess_charset = False
    if guess_charset is not None:
        options['useChardet'] = guess_charset

    children = parser.parseFragment(html, 'div', **options)
    if children and isinstance(children[0], _strings):
        if no_leading_text:
            if children[0].strip():
                raise etree.ParserError('There is leading text: %r' %
                                        children[0])
            # Whitespace-only leading text is dropped so that the result
            # contains elements only.
            del children[0]
    return children
111
112
def fragment_fromstring(html, create_parent=False,
                        guess_charset=None, parser=None):
    """Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If 'create_parent' is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In
    this case, leading or trailing text is allowed.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.

    Raises ``TypeError`` if `html` is not a string and
    ``etree.ParserError`` if, without `create_parent`, the input does not
    reduce to exactly one element surrounded only by whitespace.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    # Leading text is only acceptable when a wrapper element will hold it.
    accept_leading_text = bool(create_parent)

    elements = fragments_fromstring(
        html, guess_charset=guess_charset, parser=parser,
        no_leading_text=not accept_leading_text)

    if create_parent:
        # Any true non-string value (e.g. True) selects the default tag.
        if not isinstance(create_parent, _strings):
            create_parent = 'div'
        new_root = Element(create_parent)
        if elements:
            if isinstance(elements[0], _strings):
                # Leading text becomes the text of the wrapper element.
                new_root.text = elements[0]
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        raise etree.ParserError('Multiple elements found')
    result = elements[0]
    if result.tail and result.tail.strip():
        raise etree.ParserError('Element followed by text: %r' % result.tail)
    # Whitespace-only trailing text is discarded.
    result.tail = None
    return result
155
156
def fromstring(html, guess_charset=None, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.  The input is always parsed as a whole
    document first; the result is then narrowed down:

    * input starting with ``<html`` or ``<!doctype``, or producing a
      non-empty head, returns the document root;
    * a body holding exactly one element with no surrounding text returns
      that element;
    * otherwise the body itself is returned, retagged as ``div`` when it
      contains block-level content and ``span`` when it does not.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.

    Raises ``TypeError`` if `html` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # Only the first 50 characters are inspected for a document marker.
    start = html[:50]
    if isinstance(start, bytes):
        # Decode so that the text comparisons below work on byte input;
        # 'replace' keeps non-ASCII bytes from raising.
        start = start.decode('ascii', 'replace')

    start = start.lstrip().lower()
    if start.startswith(('<html', '<!doctype')):
        return doc

    head = _find_tag(doc, 'head')

    # A head with children means the input was a full document.
    if len(head):
        return doc

    body = _find_tag(doc, 'body')

    # A body with exactly one element and no text around it: return that
    # element directly.
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        return body[0]

    # Otherwise reuse the body as a generic container for the content,
    # renamed according to whether it holds block-level content.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
209
210
def parse(filename_url_or_file, guess_charset=None, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).

    A file or URL connection opened by this function is closed before it
    returns; a file-like object supplied by the caller is left open.
    """
    if parser is None:
        parser = html_parser

    # Tracks whether `fp` belongs to this function and must be closed here.
    opened_here = False
    if not isinstance(filename_url_or_file, _strings):
        fp = filename_url_or_file
        if guess_charset is None:
            guess_charset = False
    elif _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
        opened_here = True
        if guess_charset is None:
            guess_charset = True
    else:
        fp = open(filename_url_or_file, 'rb')
        opened_here = True
        if guess_charset is None:
            guess_charset = True

    options = {}
    # `useChardet` is forwarded only when guessing is switched on; a false
    # value is never passed to the parser.
    if guess_charset:
        options['useChardet'] = guess_charset
    try:
        return parser.parse(fp, **options)
    finally:
        if opened_here:
            fp.close()
245
246
248 scheme = urlparse(str)[0]
249 if not scheme:
250 return False
251 elif (sys.platform == 'win32' and
252 scheme in string.ascii_letters
253 and len(scheme) == 1):
254
255 return False
256 else:
257 return True
258
259
# Shared default parser instance, used by the parsing functions in this
# module whenever the caller passes ``parser=None``.
html_parser = HTMLParser()
261