1 """
2 An interface to html5lib that mimics the lxml.html interface.
3 """
4
5 import sys
6 import string
7
8 from html5lib import HTMLParser as _HTMLParser
9 from html5lib.treebuilders.etree_lxml import TreeBuilder
10
11 from lxml import etree
12 from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE, Element
13
14
15 try:
16 _strings = basestring
17 except NameError:
18 _strings = (bytes, str)
19 try:
20 from urllib2 import urlopen
21 except ImportError:
22 from urllib.request import urlopen
23 try:
24 from urlparse import urlparse
25 except ImportError:
26 from urllib.parse import urlparse
27
29 """An html5lib HTML parser with lxml as tree."""
30
31 - def __init__(self, strict=False, **kwargs):
33
34
35 try:
36 from html5lib import XHTMLParser as _XHTMLParser
37 except ImportError:
38 pass
39 else:
41 """An html5lib XHTML Parser with lxml as tree."""
42
43 - def __init__(self, strict=False, **kwargs):
45
46 xhtml_parser = XHTMLParser()
47
48
54
55
65
66
69 """Parses several HTML elements, returning a list of elements.
70
71 The first item in the list may be a string. If no_leading_text is true,
72 then it will be an error if there is leading text, and it will always be
73 a list of only elements.
74
75 If `guess_charset` is `True` and the text was not unicode but a
76 bytestring, the `chardet` library will perform charset guessing on the
77 string.
78 """
79 if not isinstance(html, _strings):
80 raise TypeError('string required')
81
82 if parser is None:
83 parser = html_parser
84
85 children = parser.parseFragment(html, 'div', useChardet=guess_charset)
86 if children and isinstance(children[0], _strings):
87 if no_leading_text:
88 if children[0].strip():
89 raise etree.ParserError('There is leading text: %r' %
90 children[0])
91 del children[0]
92 return children
93
94
97 """Parses a single HTML element; it is an error if there is more than
98 one element, or if anything but whitespace precedes or follows the
99 element.
100
101 If create_parent is true (or is a tag name) then a parent node
102 will be created to encapsulate the HTML in a single element. In
103 this case, leading or trailing text is allowed.
104 """
105 if not isinstance(html, _strings):
106 raise TypeError('string required')
107
108 accept_leading_text = bool(create_parent)
109
110 elements = fragments_fromstring(
111 html, guess_charset=guess_charset, parser=parser,
112 no_leading_text=not accept_leading_text)
113
114 if create_parent:
115 if not isinstance(create_parent, _strings):
116 create_parent = 'div'
117 new_root = Element(create_parent)
118 if elements:
119 if isinstance(elements[0], _strings):
120 new_root.text = elements[0]
121 del elements[0]
122 new_root.extend(elements)
123 return new_root
124
125 if not elements:
126 raise etree.ParserError('No elements found')
127 if len(elements) > 1:
128 raise etree.ParserError('Multiple elements found')
129 result = elements[0]
130 if result.tail and result.tail.strip():
131 raise etree.ParserError('Element followed by text: %r' % result.tail)
132 result.tail = None
133 return result
134
135
def fromstring(html, guess_charset=True, parser=None):
    """Parse *html* and return the most specific element that represents it.

    The input is always parsed as a full document via
    ``document_fromstring`` (``guess_charset`` and ``parser`` are passed
    straight through); the result is then narrowed down as far as possible:

    * If the input visibly starts with ``<html`` or ``<!doctype`` (ignoring
      leading whitespace and case), or the parsed ``head`` has children,
      the parsed document is returned as-is.
    * If the parsed ``body`` holds exactly one child element with no
      non-whitespace text before or after it, that child is returned.
    * Otherwise the ``body`` element itself is returned, retagged as
      ``div`` when it contains a block-level tag and as ``span`` when it
      does not.

    Raises ``TypeError`` if *html* is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    document = document_fromstring(
        html, guess_charset=guess_charset, parser=parser)

    # Only the first 50 characters of the input are inspected to decide
    # whether the caller handed us a complete document.
    prefix = html[:50]
    if isinstance(prefix, bytes):
        # Byte input: decode leniently so the text comparison below works;
        # the markers we look for are plain ASCII.
        prefix = prefix.decode('ascii', 'replace')
    prefix = prefix.lstrip().lower()
    if prefix.startswith(('<html', '<!doctype')):
        return document

    # A head with children also means this is a full document.
    if len(_find_tag(document, 'head')):
        return document

    body = _find_tag(document, 'body')

    # A body wrapping exactly one element, with nothing but whitespace
    # around it, is unwrapped to that element.
    if len(body) == 1:
        only_child = body[0]
        leading = body.text
        trailing = only_child.tail
        if (not (leading and leading.strip())
                and not (trailing and trailing.strip())):
            return only_child

    # Otherwise reuse the body element as a neutral container for the
    # parsed content, renaming it so it no longer claims to be <body>.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body
183
184
def parse(filename_url_or_file, guess_charset=True, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    ``filename_url_or_file`` may be:

    * a non-string object, which is handed to the parser unchanged and is
      never closed here (the caller keeps ownership of it);
    * a string that ``_looks_like_url`` accepts, which is opened with
      ``urlopen``;
    * any other string, which is opened as a local file in binary mode.

    Handles opened by this function are closed before it returns, whether
    or not parsing succeeds.

    ``guess_charset`` is forwarded to the parser as ``useChardet``.
    ``parser`` defaults to the module-level ``html_parser``.
    """
    if parser is None:
        parser = html_parser

    if not isinstance(filename_url_or_file, _strings):
        # Caller-supplied stream: parse it, but leave closing to the caller.
        return parser.parse(filename_url_or_file, useChardet=guess_charset)

    if _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
    else:
        fp = open(filename_url_or_file, 'rb')

    # We opened the handle, so we must release it.  try/finally is used
    # rather than ``with`` because the response object returned by
    # Python 2's urllib2.urlopen is not a context manager.
    try:
        return parser.parse(fp, useChardet=guess_charset)
    finally:
        fp.close()
199
200
202 scheme = urlparse(str)[0]
203 if not scheme:
204 return False
205 elif (sys.platform == 'win32' and
206 scheme in string.ascii_letters
207 and len(scheme) == 1):
208
209 return False
210 else:
211 return True
212
213
214 html_parser = HTMLParser()
215