1 """
2 An interface to html5lib that mimics the lxml.html interface.
3 """
4
5 from html5lib import HTMLParser as _HTMLParser
6 from html5lib.treebuilders.etree_lxml import TreeBuilder
7
8 from lxml import etree
9 from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE, Element
10
11
12 try:
13 _strings = basestring
14 except NameError:
15 _strings = (bytes, str)
16 try:
17 from urllib2 import urlopen
18 except ImportError:
19 from urllib.request import urlopen
20 try:
21 from urlparse import urlparse
22 except ImportError:
23 from urllib.parse import urlparse
24
26 """An html5lib HTML parser with lxml as tree."""
27
28 - def __init__(self, strict=False, **kwargs):
30
31
32 try:
33 from html5lib import XHTMLParser as _XHTMLParser
34 except ImportError:
35 pass
36 else:
38 """An html5lib XHTML Parser with lxml as tree."""
39
40 - def __init__(self, strict=False, **kwargs):
42
43 xhtml_parser = XHTMLParser()
44
45
51
52
62
63
66 """Parses several HTML elements, returning a list of elements.
67
68 The first item in the list may be a string. If no_leading_text is true,
69 then it will be an error if there is leading text, and it will always be
70 a list of only elements.
71
72 If `guess_charset` is `True` and the text was not unicode but a
73 bytestring, the `chardet` library will perform charset guessing on the
74 string.
75 """
76 if not isinstance(html, _strings):
77 raise TypeError('string required')
78
79 if parser is None:
80 parser = html_parser
81
82 children = parser.parseFragment(html, 'div', useChardet=guess_charset)
83 if children and isinstance(children[0], _strings):
84 if no_leading_text:
85 if children[0].strip():
86 raise etree.ParserError('There is leading text: %r' %
87 children[0])
88 del children[0]
89 return children
90
91
94 """Parses a single HTML element; it is an error if there is more than
95 one element, or if anything but whitespace precedes or follows the
96 element.
97
98 If create_parent is true (or is a tag name) then a parent node
99 will be created to encapsulate the HTML in a single element. In
100 this case, leading or trailing text is allowed.
101 """
102 if not isinstance(html, _strings):
103 raise TypeError('string required')
104
105 accept_leading_text = bool(create_parent)
106
107 elements = fragments_fromstring(
108 html, guess_charset=guess_charset, parser=parser,
109 no_leading_text=not accept_leading_text)
110
111 if create_parent:
112 if not isinstance(create_parent, _strings):
113 create_parent = 'div'
114 new_root = Element(create_parent)
115 if elements:
116 if isinstance(elements[0], _strings):
117 new_root.text = elements[0]
118 del elements[0]
119 new_root.extend(elements)
120 return new_root
121
122 if not elements:
123 raise etree.ParserError('No elements found')
124 if len(elements) > 1:
125 raise etree.ParserError('Multiple elements found')
126 result = elements[0]
127 if result.tail and result.tail.strip():
128 raise etree.ParserError('Element followed by text: %r' % result.tail)
129 result.tail = None
130 return result
131
132
def fromstring(html, guess_charset=True, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    :param html: the markup to parse; must be an instance of ``_strings``
        (text or byte string).
    :param guess_charset: forwarded unchanged to ``document_fromstring``.
    :param parser: forwarded unchanged to ``document_fromstring``.
    :returns: the parsed document when the input looks like a full document
        (starts with ``<html`` / ``<!doctype`` or produced a non-empty
        head), the single body child when the body holds exactly one
        element and no surrounding non-whitespace text, and otherwise the
        body element itself, retagged as ``div`` or ``span``.
    :raises TypeError: if ``html`` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # Only the first 50 characters are inspected to decide whether the
    # input was a complete document.
    start = html[:50]
    if isinstance(start, bytes):
        # A byte-string prefix cannot be compared against text prefixes on
        # Python 3 (bytes.startswith(str) raises TypeError).  The markers we
        # look for are pure ASCII, so a lossy ASCII decode is sufficient.
        start = start.decode('ascii', 'replace')
    start = start.lstrip().lower()
    if start.startswith(('<html', '<!doctype')):
        return doc

    head = _find_tag(doc, 'head')

    # A head with children means the input carried document-level markup.
    if len(head):
        return doc

    body = _find_tag(doc, 'body')

    # Exactly one child and nothing but whitespace around it: the input was
    # most likely a single element, so hand back just that element.
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        return body[0]

    # Otherwise the body holds everything that was passed in.  Reuse it as
    # the container, but rename it so it no longer claims to be <body>.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
173
174
def parse(filename_url_or_file, guess_charset=True, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    :param filename_url_or_file: either a string or any other object.
        A string is opened with ``urlopen`` when ``_looks_like_url``
        accepts it, and as a local file in binary mode otherwise.  Any
        non-string object is handed to the parser as-is.
    :param guess_charset: passed to the parser as ``useChardet``.
    :param parser: parser to use; defaults to the module-level
        ``html_parser``.
    :returns: whatever ``parser.parse`` returns for the stream.
    """
    if parser is None:
        parser = html_parser

    if not isinstance(filename_url_or_file, _strings):
        # The caller owns this object; parse it and leave closing to them.
        return parser.parse(filename_url_or_file, useChardet=guess_charset)

    if _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
    else:
        fp = open(filename_url_or_file, 'rb')
    try:
        return parser.parse(fp, useChardet=guess_charset)
    finally:
        # This function opened the stream, so it must release it, even
        # when parsing raises.
        fp.close()
189
191 scheme = urlparse(str)[0]
192 return scheme != ''
193
194 html_parser = HTMLParser()
195