1 """External interface to the BeautifulSoup HTML parser.
2 """
3
4 __all__ = ["fromstring", "parse", "convert_tree"]
5
6 import re
7 from lxml import etree, html
8
9 try:
10 from bs4 import (
11 BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
12 Declaration, Doctype)
13 _DECLARATION_OR_DOCTYPE = (Declaration, Doctype)
14 except ImportError:
15 from BeautifulSoup import (
16 BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
17 Declaration)
18 _DECLARATION_OR_DOCTYPE = Declaration
19
20
def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
    """Build an Element tree from a string of HTML data, using the
    BeautifulSoup parser.

    The return value is the root ``<html>`` Element of the tree.

    A different BeautifulSoup parser can be supplied through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  When these are left as None,
    the standard ``BeautifulSoup`` class and the default factory of
    `lxml.html` are used.  Any remaining keyword arguments are forwarded
    to the BeautifulSoup parser.
    """
    root = _parse(data, beautifulsoup, makeelement, **bsargs)
    return root
34
35
def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a file into an ElementTree using the BeautifulSoup parser.

    `file` may be an object with a ``read`` attribute, which is handed
    to the parser as is and left open for the caller to manage.  Any
    other value is passed to the built-in ``open()``; a file opened
    here is closed again before this function returns.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.  Additional keyword arguments are forwarded to the parser.

    Returns an ``etree.ElementTree`` wrapping the parsed root element.
    """
    if hasattr(file, 'read'):
        root = _parse(file, beautifulsoup, makeelement, **bsargs)
    else:
        # We opened this file ourselves, so make sure it gets closed,
        # even when parsing raises.
        with open(file) as stream:
            root = _parse(stream, beautifulsoup, makeelement, **bsargs)
    return etree.ElementTree(root)
49
52 """Convert a BeautifulSoup tree to a list of Element trees.
53
54 Returns a list instead of a single root Element to support
55 HTML-like soup with more than one root element.
56
57 You can pass a different Element factory through the `makeelement`
58 keyword.
59 """
60 root = _convert_tree(beautiful_soup_tree, makeelement)
61 children = root.getchildren()
62 for child in children:
63 root.remove(child)
64 return children
65
66
67
68
def _parse(source, beautifulsoup, makeelement, **bsargs):
    """Feed `source` to BeautifulSoup and return an ``html`` root Element.

    `beautifulsoup` falls back to the imported ``BeautifulSoup`` class
    when None.  `makeelement` is passed on to ``_convert_tree()``.
    """
    parser_class = BeautifulSoup if beautifulsoup is None else beautifulsoup

    # Each default is applied only if the parser class exposes the matching
    # attribute and the caller did not set the option explicitly.
    # NOTE(review): the two attributes presumably tell BeautifulSoup 3 and
    # bs4 apart -- confirm.
    if hasattr(parser_class, "HTML_ENTITIES"):
        bsargs.setdefault('convertEntities', 'html')
    if hasattr(parser_class, "DEFAULT_BUILDER_FEATURES"):
        bsargs.setdefault('features', ['html.parser'])

    soup = parser_class(source, **bsargs)
    root = _convert_tree(soup, makeelement)

    if not (len(root) == 1 and root[0].tag == "html"):
        # No single <html> child to hand out: the converted root itself
        # becomes the <html> element.
        root.tag = "html"
        return root
    return root[0]
85
86
# Bound ``match`` of a case-insensitive pattern for the text of an HTML
# doctype declaration.  It skips leading whitespace and '<' / '!'
# characters, then expects ``DOCTYPE`` followed by ``HTML``, an optional
# ``PUBLIC`` keyword and up to two optional quoted strings.
# Groups 1 and 2 capture those strings *including* their surrounding
# quotes; either group is None when the string is absent.
_parse_doctype_declaration = re.compile(
    r'(?:\s|[<!])*DOCTYPE\s*HTML'
    r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?'
    r'(?:\s+(\'[^\']*\'|"[^"]*"))?',
    re.IGNORECASE).match
95
97 self.name = 'html'
98 self.attrs = []
99 self.contents = contents
100
103
106 if makeelement is None:
107 makeelement = html.html_parser.makeelement
108
109
110
111
112
113
114
115 first_element_idx = last_element_idx = None
116 html_root = declaration = None
117 for i, e in enumerate(beautiful_soup_tree):
118 if isinstance(e, Tag):
119 if first_element_idx is None:
120 first_element_idx = i
121 last_element_idx = i
122 if html_root is None and e.name and e.name.lower() == 'html':
123 html_root = e
124 elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE):
125 declaration = e
126
127
128
129
130
131
132 pre_root = beautiful_soup_tree.contents[:first_element_idx]
133 roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1]
134 post_root = beautiful_soup_tree.contents[last_element_idx+1:]
135
136
137 if html_root is not None:
138
139 i = roots.index(html_root)
140 html_root.contents = roots[:i] + html_root.contents + roots[i+1:]
141 else:
142
143 html_root = _PseudoTag(roots)
144
145 convert_node = _init_node_converters(makeelement)
146
147
148 res_root = convert_node(html_root)
149 prev = res_root
150 for e in reversed(pre_root):
151 converted = convert_node(e)
152 if converted is not None:
153 prev.addprevious(converted)
154 prev = converted
155
156
157 prev = res_root
158 for e in post_root:
159 converted = convert_node(e)
160 if converted is not None:
161 prev.addnext(converted)
162 prev = converted
163
164 if declaration is not None:
165 try:
166
167 doctype_string = declaration.output_ready()
168 except AttributeError:
169 doctype_string = declaration.string
170
171 match = _parse_doctype_declaration(doctype_string)
172 if not match:
173
174
175 pass
176 else:
177 external_id, sys_uri = match.groups()
178 docinfo = res_root.getroottree().docinfo
179
180 docinfo.public_id = external_id and external_id[1:-1]
181 docinfo.system_url = sys_uri and sys_uri[1:-1]
182
183 return res_root
184
187 converters = {}
188 ordered_node_types = []
189
def converter(*types):
    """Return a decorator that registers a handler for each of `types`.

    The handler is stored in ``converters`` under every given type, and
    each type is appended to ``ordered_node_types`` so that registration
    order is kept.  The decorated function is returned unchanged.
    """
    def register(handler):
        for node_type in types:
            converters[node_type] = handler
            ordered_node_types.append(node_type)
        return handler
    return register
197
def find_best_converter(node):
    """Return the handler of the first registered type matching `node`.

    Types are tested with ``isinstance()`` in registration order.
    Returns None if `node` is not an instance of any registered type.
    """
    candidates = (
        converters[node_type]
        for node_type in ordered_node_types
        if isinstance(node, node_type)
    )
    return next(candidates, None)
203
def convert_node(bs_node, parent=None):
    """Convert `bs_node` using the handler registered for its type.

    The exact type is looked up in ``converters`` first.  On a miss, the
    result of ``find_best_converter()`` is cached under that type, even
    when it is None.  Returns the handler's result, or None when there is
    no handler for the node.  convert_tag() inlines a similar lookup for
    the children it iterates over.
    """
    node_type = type(bs_node)
    if node_type in converters:
        handler = converters[node_type]
    else:
        handler = find_best_converter(bs_node)
        converters[node_type] = handler
    return None if handler is None else handler(bs_node, parent)
213
def map_attrs(bs_attrs):
    """Turn BeautifulSoup attributes into a dict of unescaped values.

    `bs_attrs` is either a dict or an iterable of ``(name, value)``
    pairs.  For a dict, values that are lists are joined with single
    spaces first.  Every value is passed through ``unescape()``.
    """
    if not isinstance(bs_attrs, dict):
        return dict((name, unescape(value)) for name, value in bs_attrs)

    attribs = {}
    for name, value in bs_attrs.items():
        flat = " ".join(value) if isinstance(value, list) else value
        attribs[name] = unescape(flat)
    return attribs
224
def append_text(parent, text):
    """Append `text` at the current end of `parent`'s content.

    If `parent` already has children, the text is added to the ``tail``
    of the last child; otherwise it is added to ``parent.text``.  A
    missing (falsy) ``text``/``tail`` is treated as the empty string.
    """
    if len(parent):
        last_child = parent[-1]
        last_child.tail = (last_child.tail or '') + text
    else:
        parent.text = (parent.text or '') + text
230
231
232
@converter(Tag, _PseudoTag)
def convert_tag(bs_node, parent):
    """Convert a tag node together with all of its children.

    With a `parent`, the new element is created as a sub-element of it;
    without one, a stand-alone element is built through `makeelement`.
    Returns the new element.
    """
    bs_attrs = bs_node.attrs
    if parent is None:
        element = makeelement(
            bs_node.name, attrib=map_attrs(bs_attrs) if bs_attrs else {})
    else:
        element = etree.SubElement(
            parent, bs_node.name,
            attrib=map_attrs(bs_attrs) if bs_attrs else None)

    for child in bs_node:
        # Fast path: a handler already known for the child's exact type is
        # called directly.  Unknown types, and types cached as having no
        # handler, go through convert_node(), which does the full lookup.
        handler = converters.get(type(child))
        if handler is None:
            convert_node(child, element)
        else:
            handler(child, element)
    return element
255
@converter(Comment)
def convert_comment(bs_node, parent):
    """Create an ``etree.Comment`` from `bs_node`.

    The comment is appended to `parent` unless `parent` is None, and is
    returned in either case.
    """
    comment = etree.Comment(bs_node)
    if parent is None:
        return comment
    parent.append(comment)
    return comment
262
@converter(ProcessingInstruction)
def convert_pi(bs_node, parent):
    """Create an ``etree.ProcessingInstruction`` from `bs_node`.

    The node text is split at the first space into at most two parts,
    which are passed as positional arguments to the PI factory.  The PI
    is appended to `parent` unless `parent` is None, and is returned.
    """
    # A trailing '?' is dropped before splitting.
    # NOTE(review): presumably left over from an XML-style ``<?target data?>``
    # instruction -- confirm.
    content = bs_node[:-1] if bs_node.endswith('?') else bs_node
    pi = etree.ProcessingInstruction(*content.split(' ', 1))
    if parent is not None:
        parent.append(pi)
    return pi
273
@converter(NavigableString)
def convert_text(bs_node, parent):
    """Append the unescaped text of `bs_node` to `parent`, if there is one.

    Always returns None: text does not become a node of its own.
    """
    if parent is None:
        return None
    append_text(parent, unescape(bs_node))
    return None
279
280 return convert_node
281
282
283
284
285 try:
286 from html.entities import name2codepoint
287 except ImportError:
288 from htmlentitydefs import name2codepoint
289
290
# Bound ``sub`` of a pattern matching ``&name;`` references, where the name
# is a run of word characters captured as group 1.
handle_entities = re.compile(r"&(\w+);").sub
292
293
try:
    unichr
except NameError:
    # No ``unichr`` builtin is available (as on Python 3): use ``chr``.
    unichr = chr
302 if not string:
303 return ''
304
def unescape_entity(m):
    """Return the character for the entity name in group 1 of match `m`.

    Names missing from ``name2codepoint`` are left as they are: the
    whole matched text is returned unchanged.
    """
    name = m.group(1)
    if name in name2codepoint:
        return unichr(name2codepoint[name])
    return m.group(0)
310 return handle_entities(unescape_entity, string)
311