1 __doc__ = """External interface to the BeautifulSoup HTML parser.
2 """
3
4 __all__ = ["fromstring", "parse", "convert_tree"]
5
6 from lxml import etree, html
7 from BeautifulSoup import \
8 BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString
9
10
def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a string of HTML data into an Element tree using the
    BeautifulSoup parser.

    Returns the root ``<html>`` Element of the tree.

    A different BeautifulSoup parser can be supplied through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.  Any further keyword arguments are handed on unchanged to the
    internal ``_parse`` helper.
    """
    # All of the work is delegated to the shared parsing helper.
    root = _parse(data, beautifulsoup, makeelement, **bsargs)
    return root
24
def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a file into an ElementTree using the BeautifulSoup parser.

    `file` may be a file-like object (anything with a ``read``
    attribute) or a file name.  A file name is opened here and closed
    again before returning; a file-like object passed in by the caller
    is left open, since the caller owns it.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.  Any further keyword arguments are handed on unchanged to the
    internal ``_parse`` helper.

    Returns an ``etree.ElementTree`` wrapping the parsed root element.
    """
    if hasattr(file, 'read'):
        # Caller-supplied file-like object: use it as is, do not close it.
        root = _parse(file, beautifulsoup, makeelement, **bsargs)
    else:
        # We opened the file ourselves, so we are responsible for closing
        # it, even if parsing raises.  try/finally (rather than ``with``)
        # keeps this working on the old Python versions this module
        # targets.
        # NOTE(review): assumes _parse has finished reading the source by
        # the time it returns the root element -- confirm against _parse.
        source = open(file)
        try:
            root = _parse(source, beautifulsoup, makeelement, **bsargs)
        finally:
            source.close()
    return etree.ElementTree(root)
38
55
56
57
58
59 -def _parse(source, beautifulsoup, makeelement, **bsargs):
73
79
81 SubElement = etree.SubElement
82 et_child = None
83 for child in beautiful_soup_tree:
84 if isinstance(child, Tag):
85 et_child = SubElement(parent, child.name, attrib=dict(
86 [(k, unescape(v)) for (k,v) in child.attrs]))
87 _convert_children(et_child, child, makeelement)
88 elif type(child) is NavigableString:
89 _append_text(parent, et_child, unescape(child))
90 else:
91 if isinstance(child, Comment):
92 parent.append(etree.Comment(child))
93 elif isinstance(child, ProcessingInstruction):
94 parent.append(etree.ProcessingInstruction(
95 *child.split(' ', 1)))
96 else:
97 _append_text(parent, et_child, unescape(child))
98
99 -def _append_text(parent, element, text):
100 if element is None:
101 parent.text = (parent.text or '') + text
102 else:
103 element.tail = (element.tail or '') + text
104
105
106
107
108 from htmlentitydefs import name2codepoint
109 import re
110
# Substitution function for named entity references of the form ``&name;``;
# group 1 captures the name between ``&`` and ``;``.  The pattern is written
# as a raw string so that ``\w`` reaches the regex engine instead of being
# treated as an (invalid) string escape sequence.
handle_entities = re.compile(r"&(\w+);").sub
112
114
115 def unescape_entity(m):
116 try:
117 return unichr(name2codepoint[m.group(1)])
118 except KeyError:
119 return m.group(0)
120 return handle_entities(unescape_entity, string)
121