1 __doc__ = """External interface to the BeautifulSoup HTML parser.
2 """
3
4 __all__ = ["parse", "convert_tree"]
5
6 from lxml import etree, html
7 from BeautifulSoup import \
8 BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString
9
10
11 -def parse(file, beautifulsoup=None, makeelement=None):
25
34
35
36
37
43
45 SubElement = etree.SubElement
46 et_child = None
47 for child in beautiful_soup_tree:
48 if isinstance(child, Tag):
49 et_child = SubElement(parent, child.name, attrib=dict(
50 [(k, unescape(v)) for (k,v) in child.attrs]))
51 _convert_children(et_child, child, makeelement)
52 elif type(child) is NavigableString:
53 _append_text(parent, et_child, unescape(unicode(child)))
54 else:
55 if isinstance(child, Comment):
56 parent.append(etree.Comment(child.string))
57 elif isinstance(child, ProcessingInstruction):
58 parent.append(etree.ProcessingInstruction(
59 *child.string.split(' ', 1)))
60 else:
61 _append_text(parent, et_child, unescape(unicode(child)))
62
def _append_text(parent, element, text):
    """Append *text* to ``parent.text`` or to ``element.tail``.

    When *element* is None the text is added to ``parent.text``;
    otherwise it is added to ``element.tail``.  In both cases the text is
    concatenated onto whatever is already stored, and a missing value
    (``None`` or an empty string) is treated as ``''``.

    The function mutates its arguments in place and returns None.
    """
    if element is None:
        # No element to attach a tail to: extend the parent's own text.
        parent.text = (parent.text or '') + text
    else:
        # Otherwise the text goes into the tail of the given element.
        element.tail = (element.tail or '') + text
68
69
70
71
72 import htmlentitydefs, re
73
# Bound ``sub`` method of the pattern that matches named entity references
# of the form ``&name;``; group 1 captures the bare entity name.
handle_entities = re.compile(r"&(\w+);").sub

try:
    name2codepoint = htmlentitydefs.name2codepoint
except AttributeError:
    # htmlentitydefs has no ``name2codepoint`` here, so build the same
    # name -> code point mapping from ``entitydefs``.
    name2codepoint = {}
    for name, entity in htmlentitydefs.entitydefs.items():
        if len(entity) == 1:
            # The replacement text is a single character.
            name2codepoint[name] = ord(entity)
        else:
            # Drop the first two and the last character and parse the rest
            # as a decimal number (presumably a "&#NNN;" reference).
            name2codepoint[name] = int(entity[2:-1])
86
88
89 def unescape_entity(m):
90 try:
91 return unichr(name2codepoint[m.group(1)])
92 except KeyError:
93 return m.group(0)
94 return handle_entities(unescape_entity, string)
95