Package lxml :: Package html :: Module ElementSoup
[hide private]
[frames] | [no frames]

Source Code for Module lxml.html.ElementSoup

 1  __doc__ = """External interface to the BeautifulSoup HTML parser. 
 2  """ 
 3   
 4  __all__ = ["parse", "convert_tree"] 
 5   
 6  from lxml import etree, html 
 7  from BeautifulSoup import \ 
 8       BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString 
 9   
10   
def parse(file, beautifulsoup=None, makeelement=None):
    """Parse an HTML document with BeautifulSoup and return its root element.

    file -- an object with a ``read`` attribute, or a filename that is
        opened for reading by this function.
    beautifulsoup -- callable that turns the file into a soup tree;
        defaults to ``BeautifulSoup``.
    makeelement -- factory used to create the root element; defaults
        to ``html.html_parser.makeelement``.

    Returns the single ``html`` child of the converted root if there is
    exactly one child and it is tagged ``html``; otherwise the converted
    root itself, retagged as ``html``.
    """
    if beautifulsoup is None:
        beautifulsoup = BeautifulSoup
    if makeelement is None:
        makeelement = html.html_parser.makeelement

    opened_here = not hasattr(file, 'read')
    if opened_here:
        file = open(file)
    try:
        tree = beautifulsoup(file)
    finally:
        # Close only the handle opened above; a file object supplied by
        # the caller stays under the caller's control.
        # NOTE(review): assumes the soup callable has consumed the file
        # by the time it returns -- confirm for custom callables.
        if opened_here:
            file.close()

    root = _convert_tree(tree, makeelement)
    # from ET: wrap the document in a html root element, if necessary
    if len(root) == 1 and root[0].tag == "html":
        return root[0]
    root.tag = "html"
    return root
25
def convert_tree(beautiful_soup_tree, makeelement=None):
    """Convert a BeautifulSoup tree into a list of detached elements.

    The tree is first converted under a temporary root created with
    ``makeelement`` (``html.html_parser.makeelement`` when not given).
    The root's children are then removed from it and returned as a list.
    """
    factory = makeelement
    if factory is None:
        factory = html.html_parser.makeelement
    holder = _convert_tree(beautiful_soup_tree, factory)
    # Snapshot the children first so removal does not disturb iteration.
    detached = list(holder)
    for node in detached:
        holder.remove(node)
    return detached
34 35 36 # helpers 37
def _convert_tree(beautiful_soup_tree, makeelement):
    """Create an element for the given soup node and convert its children.

    The element is built with ``makeelement`` from the node's ``name``
    and a dict copy of its ``attrs``; the node's children are then
    converted into it.  Returns the new element.
    """
    tag = beautiful_soup_tree.name
    attributes = dict(beautiful_soup_tree.attrs)
    converted = makeelement(tag, attrib=attributes)
    _convert_children(converted, beautiful_soup_tree, makeelement)
    return converted
43
def _convert_children(parent, beautiful_soup_tree, makeelement):
    """Append converted copies of a soup node's children to ``parent``.

    parent -- the element that receives the converted children.
    beautiful_soup_tree -- the soup node whose children are iterated.
    makeelement -- not used here directly; passed through unchanged to
        the recursive calls.

    Tags become sub-elements (attribute values passed through
    ``unescape``) and are converted recursively.  Comments and
    processing instructions become the corresponding ``etree`` nodes.
    Plain strings and any remaining node type are appended as text.
    """
    SubElement = etree.SubElement
    # Most recently appended child of ``parent``.  Text that follows it
    # goes into its tail; while it is None, text goes into parent.text.
    et_child = None
    for child in beautiful_soup_tree:
        if isinstance(child, Tag):
            attrib = dict([(k, unescape(v)) for (k, v) in child.attrs])
            et_child = SubElement(parent, child.name, attrib=attrib)
            _convert_children(et_child, child, makeelement)
        elif type(child) is NavigableString:
            # Exact type check on purpose: presumably Comment,
            # ProcessingInstruction etc. subclass NavigableString and
            # must fall through to the branches below.
            _append_text(parent, et_child, unescape(unicode(child)))
        elif isinstance(child, Comment):
            # Track the comment as the latest child so that text after
            # it is stored in the comment's tail, keeping document order.
            et_child = etree.Comment(child.string)
            parent.append(et_child)
        elif isinstance(child, ProcessingInstruction):
            # Split "target data" into at most two arguments; tracked
            # as the latest child for the same reason as comments.
            et_child = etree.ProcessingInstruction(
                *child.string.split(' ', 1))
            parent.append(et_child)
        else:  # CData
            _append_text(parent, et_child, unescape(unicode(child)))
62
def _append_text(parent, element, text):
    """Append ``text`` at the current position inside ``parent``.

    When ``element`` is given, the text is added to its tail; otherwise
    it is added to the text of ``parent``.  Missing (None or empty)
    text/tail is treated as the empty string.
    """
    if element is not None:
        element.tail = (element.tail or '') + text
        return
    parent.text = (parent.text or '') + text
# copied from ET's ElementSoup

try:
    import htmlentitydefs
except ImportError:
    # The module is named html.entities on Python 3.  The "as" form
    # binds only the alias, so the module-level name ``html`` imported
    # from lxml at the top of the file is not rebound.
    import html.entities as htmlentitydefs
import re

# Bound ``sub`` method of a pattern matching named entity references
# such as "&amp;"; group 1 is the entity name.  Raw string so that
# ``\w`` reaches the regex engine without an invalid string escape.
handle_entities = re.compile(r"&(\w+);").sub

try:
    name2codepoint = htmlentitydefs.name2codepoint
except AttributeError:
    # Emulate name2codepoint for Python 2.2 and earlier
    name2codepoint = {}
    for name, entity in htmlentitydefs.entitydefs.items():
        if len(entity) == 1:
            # entity is the character itself
            name2codepoint[name] = ord(entity)
        else:
            # otherwise strip the first two and the last character and
            # read the rest as a decimal code point
            name2codepoint[name] = int(entity[2:-1])
def unescape(string):
    """Replace named entity references in ``string`` with characters.

    Works around oddities in BeautifulSoup's entity handling.  Every
    match of ``handle_entities`` whose name is a key of
    ``name2codepoint`` is replaced by the character with that code
    point; references with unknown names are kept verbatim.
    """
    def _replace(match):
        entity_name = match.group(1)
        if entity_name in name2codepoint:
            return unichr(name2codepoint[entity_name])
        return match.group(0)  # use as is

    return handle_entities(_replace, string)