Package lxml :: Package html :: Module soupparser
[frames | no frames]

Source Code for Module lxml.html.soupparser

  1  __doc__ = """External interface to the BeautifulSoup HTML parser. 
  2  """ 
  3   
  4  __all__ = ["fromstring", "parse", "convert_tree"] 
  5   
  6  from lxml import etree, html 
  7  from BeautifulSoup import \ 
  8       BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString 
  9   
 10   
def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
    """Build an Element tree from a string of HTML via BeautifulSoup.

    The return value is the root ``<html>`` Element of the tree.

    A different BeautifulSoup parser can be supplied with the
    `beautifulsoup` keyword, and a different Element factory function
    with the `makeelement` keyword.  When they are omitted, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.  Any further keyword arguments are forwarded unchanged to
    ``_parse``.
    """
    html_root = _parse(data, beautifulsoup, makeelement, **bsargs)
    return html_root
24
def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a file into an ElementTree using the BeautifulSoup parser.

    `file` is either an object with a ``read`` attribute, which is used
    as-is and left open for the caller, or a value accepted by the
    builtin ``open()``, in which case the file is opened here and closed
    again once parsing has finished.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.  Any further keyword arguments are forwarded to ``_parse``.

    Returns an ``etree.ElementTree`` wrapping the parsed root element.
    """
    if hasattr(file, 'read'):
        # Caller-owned file object: read from it, but do not close it.
        root = _parse(file, beautifulsoup, makeelement, **bsargs)
    else:
        # We opened it, so we are responsible for closing it, even when
        # parsing raises.
        source = open(file)
        try:
            root = _parse(source, beautifulsoup, makeelement, **bsargs)
        finally:
            source.close()
    return etree.ElementTree(root)
38
def convert_tree(beautiful_soup_tree, makeelement=None):
    """Convert a BeautifulSoup tree to a list of Element trees.

    Returns a list instead of a single root Element to support
    HTML-like soup with more than one root element.  The returned
    elements are detached from the temporary root that was built for the
    conversion.

    You can pass a different Element factory through the `makeelement`
    keyword; by default ``html.html_parser.makeelement`` is used.
    """
    if makeelement is None:
        makeelement = html.html_parser.makeelement
    root = _convert_tree(beautiful_soup_tree, makeelement)
    # Snapshot the children before detaching them; list(element) is the
    # documented replacement for the deprecated getchildren().
    children = list(root)
    for child in children:
        root.remove(child)
    return children
# helpers
def _parse(source, beautifulsoup, makeelement, **bsargs):
    """Run the soup parser over `source` and return an ``html`` root Element.

    ``None`` for `beautifulsoup` selects the ``BeautifulSoup`` class and
    ``None`` for `makeelement` selects ``html.html_parser.makeelement``.
    Unless the caller passes its own ``convertEntities`` value, ``'html'``
    is used.  All keyword arguments are handed to the parser class.
    """
    parser_class = BeautifulSoup if beautifulsoup is None else beautifulsoup
    if makeelement is None:
        makeelement = html.html_parser.makeelement
    bsargs.setdefault('convertEntities', 'html')
    soup = parser_class(source, **bsargs)
    root = _convert_tree(soup, makeelement)
    # from ET: wrap the document in a html root element, if necessary
    if len(root) != 1 or root[0].tag != "html":
        # No single <html> child: reuse the converted root as <html>.
        root.tag = "html"
        return root
    return root[0]
73
def _convert_tree(beautiful_soup_tree, makeelement):
    """Create an Element for the soup node and convert its children into it.

    The new element is built with `makeelement` from the node's ``name``
    and a dict made from its ``attrs``; the populated element is returned.
    """
    tag_name = beautiful_soup_tree.name
    attributes = dict(beautiful_soup_tree.attrs)
    root = makeelement(tag_name, attrib=attributes)
    _convert_children(root, beautiful_soup_tree, makeelement)
    return root
79
def _convert_children(parent, beautiful_soup_tree, makeelement):
    """Append the converted children of a soup node to `parent`.

    Tags become sub-elements (converted recursively) with unescaped
    attribute values, comments and processing instructions become the
    matching etree nodes, and all other children are treated as text and
    unescaped.  Text is attached to ``parent.text`` while nothing has been
    appended yet, and to the tail of the most recently appended node
    afterwards, so that document order is preserved.

    `makeelement` is only passed along to the recursive calls.
    """
    SubElement = etree.SubElement
    # Most recently appended node (element, comment or PI).  None means
    # nothing has been appended to `parent` yet.
    et_child = None
    for child in beautiful_soup_tree:
        if isinstance(child, Tag):
            et_child = SubElement(parent, child.name, attrib=dict(
                [(k, unescape(v)) for (k, v) in child.attrs]))
            _convert_children(et_child, child, makeelement)
        elif type(child) is NavigableString:
            # Exact type match on purpose: anything that is not precisely
            # a NavigableString falls through to the branches below.
            _append_text(parent, et_child, unescape(child))
        elif isinstance(child, Comment):
            # Remember the comment so that following text lands in its
            # tail instead of being placed in front of it.
            et_child = etree.Comment(child)
            parent.append(et_child)
        elif isinstance(child, ProcessingInstruction):
            # Split "target data" at the first space into the
            # (target, text) arguments; same tail handling as comments.
            et_child = etree.ProcessingInstruction(*child.split(' ', 1))
            parent.append(et_child)
        else:  # CData
            _append_text(parent, et_child, unescape(child))
98
def _append_text(parent, element, text):
    """Add `text` after `element`, or inside `parent` when there is none.

    With an `element`, the text is appended to ``element.tail``; when
    `element` is None it is appended to ``parent.text``.  A missing
    (falsy) existing value is treated as the empty string.
    """
    if element is not None:
        element.tail = (element.tail or '') + text
        return
    parent.text = (parent.text or '') + text
# copied from ET's ElementSoup

import re

try:
    from html.entities import name2codepoint  # Python 3
except ImportError:
    from htmlentitydefs import name2codepoint  # Python 2

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr() already returns a unicode character

# Raw string so that ``\w`` reaches the regex engine as written and is
# not treated as an (invalid) string escape sequence.
handle_entities = re.compile(r"&(\w+);").sub


def unescape(string):
    """Replace named character entities in `string` with their characters.

    Every ``&name;`` whose name is a key of ``name2codepoint`` is replaced
    by the corresponding character; unknown names are left untouched.
    Numeric references such as ``&#38;`` do not match the pattern and are
    therefore returned unchanged.
    """
    # work around oddities in BeautifulSoup's entity handling
    def unescape_entity(m):
        try:
            return _unichr(name2codepoint[m.group(1)])
        except KeyError:
            return m.group(0)  # use as is
    return handle_entities(unescape_entity, string)