
Source Code for Module lxml.html.soupparser

  1  """External interface to the BeautifulSoup HTML parser. 
  2  """ 
  3   
  4  __all__ = ["fromstring", "parse", "convert_tree"] 
  5   
  6  import re 
  7  from lxml import etree, html 
  8   
  9  try: 
 10      from bs4 import ( 
 11          BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, 
 12          Declaration, Doctype) 
 13      _DECLARATION_OR_DOCTYPE = (Declaration, Doctype) 
 14  except ImportError: 
 15      from BeautifulSoup import ( 
 16          BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, 
 17          Declaration) 
 18      _DECLARATION_OR_DOCTYPE = Declaration 
def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a string of HTML data into an Element tree using the
    BeautifulSoup parser.

    Returns the root ``<html>`` Element of the tree.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword. By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.
    """
    return _parse(data, beautifulsoup, makeelement, **bsargs)

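The snippet below is an illustrative usage sketch, not part of the module source: it assumes BeautifulSoup 4 is installed with the default builder, and the HTML fragment and the expected values in the comments are only indicative.

    from lxml.html import soupparser

    # Malformed markup still yields a usable element tree.
    root = soupparser.fromstring("<p>Hello, <b>world</p>")
    print(root.tag)               # 'html' -- the wrapping root element
    print(root.findtext(".//b"))  # 'world'
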
def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a file into an ElementTree using the BeautifulSoup parser.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword. By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.
    """
    if not hasattr(file, 'read'):
        file = open(file)
    root = _parse(file, beautifulsoup, makeelement, **bsargs)
    return etree.ElementTree(root)

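A similarly small sketch of parse(); the file name is hypothetical and bs4 is assumed to be available. The returned object is an ElementTree, so the root element is reached via getroot().

    from lxml import etree
    from lxml.html import soupparser

    tree = soupparser.parse("page.html")   # hypothetical file name
    root = tree.getroot()
    print(etree.tostring(root, pretty_print=True).decode())
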
def convert_tree(beautiful_soup_tree, makeelement=None):
    """Convert a BeautifulSoup tree to a list of Element trees.

    Returns a list instead of a single root Element to support
    HTML-like soup with more than one root element.

    You can pass a different Element factory through the `makeelement`
    keyword.
    """
    root = _convert_tree(beautiful_soup_tree, makeelement)
    children = root.getchildren()
    for child in children:
        root.remove(child)
    return children

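A sketch of convert_tree() on a multi-root soup, assuming bs4 with the 'html.parser' builder; the exact list of roots depends on how the underlying parser splits the input.

    from bs4 import BeautifulSoup
    from lxml.html import soupparser

    soup = BeautifulSoup(
        "<meta><head><title>Hi</title></head><body>text", "html.parser")
    roots = soupparser.convert_tree(soup)
    print([el.tag for el in roots])   # e.g. ['meta', 'head', 'body']
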
# helpers

def _parse(source, beautifulsoup, makeelement, **bsargs):
    if beautifulsoup is None:
        beautifulsoup = BeautifulSoup
    if hasattr(beautifulsoup, "HTML_ENTITIES"):  # bs3
        if 'convertEntities' not in bsargs:
            bsargs['convertEntities'] = 'html'
    if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"):  # bs4
        if 'features' not in bsargs:
            bsargs['features'] = 'html.parser'  # use Python html parser
    tree = beautifulsoup(source, **bsargs)
    root = _convert_tree(tree, makeelement)
    # from ET: wrap the document in a html root element, if necessary
    if len(root) == 1 and root[0].tag == "html":
        return root[0]
    root.tag = "html"
    return root

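Because _parse() only fills in defaults and forwards **bsargs to BeautifulSoup, the underlying builder can be overridden from the public functions. A sketch, assuming the optional html5lib builder is installed:

    from lxml.html import soupparser

    root = soupparser.fromstring("<p>broken<table>soup", features="html5lib")
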
_parse_doctype_declaration = re.compile(
    r'(?:\s|[<!])*DOCTYPE\s*HTML'
    r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?'
    r'(?:\s+(\'[^\']*\'|"[^"]*"))?',
    re.IGNORECASE).match

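For reference, a sketch of what _parse_doctype_declaration captures, using the HTML 4.01 strict public identifier as input; note that both groups still carry their surrounding quotes, which the caller strips below.

    m = _parse_doctype_declaration(
        'DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" '
        '"http://www.w3.org/TR/html4/strict.dtd"')
    print(m.groups())
    # ('"-//W3C//DTD HTML 4.01//EN"', '"http://www.w3.org/TR/html4/strict.dtd"')
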
class _PseudoTag:
    # Minimal imitation of BeautifulSoup.Tag
    def __init__(self, contents):
        self.name = 'html'
        self.attrs = []
        self.contents = contents

    def __iter__(self):
        return self.contents.__iter__()

def _convert_tree(beautiful_soup_tree, makeelement):
    if makeelement is None:
        makeelement = html.html_parser.makeelement

    # Split the tree into three parts:
    #   i) everything before the root element: document type
    #      declaration, comments, processing instructions, whitespace
    #  ii) the root(s),
    # iii) everything after the root: comments, processing
    #      instructions, whitespace
    first_element_idx = last_element_idx = None
    html_root = declaration = None
    for i, e in enumerate(beautiful_soup_tree):
        if isinstance(e, Tag):
            if first_element_idx is None:
                first_element_idx = i
            last_element_idx = i
            if html_root is None and e.name and e.name.lower() == 'html':
                html_root = e
        elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE):
            declaration = e

    # For a nice, well-formatted document, the variable roots below is
    # a list consisting of a single <html> element. However, the document
    # may be a soup like '<meta><head><title>Hello</head><body>Hi
    # all</p>'. In this example roots is a list containing meta, head
    # and body elements.
    if first_element_idx is None:
        pre_root = post_root = []
        roots = beautiful_soup_tree.contents
    else:
        pre_root = beautiful_soup_tree.contents[:first_element_idx]
        roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1]
        post_root = beautiful_soup_tree.contents[last_element_idx+1:]

    # Reorganize so that there is one <html> root...
    if html_root is not None:
        # ... use existing one if possible, ...
        i = roots.index(html_root)
        html_root.contents = roots[:i] + html_root.contents + roots[i+1:]
    else:
        # ... otherwise create a new one.
        html_root = _PseudoTag(roots)

    convert_node = _init_node_converters(makeelement)

    # Process pre_root
    res_root = convert_node(html_root)
    prev = res_root
    for e in reversed(pre_root):
        converted = convert_node(e)
        if converted is not None:
            prev.addprevious(converted)
            prev = converted

    # ditto for post_root
    prev = res_root
    for e in post_root:
        converted = convert_node(e)
        if converted is not None:
            prev.addnext(converted)
            prev = converted

    if declaration is not None:
        try:
            # bs4 provides full Doctype string
            doctype_string = declaration.output_ready()
        except AttributeError:
            doctype_string = declaration.string

        match = _parse_doctype_declaration(doctype_string)
        if not match:
            # Something is wrong if we end up in here. Since soupparser should
            # tolerate errors, do not raise Exception, just let it pass.
            pass
        else:
            external_id, sys_uri = match.groups()
            docinfo = res_root.getroottree().docinfo
            # strip quotes and update DOCTYPE values (any of None, '', '...')
            docinfo.public_id = external_id and external_id[1:-1]
            docinfo.system_url = sys_uri and sys_uri[1:-1]

    return res_root

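As a quick illustration of the pre_root/post_root handling above: comments outside the <html> element survive as siblings of the converted root. This sketch goes through the public fromstring() wrapper and assumes bs4; the serialized output described in the comment is only indicative.

    from lxml import etree
    from lxml.html import soupparser

    root = soupparser.fromstring(
        "<!-- before --><html><body><p>hi</p></body></html><!-- after -->")
    print(etree.tostring(root.getroottree()).decode())
    # the two comments should appear before and after the <html> element
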
def _init_node_converters(makeelement):
    converters = {}
    ordered_node_types = []

    def converter(*types):
        def add(handler):
            for t in types:
                converters[t] = handler
                ordered_node_types.append(t)
            return handler
        return add

    def find_best_converter(node):
        for t in ordered_node_types:
            if isinstance(node, t):
                return converters[t]
        return None

    def convert_node(bs_node, parent=None):
        # duplicated in convert_tag() below
        try:
            handler = converters[type(bs_node)]
        except KeyError:
            handler = converters[type(bs_node)] = find_best_converter(bs_node)
        if handler is None:
            return None
        return handler(bs_node, parent)

    def map_attrs(bs_attrs):
        if isinstance(bs_attrs, dict):  # bs4
            attribs = {}
            for k, v in bs_attrs.items():
                if isinstance(v, list):
                    v = " ".join(v)
                attribs[k] = unescape(v)
        else:
            attribs = dict((k, unescape(v)) for k, v in bs_attrs)
        return attribs

    def append_text(parent, text):
        if len(parent) == 0:
            parent.text = (parent.text or '') + text
        else:
            parent[-1].tail = (parent[-1].tail or '') + text

    # converters are tried in order of their definition

    @converter(Tag, _PseudoTag)
    def convert_tag(bs_node, parent):
        attrs = bs_node.attrs
        if parent is not None:
            attribs = map_attrs(attrs) if attrs else None
            res = etree.SubElement(parent, bs_node.name, attrib=attribs)
        else:
            attribs = map_attrs(attrs) if attrs else {}
            res = makeelement(bs_node.name, attrib=attribs)

        for child in bs_node:
            # avoid double recursion by inlining convert_node(), see above
            try:
                handler = converters[type(child)]
            except KeyError:
                pass
            else:
                if handler is not None:
                    handler(child, res)
                continue
            convert_node(child, res)
        return res

    @converter(Comment)
    def convert_comment(bs_node, parent):
        res = html.HtmlComment(bs_node)
        if parent is not None:
            parent.append(res)
        return res

    @converter(ProcessingInstruction)
    def convert_pi(bs_node, parent):
        if bs_node.endswith('?'):
            # The PI is of XML style (<?as df?>) but BeautifulSoup
            # interpreted it as being SGML style (<?as df>). Fix.
            bs_node = bs_node[:-1]
        res = etree.ProcessingInstruction(*bs_node.split(' ', 1))
        if parent is not None:
            parent.append(res)
        return res

    @converter(NavigableString)
    def convert_text(bs_node, parent):
        if parent is not None:
            append_text(parent, unescape(bs_node))
        return None

    return convert_node


# copied from ET's ElementSoup

try:
    from html.entities import name2codepoint  # Python 3
except ImportError:
    from htmlentitydefs import name2codepoint


handle_entities = re.compile(r"&(\w+);").sub


try:
    unichr
except NameError:
    # Python 3
    unichr = chr

def unescape(string):
    if not string:
        return ''
    # work around oddities in BeautifulSoup's entity handling
    def unescape_entity(m):
        try:
            return unichr(name2codepoint[m.group(1)])
        except KeyError:
            return m.group(0)  # use as is
    return handle_entities(unescape_entity, string)

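Finally, a sketch of unescape(), which only rewrites named entities known to name2codepoint and leaves anything else untouched; the strings here are invented examples.

    print(unescape("Fish &amp; chips &copy; 2024"))    # Fish & chips © 2024
    print(unescape("unknown &entity; is kept as is"))  # unknown &entity; is kept as is
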