Package lxml :: Package html :: Module soupparser
[hide private]
[frames | no frames]

Source Code for Module lxml.html.soupparser

  1  """External interface to the BeautifulSoup HTML parser. 
  2  """ 
  3   
  4  __all__ = ["fromstring", "parse", "convert_tree"] 
  5   
  6  import re 
  7  from lxml import etree, html 
  8   
  9  try: 
 10      from bs4 import ( 
 11          BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, 
 12          Declaration, Doctype) 
 13      _DECLARATION_OR_DOCTYPE = (Declaration, Doctype) 
 14  except ImportError: 
 15      from BeautifulSoup import ( 
 16          BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, 
 17          Declaration) 
 18      _DECLARATION_OR_DOCTYPE = Declaration 
def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
    """Build an Element tree from a string of HTML data with BeautifulSoup.

    The return value is the root ``<html>`` Element of the resulting tree.

    A different BeautifulSoup parser can be supplied through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  When they are left out, the
    standard ``BeautifulSoup`` class and the default factory of
    `lxml.html` are used.  All remaining keyword arguments are forwarded
    unchanged to the internal parsing helper.
    """
    html_root = _parse(data, beautifulsoup, makeelement, **bsargs)
    return html_root
34
def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a file into an ElementTree using the BeautifulSoup parser.

    `file` may be an object that has a ``read`` attribute, which is used
    as is and left open for the caller to manage.  Anything else is
    passed to the builtin ``open()``; a file opened that way is closed
    again before this function returns.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.

    Returns an ``etree.ElementTree`` wrapping the parsed root element.
    """
    if hasattr(file, 'read'):
        # Caller-owned file object: read from it, but do not close it.
        root = _parse(file, beautifulsoup, makeelement, **bsargs)
    else:
        # We open the file ourselves, so we must also close it.  The tree
        # has been fully converted by the time _parse() returns, so the
        # handle is no longer needed afterwards.
        with open(file) as source:
            root = _parse(source, beautifulsoup, makeelement, **bsargs)
    return etree.ElementTree(root)
49
def convert_tree(beautiful_soup_tree, makeelement=None):
    """Convert a BeautifulSoup tree to a list of Element trees.

    Returns a list instead of a single root Element to support
    HTML-like soup with more than one root element.

    You can pass a different Element factory through the `makeelement`
    keyword.
    """
    root = _convert_tree(beautiful_soup_tree, makeelement)
    # Snapshot the children first (list(root) replaces the deprecated
    # getchildren()), then detach each one from the temporary root so the
    # caller receives independent trees.
    children = list(root)
    for child in children:
        root.remove(child)
    return children
65
# helpers

def _parse(source, beautifulsoup, makeelement, **bsargs):
    """Run the BeautifulSoup parser on `source` and return an ``<html>`` root.

    `beautifulsoup` is the parser factory (``BeautifulSoup`` when None),
    `makeelement` is handed on to ``_convert_tree()``, and `bsargs` are
    passed to the parser factory after the defaults below are filled in.
    """
    soup_factory = BeautifulSoup if beautifulsoup is None else beautifulsoup

    # Only fill in defaults the caller did not provide explicitly.
    if hasattr(soup_factory, "HTML_ENTITIES"):  # bs3
        bsargs.setdefault('convertEntities', 'html')
    if hasattr(soup_factory, "DEFAULT_BUILDER_FEATURES"):  # bs4
        # use Python html parser
        bsargs.setdefault('features', ['html.parser'])

    soup = soup_factory(source, **bsargs)
    root = _convert_tree(soup, makeelement)

    # from ET: wrap the document in a html root element, if necessary
    has_single_html_child = len(root) == 1 and root[0].tag == "html"
    if not has_single_html_child:
        root.tag = "html"
        return root
    return root[0]
# Matcher for HTML DOCTYPE declarations (case-insensitive).  This is the
# bound ``match`` method of the compiled pattern: it returns None when the
# string is not an HTML DOCTYPE, otherwise a match object with two groups
# holding the first and second quoted identifier (quotes included), each
# of which is None when absent.
_parse_doctype_declaration = re.compile(
    r'(?:\s|[<!])*DOCTYPE\s*HTML'
    r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?'
    r'(?:\s+(\'[^\']*\'|"[^"]*"))?',
    flags=re.IGNORECASE,
).match
class _PseudoTag:
    """Minimal imitation of BeautifulSoup.Tag.

    Stands in for an ``<html>`` tag: it carries the fixed name 'html',
    an empty attribute list, and the given `contents`, and iterating it
    yields the items of `contents`.
    """

    def __init__(self, contents):
        self.contents = contents
        self.attrs = []
        self.name = 'html'

    def __iter__(self):
        return iter(self.contents)
103
def _convert_tree(beautiful_soup_tree, makeelement):
    """Convert a BeautifulSoup tree into an Element tree with one root.

    `makeelement` is the Element factory used for the root element; when
    it is None, ``html.html_parser.makeelement`` is used.

    Top-level nodes before the first element and after the last element
    are attached as siblings of the returned root.  If a DOCTYPE
    declaration is found and can be parsed, its public ID and system URL
    are stored in the docinfo of the resulting tree.

    Returns the converted root element.
    """
    if makeelement is None:
        makeelement = html.html_parser.makeelement

    # Split the tree into three parts:
    # i) everything before the root element: document type
    # declaration, comments, processing instructions, whitespace
    # ii) the root(s),
    # iii) everything after the root: comments, processing
    # instructions, whitespace
    first_element_idx = last_element_idx = None
    html_root = declaration = None
    for i, e in enumerate(beautiful_soup_tree):
        if isinstance(e, Tag):
            if first_element_idx is None:
                first_element_idx = i
            last_element_idx = i
            if html_root is None and e.name and e.name.lower() == 'html':
                html_root = e
        elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE):
            declaration = e

    # For a nice, well-formatted document, the variable roots below is
    # a list consisting of a single <html> element. However, the document
    # may be a soup like '<meta><head><title>Hello</head><body>Hi
    # all<\p>'. In this example roots is a list containing meta, head
    # and body elements.
    contents = beautiful_soup_tree.contents
    if first_element_idx is None:
        # No element at all (e.g. empty or text/comment-only input).
        # Both indices are still None here, so slicing with
        # ``last_element_idx+1`` would raise TypeError; instead, put
        # every top-level node inside the root that is created below.
        pre_root = post_root = []
        roots = contents
    else:
        pre_root = contents[:first_element_idx]
        roots = contents[first_element_idx:last_element_idx+1]
        post_root = contents[last_element_idx+1:]

    # Reorganize so that there is one <html> root...
    if html_root is not None:
        # ... use existing one if possible, ...
        i = roots.index(html_root)
        html_root.contents = roots[:i] + html_root.contents + roots[i+1:]
    else:
        # ... otherwise create a new one.
        html_root = _PseudoTag(roots)

    convert_node = _init_node_converters(makeelement)

    # Process pre_root: walk backwards from the root so that each
    # converted node is inserted directly before the previous one.
    res_root = convert_node(html_root)
    prev = res_root
    for e in reversed(pre_root):
        converted = convert_node(e)
        if converted is not None:
            prev.addprevious(converted)
            prev = converted

    # ditto for post_root
    prev = res_root
    for e in post_root:
        converted = convert_node(e)
        if converted is not None:
            prev.addnext(converted)
            prev = converted

    if declaration is not None:
        try:
            # bs4 provides full Doctype string
            doctype_string = declaration.output_ready()
        except AttributeError:
            doctype_string = declaration.string

        match = _parse_doctype_declaration(doctype_string)
        # Something is wrong if there is no match. Since soupparser should
        # tolerate errors, do not raise an exception, just skip the docinfo.
        if match:
            external_id, sys_uri = match.groups()
            docinfo = res_root.getroottree().docinfo
            # strip quotes and update DOCTYPE values (any of None, '', '...')
            docinfo.public_id = external_id and external_id[1:-1]
            docinfo.system_url = sys_uri and sys_uri[1:-1]

    return res_root
184
def _init_node_converters(makeelement):
    """Build the node conversion function used by ``_convert_tree()``.

    Handlers are registered per BeautifulSoup node type.  The returned
    ``convert_node(bs_node, parent=None)`` picks the handler for a node
    and returns whatever that handler returns, or None when no handler
    applies.  `makeelement` is used to create elements that have no
    parent.
    """
    converters = {}
    ordered_node_types = []

    def converter(*types):
        # Decorator factory: register the decorated handler for each of
        # the given node types, remembering the registration order.
        def add(handler):
            for node_type in types:
                converters[node_type] = handler
                ordered_node_types.append(node_type)
            return handler
        return add

    def find_best_converter(node):
        # First registered type the node is an instance of wins.
        for node_type in ordered_node_types:
            if isinstance(node, node_type):
                return converters[node_type]
        return None

    def convert_node(bs_node, parent=None):
        # duplicated in convert_tag() below
        node_type = type(bs_node)
        if node_type in converters:
            handler = converters[node_type]
        else:
            # Cache the lookup result (even None) for this exact type.
            handler = converters[node_type] = find_best_converter(bs_node)
        return None if handler is None else handler(bs_node, parent)

    def map_attrs(bs_attrs):
        if not isinstance(bs_attrs, dict):
            # not a dict: iterate as (name, value) pairs
            return dict((name, unescape(value)) for name, value in bs_attrs)
        # bs4
        mapped = {}
        for name, value in bs_attrs.items():
            if isinstance(value, list):
                value = " ".join(value)
            mapped[name] = unescape(value)
        return mapped

    def append_text(parent, text):
        # Text goes after the last child if there is one, otherwise it
        # becomes (part of) the parent's own text.
        if len(parent):
            last_child = parent[-1]
            last_child.tail = (last_child.tail or '') + text
        else:
            parent.text = (parent.text or '') + text

    # converters are tried in order of their definition

    @converter(Tag, _PseudoTag)
    def convert_tag(bs_node, parent):
        attrs = bs_node.attrs
        if parent is None:
            attribs = map_attrs(attrs) if attrs else {}
            res = makeelement(bs_node.name, attrib=attribs)
        else:
            attribs = map_attrs(attrs) if attrs else None
            res = etree.SubElement(parent, bs_node.name, attrib=attribs)

        for child in bs_node:
            # avoid double recursion by inlining convert_node(), see above
            child_type = type(child)
            if child_type not in converters:
                convert_node(child, res)
                continue
            handler = converters[child_type]
            if handler is not None:
                handler(child, res)
        return res

    @converter(Comment)
    def convert_comment(bs_node, parent):
        res = etree.Comment(bs_node)
        if parent is not None:
            parent.append(res)
        return res

    @converter(ProcessingInstruction)
    def convert_pi(bs_node, parent):
        if bs_node.endswith('?'):
            # The PI is of XML style (<?as df?>) but BeautifulSoup
            # interpreted it as being SGML style (<?as df>).  Fix.
            bs_node = bs_node[:-1]
        res = etree.ProcessingInstruction(*bs_node.split(' ', 1))
        if parent is not None:
            parent.append(res)
        return res

    @converter(NavigableString)
    def convert_text(bs_node, parent):
        # Text is merged into the parent; there is no node to return.
        if parent is not None:
            append_text(parent, unescape(bs_node))
        return None

    return convert_node


# copied from ET's ElementSoup

try:
    from html.entities import name2codepoint  # Python 3
except ImportError:
    from htmlentitydefs import name2codepoint


handle_entities = re.compile(r"&(\w+);").sub


try:
    unichr
except NameError:
    # Python 3
    unichr = chr
def unescape(string):
    """Replace named entities (``&name;``) in `string` by their characters.

    Names found in ``name2codepoint`` are replaced; any other ``&name;``
    sequence is left exactly as written.  A false value (None, empty
    string) yields ''.
    """
    if not string:
        return ''

    # work around oddities in BeautifulSoup's entity handling
    def replace_entity(match):
        codepoint = name2codepoint.get(match.group(1))
        if codepoint is None:
            return match.group(0)  # use as is
        return unichr(codepoint)

    return handle_entities(replace_entity, string)