Package lxml :: Package html :: Module html5parser
[hide private]
[frames | no frames]

Source Code for Module lxml.html.html5parser

  1  """ 
  2  An interface to html5lib that mimics the lxml.html interface. 
  3  """ 
  4   
  5  import sys 
  6  import string 
  7   
  8  from html5lib import HTMLParser as _HTMLParser 
  9  from html5lib.treebuilders.etree_lxml import TreeBuilder 
 10   
 11  from lxml import etree 
 12  from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE, Element 
 13   
 14  # python3 compatibility 
 15  try: 
 16      _strings = basestring 
 17  except NameError: 
 18      _strings = (bytes, str) 
 19  try: 
 20      from urllib2 import urlopen 
 21  except ImportError: 
 22      from urllib.request import urlopen 
 23  try: 
 24      from urlparse import urlparse 
 25  except ImportError: 
 26      from urllib.parse import urlparse 
 27   
class HTMLParser(_HTMLParser):
    """An html5lib HTML parser with lxml as tree.

    Identical to the html5lib parser it derives from, except that the tree
    builder is always the lxml ``TreeBuilder`` imported by this module.
    """

    def __init__(self, strict=False, **kwargs):
        """Create the parser.

        :param strict: forwarded unchanged to the html5lib parser.
        :param kwargs: any further keyword arguments, forwarded unchanged.
        """
        _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)


# html5lib may not export XHTMLParser; a failed import is tolerated, in which
# case neither ``XHTMLParser`` nor ``xhtml_parser`` is defined by this module.
try:
    from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
    pass
else:
    class XHTMLParser(_XHTMLParser):
        """An html5lib XHTML Parser with lxml as tree."""

        def __init__(self, strict=False, **kwargs):
            """Create the parser.

            :param strict: forwarded unchanged to the html5lib parser.
            :param kwargs: any further keyword arguments, forwarded unchanged.
            """
            _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder,
                                  **kwargs)

    # Shared module-level XHTML parser instance.
    xhtml_parser = XHTMLParser()


49 -def _find_tag(tree, tag):
50 elem = tree.find(tag) 51 if elem is not None: 52 return elem 53 return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
54 55
def document_fromstring(html, guess_charset=True, parser=None):
    """Parse a string into a whole document and return its root element.

    :param html: the markup, as a byte string or a text string.
    :param guess_charset: forwarded to the parser as ``useChardet`` when
        ``html`` is a byte string; ignored for text input.
    :param parser: parser to use; defaults to the module-level
        ``html_parser``.
    :raises TypeError: if ``html`` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    options = {}
    if isinstance(html, bytes):
        # Charset guessing only applies to undecoded input.  html5lib 1.x
        # raises TypeError when ``useChardet`` is passed along with text
        # input, so the option is only forwarded for byte strings.
        options['useChardet'] = guess_charset
    return parser.parse(html, **options).getroot()


def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=False, parser=None):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.  If no_leading_text is true,
    then it will be an error if there is leading text, and it will always be
    a list of only elements.

    If `guess_charset` is `True` and the text was not unicode but a
    bytestring, the `chardet` library will perform charset guessing on the
    string.

    :param parser: parser to use; defaults to the module-level
        ``html_parser``.
    :raises TypeError: if ``html`` is not a string.
    :raises etree.ParserError: if ``no_leading_text`` is true and the
        fragment starts with non-whitespace text.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    options = {}
    if isinstance(html, bytes):
        # html5lib 1.x raises TypeError when ``useChardet`` is passed along
        # with text input, so only forward it for byte strings.
        options['useChardet'] = guess_charset
    children = parser.parseFragment(html, 'div', **options)
    if children and isinstance(children[0], _strings):
        if no_leading_text:
            if children[0].strip():
                raise etree.ParserError('There is leading text: %r' %
                                        children[0])
            # Whitespace-only leading text is silently dropped.
            del children[0]
    return children


def fragment_fromstring(html, create_parent=False,
                        guess_charset=False, parser=None):
    """Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In
    this case, leading or trailing text is allowed.

    :param guess_charset: passed through to ``fragments_fromstring``.
    :param parser: passed through to ``fragments_fromstring``.
    :raises TypeError: if ``html`` is not a string.
    :raises etree.ParserError: without ``create_parent``, if no element or
        more than one element is found, or if the element is followed by
        non-whitespace text.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    # Leading text is only acceptable when a wrapper element will hold it.
    accept_leading_text = bool(create_parent)

    elements = fragments_fromstring(
        html, guess_charset=guess_charset, parser=parser,
        no_leading_text=not accept_leading_text)

    if create_parent:
        # Any true non-string value (e.g. ``True``) selects a <div> wrapper.
        if not isinstance(create_parent, _strings):
            create_parent = 'div'
        new_root = Element(create_parent)
        if elements:
            if isinstance(elements[0], _strings):
                # Leading text becomes the wrapper's own text.
                new_root.text = elements[0]
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        raise etree.ParserError('Multiple elements found')
    result = elements[0]
    if result.tail and result.tail.strip():
        raise etree.ParserError('Element followed by text: %r' % result.tail)
    # Only whitespace (or nothing) follows the element: discard it.
    result.tail = None
    return result


def fromstring(html, guess_charset=True, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    The input is always parsed as a full document first.  The document root
    is returned if the input starts with a doctype or ``<html`` tag, or if
    the parsed ``<head>`` has children.  Otherwise, if ``<body>`` holds
    exactly one element and no significant text, that element is returned.
    In all other cases the ``<body>`` element itself is returned, renamed to
    ``div`` (if it contains block-level tags) or ``span``.

    :param html: the markup, as a byte string or a text string.
    :param guess_charset: passed through to ``document_fromstring``.
    :param parser: passed through to ``document_fromstring``.
    :raises TypeError: if ``html`` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # document starts with doctype or <html>, full document!
    start = html[:50]
    if isinstance(start, bytes):
        # Allow text comparison in python3.
        # Decode as ascii, that also covers latin-1 and utf-8 for the
        # characters we need.
        start = start.decode('ascii', 'replace')

    start = start.lstrip().lower()
    if start.startswith(('<html', '<!doctype')):
        return doc

    head = _find_tag(doc, 'head')

    # if the head is not empty we have a full document
    if len(head):
        return doc

    body = _find_tag(doc, 'body')

    # The body has just one element, so it was probably a single
    # element passed in
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body


def parse(filename_url_or_file, guess_charset=True, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    :param filename_url_or_file: a string (treated as a URL if
        ``_looks_like_url`` says so, otherwise as a file name) or any
        non-string object, which is handed to the parser as is.
    :param guess_charset: forwarded to the parser as ``useChardet``.
    :param parser: parser to use; defaults to the module-level
        ``html_parser``.
    """
    if parser is None:
        parser = html_parser
    if not isinstance(filename_url_or_file, _strings):
        # Caller-supplied file object: the caller stays responsible for
        # closing it.
        return parser.parse(filename_url_or_file, useChardet=guess_charset)

    if _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
    else:
        fp = open(filename_url_or_file, 'rb')
    try:
        return parser.parse(fp, useChardet=guess_charset)
    finally:
        # The handle was opened here, so release it here as well instead of
        # leaking it until garbage collection.
        fp.close()


201 -def _looks_like_url(str):
202 scheme = urlparse(str)[0] 203 if not scheme: 204 return False 205 elif (sys.platform == 'win32' and 206 scheme in string.ascii_letters 207 and len(scheme) == 1): 208 # looks like a 'normal' absolute path 209 return False 210 else: 211 return True
# Shared default parser, used by the functions in this module whenever they
# are called with ``parser=None``.
html_parser = HTMLParser()