Package lxml :: Package html :: Module html5parser
[hide private]
[frames] | [no frames]

Source Code for Module lxml.html.html5parser

  1  """ 
  2  An interface to html5lib that mimics the lxml.html interface. 
  3  """ 
  4   
import sys

from html5lib import HTMLParser as _HTMLParser
from html5lib.treebuilders.etree_lxml import TreeBuilder

from lxml import etree
from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE, Element
 10   
 11  # python3 compatibility 
 12  try: 
 13      _strings = basestring 
 14  except NameError: 
 15      _strings = (bytes, str) 
 16  try: 
 17      from urllib2 import urlopen 
 18  except ImportError: 
 19      from urllib.request import urlopen 
 20  try: 
 21      from urlparse import urlparse 
 22  except ImportError: 
 23      from urllib.parse import urlparse 
 24   
25 -class HTMLParser(_HTMLParser):
26 """An html5lib HTML parser with lxml as tree.""" 27
28 - def __init__(self, strict=False, **kwargs):
29 _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
30 31 32 try: 33 from html5lib import XHTMLParser as _XHTMLParser 34 except ImportError: 35 pass 36 else:
37 - class XHTMLParser(_XHTMLParser):
38 """An html5lib XHTML Parser with lxml as tree.""" 39
40 - def __init__(self, strict=False, **kwargs):
41 _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
42 43 xhtml_parser = XHTMLParser() 44 45
46 -def _find_tag(tree, tag):
47 elem = tree.find(tag) 48 if elem is not None: 49 return elem 50 return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
51 52
53 -def document_fromstring(html, guess_charset=True, parser=None):
54 """Parse a whole document into a string.""" 55 if not isinstance(html, _strings): 56 raise TypeError('string required') 57 58 if parser is None: 59 parser = html_parser 60 61 return parser.parse(html, useChardet=guess_charset).getroot()
62 63
64 -def fragments_fromstring(html, no_leading_text=False, 65 guess_charset=False, parser=None):
66 """Parses several HTML elements, returning a list of elements. 67 68 The first item in the list may be a string. If no_leading_text is true, 69 then it will be an error if there is leading text, and it will always be 70 a list of only elements. 71 72 If `guess_charset` is `True` and the text was not unicode but a 73 bytestring, the `chardet` library will perform charset guessing on the 74 string. 75 """ 76 if not isinstance(html, _strings): 77 raise TypeError('string required') 78 79 if parser is None: 80 parser = html_parser 81 82 children = parser.parseFragment(html, 'div', useChardet=guess_charset) 83 if children and isinstance(children[0], _strings): 84 if no_leading_text: 85 if children[0].strip(): 86 raise etree.ParserError('There is leading text: %r' % 87 children[0]) 88 del children[0] 89 return children
90 91
92 -def fragment_fromstring(html, create_parent=False, 93 guess_charset=False, parser=None):
94 """Parses a single HTML element; it is an error if there is more than 95 one element, or if anything but whitespace precedes or follows the 96 element. 97 98 If create_parent is true (or is a tag name) then a parent node 99 will be created to encapsulate the HTML in a single element. In 100 this case, leading or trailing text is allowed. 101 """ 102 if not isinstance(html, _strings): 103 raise TypeError('string required') 104 105 accept_leading_text = bool(create_parent) 106 107 elements = fragments_fromstring( 108 html, guess_charset=guess_charset, parser=parser, 109 no_leading_text=not accept_leading_text) 110 111 if create_parent: 112 if not isinstance(create_parent, _strings): 113 create_parent = 'div' 114 new_root = Element(create_parent) 115 if elements: 116 if isinstance(elements[0], _strings): 117 new_root.text = elements[0] 118 del elements[0] 119 new_root.extend(elements) 120 return new_root 121 122 if not elements: 123 raise etree.ParserError('No elements found') 124 if len(elements) > 1: 125 raise etree.ParserError('Multiple elements found') 126 result = elements[0] 127 if result.tail and result.tail.strip(): 128 raise etree.ParserError('Element followed by text: %r' % result.tail) 129 result.tail = None 130 return result
131 132
133 -def fromstring(html, guess_charset=True, parser=None):
134 """Parse the html, returning a single element/document. 135 136 This tries to minimally parse the chunk of text, without knowing if it 137 is a fragment or a document. 138 139 base_url will set the document's base_url attribute (and the tree's docinfo.URL) 140 """ 141 if not isinstance(html, _strings): 142 raise TypeError('string required') 143 doc = document_fromstring(html, parser=parser, 144 guess_charset=guess_charset) 145 146 # document starts with doctype or <html>, full document! 147 start = html[:50].lstrip().lower() 148 if start.startswith('<html') or start.startswith('<!doctype'): 149 return doc 150 151 head = _find_tag(doc, 'head') 152 153 # if the head is not empty we have a full document 154 if len(head): 155 return doc 156 157 body = _find_tag(doc, 'body') 158 159 # The body has just one element, so it was probably a single 160 # element passed in 161 if (len(body) == 1 and (not body.text or not body.text.strip()) 162 and (not body[-1].tail or not body[-1].tail.strip())): 163 return body[0] 164 165 # Now we have a body which represents a bunch of tags which have the 166 # content that was passed in. We will create a fake container, which 167 # is the body tag, except <body> implies too much structure. 168 if _contains_block_level_tag(body): 169 body.tag = 'div' 170 else: 171 body.tag = 'span' 172 return body
173 174
175 -def parse(filename_url_or_file, guess_charset=True, parser=None):
176 """Parse a filename, URL, or file-like object into an HTML document 177 tree. Note: this returns a tree, not an element. Use 178 ``parse(...).getroot()`` to get the document root. 179 """ 180 if parser is None: 181 parser = html_parser 182 if not isinstance(filename_url_or_file, _strings): 183 fp = filename_url_or_file 184 elif _looks_like_url(filename_url_or_file): 185 fp = urlopen(filename_url_or_file) 186 else: 187 fp = open(filename_url_or_file, 'rb') 188 return parser.parse(fp, useChardet=guess_charset)
189
190 -def _looks_like_url(str):
191 scheme = urlparse(str)[0] 192 return scheme != ''
193 194 html_parser = HTMLParser() 195