
Source Code for Module lxml.html.html5parser

  1  """ 
  2  An interface to html5lib that mimics the lxml.html interface. 
  3  """ 
  4   
try:
    from urllib import urlopen          # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3

from html5lib import HTMLParser as _HTMLParser
from html5lib.treebuilders.etree_lxml import TreeBuilder

from lxml import etree
from lxml.html import _contains_block_level_tag, Element, XHTML_NAMESPACE

# python3 compatibility
try:
    _strings = basestring
except NameError:
    _strings = (bytes, str)


class HTMLParser(_HTMLParser):
    """An html5lib HTML parser with lxml as tree."""

    def __init__(self, strict=False, **kwargs):
        _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)


try:
    from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
    pass
else:
    class XHTMLParser(_XHTMLParser):
        """An html5lib XHTML Parser with lxml as tree."""

        def __init__(self, strict=False, **kwargs):
            _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)

    xhtml_parser = XHTMLParser()


def _find_tag(tree, tag):
    elem = tree.find(tag)
    if elem is not None:
        return elem
    return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))


def document_fromstring(html, guess_charset=True, parser=None):
    """Parse a whole document from a string."""
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    return parser.parse(html, useChardet=guess_charset).getroot()

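A minimal usage sketch (not part of the module source): assuming lxml and html5lib are installed, document_fromstring always builds a complete document and returns its root element::

    from lxml.html import html5parser

    doc = html5parser.document_fromstring('<title>demo</title><p>Hello</p>')
    # html5lib fills in the missing structure; doc is the <html> root element,
    # namespaced under http://www.w3.org/1999/xhtml by default.
    print(doc.tag)
    print(len(doc))  # the generated <head> and <body>
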
def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=False, parser=None):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.  If no_leading_text is true,
    then it will be an error if there is leading text, and it will always be
    a list of only elements.

    If `guess_charset` is `True` and the text was not unicode but a
    bytestring, the `chardet` library will perform charset guessing on the
    string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    children = parser.parseFragment(html, 'div', useChardet=guess_charset)
    if children and isinstance(children[0], _strings):
        if no_leading_text:
            if children[0].strip():
                raise etree.ParserError('There is leading text: %r' %
                                        children[0])
            del children[0]
    return children

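An illustrative call (not part of the module): leading text is kept as the first list item unless no_leading_text is set, in which case non-whitespace leading text is rejected::

    from lxml import etree
    from lxml.html import html5parser

    parts = html5parser.fragments_fromstring('intro <b>bold</b> <i>italic</i>')
    print(parts[0])                      # the leading string, e.g. 'intro '
    print([el.tag for el in parts[1:]])  # the <b> and <i> elements

    try:
        html5parser.fragments_fromstring('intro <b>bold</b>', no_leading_text=True)
    except etree.ParserError as exc:
        print('rejected:', exc)
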
def fragment_fromstring(html, create_parent=False,
                        guess_charset=False, parser=None):
    """Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In
    this case, leading or trailing text is allowed.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    accept_leading_text = bool(create_parent)

    elements = fragments_fromstring(
        html, guess_charset=guess_charset, parser=parser,
        no_leading_text=not accept_leading_text)

    if create_parent:
        if not isinstance(create_parent, _strings):
            create_parent = 'div'
        new_root = Element(create_parent)
        if elements:
            if isinstance(elements[0], _strings):
                new_root.text = elements[0]
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        raise etree.ParserError('Multiple elements found')
    result = elements[0]
    if result.tail and result.tail.strip():
        raise etree.ParserError('Element followed by text: %r' % result.tail)
    result.tail = None
    return result

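A hedged usage sketch (not part of the module): a single element with only surrounding whitespace is accepted as-is, while create_parent wraps mixed content in a new container (the tag name 'section' below is an arbitrary choice)::

    from lxml.html import html5parser

    el = html5parser.fragment_fromstring('  <p>one paragraph</p>  ')
    print(el.text)                    # 'one paragraph'

    wrapper = html5parser.fragment_fromstring('text <b>and</b> more',
                                              create_parent='section')
    print(wrapper.tag, wrapper.text)  # 'section', with the leading text on the wrapper
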
def fromstring(html, guess_charset=True, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # document starts with doctype or <html>, full document!
    start = html[:50].lstrip().lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        return doc

    head = _find_tag(doc, 'head')

    # if the head is not empty we have a full document
    if len(head):
        return doc

    body = _find_tag(doc, 'body')

    # The body has just one element, so it was probably a single
    # element passed in
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body

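An illustrative sketch (not part of the module) of the two outcomes described above: a recognisable document comes back whole, while a loose run of inline tags is wrapped in a fake container::

    from lxml.html import html5parser

    root = html5parser.fromstring('<!DOCTYPE html><html><body><p>x</p></body></html>')
    print(root.tag)   # the <html> root element

    frag = html5parser.fromstring('<b>one</b> <i>two</i>')
    print(frag.tag)   # 'span' here, since no block-level tag is involved
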
def parse(filename_url_or_file, guess_charset=True, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.
    """
    if parser is None:
        parser = html_parser
    if isinstance(filename_url_or_file, _strings):
        fp = urlopen(filename_url_or_file)
    else:
        fp = filename_url_or_file
    return parser.parse(fp, useChardet=guess_charset)


html_parser = HTMLParser()
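
Finally, a small sketch (not part of the module) of parse() with a file-like object; it returns a tree, so getroot() is needed to reach the root element::

    import io
    from lxml.html import html5parser

    tree = html5parser.parse(io.BytesIO(b'<html><body><p>From a stream</p></body></html>'))
    root = tree.getroot()
    print(root.tag)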