lxml.html.html5parser

1 """ 2 An interface to html5lib that mimics the lxml.html interface. 3 """ 4 import sys 5 import string 6 7 from html5lib import HTMLParser as _HTMLParser 8 from html5lib.treebuilders.etree_lxml import TreeBuilder 9 from lxml import etree 10 from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag 11 12 # python3 compatibility 13 try: 14 _strings = basestring 15 except NameError: 16 _strings = (bytes, str) 17 try: 18 from urllib2 import urlopen 19 except ImportError: 20 from urllib.request import urlopen 21 try: 22 from urlparse import urlparse 23 except ImportError: 24 from urllib.parse import urlparse 25 26

class HTMLParser(_HTMLParser):
    """html5lib's HTML parser, preconfigured to build lxml trees."""

    def __init__(self, strict=False, **kwargs):
        """Create the parser.

        ``strict`` and any additional keyword arguments are forwarded to
        the html5lib parser; the tree builder is always the lxml one.
        """
        # The tree builder is fixed; callers can only influence the
        # remaining html5lib options.
        _HTMLParser.__init__(self, tree=TreeBuilder, strict=strict, **kwargs)

# XHTMLParser is optional: it is only defined (together with the shared
# ``xhtml_parser`` instance) when the installed html5lib provides it.
try:
    from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
    pass
else:
    class XHTMLParser(_XHTMLParser):
        """html5lib's XHTML parser, preconfigured to build lxml trees."""

        def __init__(self, strict=False, **kwargs):
            """Create the parser.

            ``strict`` and any additional keyword arguments are forwarded
            to the html5lib parser; the tree builder is always the lxml one.
            """
            _XHTMLParser.__init__(
                self, tree=TreeBuilder, strict=strict, **kwargs)

    # Ready-to-use module-level XHTML parser instance.
    xhtml_parser = XHTMLParser()

def _find_tag(tree, tag):
    """Return the first child of ``tree`` matching ``tag``.

    The plain tag name is tried first; if nothing matches, the lookup is
    repeated with the tag qualified by the XHTML namespace.  Returns
    whatever ``tree.find`` returns for the last lookup attempted (``None``
    when neither form matches).
    """
    found = tree.find(tag)
    if found is None:
        # Fall back to the namespaced spelling, e.g. '{<xhtml-ns>}head'.
        found = tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
    return found

53 54

def document_fromstring(html, guess_charset=None, parser=None):
    """
    Parse a whole HTML document from a string and return its root element.

    ``html`` must be a text or byte string, otherwise ``TypeError`` is
    raised.  ``parser`` defaults to the module-level ``html_parser``.

    If `guess_charset` is true, or if it is left as ``None`` and the
    input is a byte string, html5lib is asked to detect the encoding
    (``useChardet``).  When `guess_charset` is ``None`` and the input is
    not a byte string, no ``useChardet`` option is passed at all.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    # Byte input with no explicit preference: turn charset detection on.
    if guess_charset is None and isinstance(html, bytes):
        guess_charset = True

    # html5lib does not accept useChardet for input that would produce
    # unicode objects, so the option is only passed when it has a value.
    if guess_charset is None:
        tree = parser.parse(html)
    else:
        tree = parser.parse(html, useChardet=guess_charset)
    return tree.getroot()

77 78

def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=None, parser=None):
    """Parse several HTML elements and return them as a list.

    The first item of the list may be a string (leading text).  If
    ``no_leading_text`` is true, non-whitespace leading text raises
    ``etree.ParserError`` and whitespace-only leading text is dropped, so
    the result then contains elements only.

    ``html`` must be a text or byte string, otherwise ``TypeError`` is
    raised.  ``parser`` defaults to the module-level ``html_parser``.

    If `guess_charset` is true, html5lib is asked to detect the encoding
    (``useChardet``).  For byte input with `guess_charset` left as
    ``None``, detection is explicitly switched off.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    # Byte input with no explicit preference: turn charset detection off.
    if guess_charset is None and isinstance(html, bytes):
        guess_charset = False

    # html5lib does not accept useChardet for input that would produce
    # unicode objects, so the option is only passed when it has a value.
    if guess_charset is None:
        children = parser.parseFragment(html, 'div')
    else:
        children = parser.parseFragment(
            html, 'div', useChardet=guess_charset)

    starts_with_text = bool(children) and isinstance(children[0], _strings)
    if starts_with_text and no_leading_text:
        leading = children[0]
        if leading.strip():
            raise etree.ParserError('There is leading text: %r' %
                                    leading)
        # Whitespace-only leading text is silently discarded.
        del children[0]
    return children

111 112

def fragment_fromstring(html, create_parent=False,
                        guess_charset=None, parser=None):
    """Parse a single HTML element.

    Without ``create_parent`` it is an error (``etree.ParserError``) if
    the input yields no element, more than one element, or anything but
    whitespace before or after the element.

    If ``create_parent`` is true (or is a tag name), a new parent element
    is created to hold the parsed content and is returned instead; the
    tag is ``create_parent`` when that is a string and ``'div'``
    otherwise.  Leading or trailing text is allowed in this mode.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.  ``TypeError`` is raised when ``html`` is not
    a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    # Leading text is only acceptable when it can be stored on a parent.
    elements = fragments_fromstring(
        html, guess_charset=guess_charset, parser=parser,
        no_leading_text=not create_parent)

    if not create_parent:
        if not elements:
            raise etree.ParserError('No elements found')
        if len(elements) > 1:
            raise etree.ParserError('Multiple elements found')
        element = elements[0]
        trailing = element.tail
        if trailing and trailing.strip():
            raise etree.ParserError(
                'Element followed by text: %r' % trailing)
        # Drop whitespace-only tail so the element stands alone.
        element.tail = None
        return element

    parent_tag = create_parent if isinstance(create_parent, _strings) else 'div'
    new_root = Element(parent_tag)
    if elements:
        if isinstance(elements[0], _strings):
            # Leading text becomes the text of the new parent element.
            new_root.text = elements[0]
            del elements[0]
        new_root.extend(elements)
    return new_root

155 156

def fromstring(html, guess_charset=None, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document:

    * input starting with ``<html`` or ``<!doctype`` (ignoring case and
      leading whitespace), or producing a non-empty ``<head>``, yields
      the parsed document root;
    * a body holding exactly one element and no surrounding
      non-whitespace text yields that element;
    * otherwise the body itself is returned, retagged as ``div`` when it
      contains block-level content and as ``span`` when it does not.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.  ``TypeError`` is raised when ``html`` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # Document starts with doctype or <html>: it is a full document.
    prefix = html[:50]
    if isinstance(prefix, bytes):
        # Allow text comparison in python3.  Decoding as ascii also
        # covers latin-1 and utf-8 for the characters we need.
        prefix = prefix.decode('ascii', 'replace')
    prefix = prefix.lstrip().lower()
    if prefix.startswith(('<html', '<!doctype')):
        return doc

    # A non-empty head also means we were given a full document.
    if len(_find_tag(doc, 'head')):
        return doc

    body = _find_tag(doc, 'body')

    # The body has just one element, so it was probably a single
    # element passed in.
    if len(body) == 1:
        only_child = body[0]
        leading = body.text
        trailing = only_child.tail
        if (not (leading and leading.strip())
                and not (trailing and trailing.strip())):
            return only_child

    # The body now stands for a bunch of tags holding the content that
    # was passed in.  Reuse it as a fake container, but rename it because
    # <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body

209 210

def parse(filename_url_or_file, guess_charset=None, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).

    ``parser`` defaults to the module-level ``html_parser``.

    A file or URL connection opened by this function is closed again
    before returning (also when parsing fails); a file-like object passed
    in by the caller is left open and remains the caller's responsibility.
    """
    if parser is None:
        parser = html_parser

    # Only handles opened here are closed here.
    opened_here = True
    if not isinstance(filename_url_or_file, _strings):
        fp = filename_url_or_file
        opened_here = False
        if guess_charset is None:
            # assume that file-like objects return Unicode more often
            # than bytes
            guess_charset = False
    elif _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
        if guess_charset is None:
            # assume that URLs return bytes
            guess_charset = True
    else:
        fp = open(filename_url_or_file, 'rb')
        if guess_charset is None:
            guess_charset = True

    options = {}
    # html5lib does not accept useChardet as an argument, if it
    # detected the html argument would produce unicode objects.
    if guess_charset:
        options['useChardet'] = guess_charset

    try:
        return parser.parse(fp, **options)
    finally:
        # Previously the opened file / URL response was never closed.
        if opened_here:
            fp.close()

245 246

def _looks_like_url(str):
    """Return True if ``str`` has a URL scheme, False otherwise.

    A string without a scheme is not a URL.  On Windows, a scheme that is
    a single ASCII letter is treated as the drive letter of an absolute
    path rather than as a URL scheme.
    """
    scheme = urlparse(str)[0]
    if not scheme:
        return False
    # e.g. 'c:\\dir\\file.html' parses with scheme 'c' -- a 'normal'
    # absolute path on Windows, not a URL.
    is_windows_drive = (sys.platform == 'win32' and
                        scheme in string.ascii_letters and
                        len(scheme) == 1)
    return not is_windows_drive

# Shared default parser instance: the parsing functions in this module
# fall back to it whenever their ``parser`` argument is None.
html_parser = HTMLParser()

Source Code for Module lxml.html.html5parser