A cleanup tool for HTML.
character
|
unichr(i)
Return a string of one character with ordinal i; 0 <= i < 256. |
|
|
|
_substitute_whitespace(...)
sub(repl, string[, count = 0]) --> newstring
Return the string obtained by replacing the leftmost non-overlapping
occurrences of pattern in string by the replacement repl. |
source code
|
|
|
|
|
autolink(el,
link_regexes=[re.compile(r'(?i)(?P<body>https?://(?P<host>[a-z0-9\._-]+)(?:...,
avoid_elements=['textarea', 'pre', 'code', 'head', 'select', 'a'],
avoid_hosts=[re.compile(r'(?i)^localhost'), re.compile(r'(?i)\bexample\.(?...,
avoid_classes=['nolink'])
Turn any URLs into links. |
source code
|
|
|
_link_text(text,
link_regexes,
avoid_hosts,
factory) |
source code
|
|
|
|
|
word_break(el,
max_width=40,
avoid_elements=['pre', 'textarea', 'code'],
avoid_classes=['nobreak'],
break_character=u' ')
Breaks any long words found in the body of the text (not attributes). |
source code
|
|
|
|
|
_break_text(text,
max_width,
break_character) |
source code
|
|
|
_insert_break(word,
width,
break_character) |
source code
|
|
|
basestring = str, bytes
|
|
_css_javascript_re = re.compile(r'(?is)expression\s*\(.*?\)')
|
|
_css_import_re = re.compile(r'(?i)@\s*import')
|
|
_javascript_scheme_re = re.compile(r'(?i)\s*(?:javascript|jscr...
|
|
_conditional_comment_re = re.compile(r'(?is)\[if[\s\n\r]+.*?\]...
|
|
_find_styled_elements = descendant-or-self::*[@style]
|
|
_find_external_links = descendant-or-self::a [normalize-space...
|
|
clean = Cleaner()
|
|
_link_regexes = [re.compile(r'(?i)(?P<body>https?://(?P<host>[...
|
|
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select'...
|
|
_avoid_hosts = [re.compile(r'(?i)^localhost'), re.compile(r'(?...
|
|
_avoid_classes = ['nolink']
|
|
_avoid_word_break_elements = ['pre', 'textarea', 'code']
|
|
_avoid_word_break_classes = ['nobreak']
|
|
_break_prefer_re = re.compile(r'(?i)[^a-z]')
|
|
__package__ = 'lxml.html'
|