lxml.html.tests.transform_feedparser_data

"""
This takes the feedparser tests from here:

  http://feedparser.org/tests/wellformed/sanitize/

and rewrites them to be easier to handle (not using the internal model
of feedparser).  The input format is::

  <!--
  Description: {description}
  Expect: {expression}
  -->
  ...
  <content ...>{content}</content>
  ...

The Expect expression is checked for
``entries[0]['content'][0]['value'] == {data}``.

The output format is::

  Description: {description}
  Expect: {expression} (if data couldn't be parsed)
  Options:

  {content, unescaped}
  ----------
  {data, unescaped, if found}

"""

import re
import os
import traceback

# Captures the rest of the line following a "Description:" label.
_desc_re = re.compile(r'\s*Description:\s*(.*)')
# Captures the rest of the line following an "Expect:" label.
_expect_re = re.compile(r'\s*Expect:\s*(.*)')
# Matches ``entries[0]['<key>']``, optionally followed by ``[0]['value']``,
# then ``==``; captures the right-hand side of the comparison.
_data_expect_re = re.compile(r"entries\[0\]\['[^']+'\](?:\[0\]\['value'\])?\s*==\s*(.*)")
# Matches ``feed['<key>'] == ...``; captures the right-hand side.
_feed_data_expect_re = re.compile(r"feed\['[^']+'\]\s*==\s*(.*)")

def parse_content(content):
    """Pull the description, expectation, expected data and markup out of a test.

    ``content`` is the text of one feedparser test document.

    Returns a dict with these keys:

    ``'Description'``
        The text following the ``Description:`` label.
    ``'Expect'``
        The text following the ``Expect:`` label.
    ``'data'``
        The evaluated right-hand side of the Expect comparison, or ``None``
        when the expression matches neither of the known comparison patterns.
    ``'content'``
        The text inside the first candidate element that is found (the
        candidates are tried in the order listed below), with ``&lt;`` and
        ``&amp;`` unescaped unless the element is ``body``.

    Raises ``ValueError`` when the Description or Expect label is missing, or
    when none of the candidate elements is present.
    """
    match = _desc_re.search(content)
    if match is None:
        raise ValueError('No "Description:" line found')
    desc = match.group(1)
    match = _expect_re.search(content)
    if match is None:
        raise ValueError('No "Expect:" line found')
    expect = match.group(1)
    data = None
    for regex in [_data_expect_re, _feed_data_expect_re]:
        match = regex.search(expect)
        if match:
            # SECURITY: eval() executes whatever expression the test file
            # contains.  Icky, but the input is trusted test data; never
            # feed this function files from an untrusted source.
            data = eval(match.group(1).strip())
            break
    c = None
    for tag in ['content', 'summary', 'title', 'copyright', 'tagline',
                'info', 'subtitle', 'fullitem', 'body', 'description',
                'content:encoded']:
        regex = re.compile(r"<%s.*?>(.*)</%s>" % (tag, tag), re.S)
        match = regex.search(content)
        if match:
            c = match.group(1)
            break
    if c is None:
        # An explicit raise rather than an assert, so the check survives -O.
        raise ValueError('No content element found')
    # Seems like body isn't quoted
    if tag != 'body':
        # &amp; must be handled last so that "&amp;lt;" becomes "&lt;"
        # rather than "<".
        c = c.replace('&lt;', '<')
        c = c.replace('&amp;', '&')
    # FIXME: I should really do more unescaping...
    return {
        'Description': desc,
        'Expect': expect,
        'data': data,
        'content': c}

71

def serialize_content(d):
    """Render a parsed test as the text of a ``.data`` file.

    ``d`` must provide the keys ``'Description'``, ``'Expect'`` and
    ``'content'``.  When ``d`` has a ``'data'`` entry that is not ``None``,
    a ``----------`` separator line followed by that value is appended.

    Returns the rendered string.
    """
    header = (
        'Description: %(Description)s\n'
        'Expect: %(Expect)s\n'
        'Options:\n'
        '\n'
        '%(content)s\n'
    ) % d
    if d.get('data') is None:
        return header
    return header + '----------\n%s' % d['data']

83

def translate_file(filename):
    """Convert one feedparser test file into a ``.data`` file beside it.

    Reads ``filename``, runs it through ``parse_content`` and
    ``serialize_content``, and writes the result to a file with the same
    name but a ``.data`` extension.

    If parsing or serializing fails, the file name, the file contents and
    the traceback are printed, nothing is written, and the function returns
    ``None``.
    """
    f = open(filename, 'rb')
    try:
        c = f.read()
    finally:
        # Close the handle even when the read fails.
        f.close()
    try:
        output = serialize_content(parse_content(c))
    except Exception:
        # Not a bare ``except:``, so KeyboardInterrupt and SystemExit still
        # stop a batch run.  The parenthesised print form produces the same
        # output under the Python 2 print statement.
        print('Bad data in %s:' % filename)
        print(c)
        traceback.print_exc()
        print('-'*60)
        return
    new = os.path.splitext(filename)[0] + '.data'
    f = open(new, 'wb')
    try:
        f.write(output)
    finally:
        f.close()

100

def translate_all(dir):
    """Run ``translate_file`` on every ``.xml`` entry directly inside ``dir``.

    Subdirectories are not descended into.  The parameter keeps its original
    name ``dir`` so that keyword callers continue to work.
    """
    candidates = [os.path.join(dir, entry) for entry in os.listdir(dir)]
    for path in candidates:
        if not path.endswith('.xml'):
            continue
        translate_file(path)

if __name__ == '__main__':
    # When run as a script, convert the test files in the "feedparser-data"
    # directory located next to this module.
    data_dir = os.path.join(os.path.dirname(__file__), 'feedparser-data')
    translate_all(data_dir)

Source Code for Module lxml.html.tests.transform_feedparser_data