Package lxml :: Package html :: Package tests :: Module transform_feedparser_data
[hide private]
[frames | no frames]

Source Code for Module lxml.html.tests.transform_feedparser_data

  1  """ 
  2  This takes the feedparser tests from here: 
  3   
  4    http://feedparser.org/tests/wellformed/sanitize/ 
  5   
  6  and rewrites them to be easier to handle (not using the internal model 
  7  of feedparser).  The input format is:: 
  8   
  9    <!-- 
 10    Description: {description} 
 11    Expect: {expression} 
 12    --> 
 13    ... 
 14    <content ...>{content}</content> 
 15    ... 
 16   
 17  The Expect expression is checked for 
 18  ``entries[0]['content'][0]['value'] == {data}``. 
 19   
 20  The output format is:: 
 21   
 22    Description: {description} 
 23    Expect: {expression} (if data couldn't be parsed) 
 24    Options:  
 25   
 26    {content, unescaped} 
 27    ---------- 
 28    {data, unescaped, if found} 
 29   
 30  """ 
 31   
 32  import re 
 33  import os 
 34  import traceback 
 35   
 36  _desc_re = re.compile(r'\s*Description:\s*(.*)') 
 37  _expect_re = re.compile(r'\s*Expect:\s*(.*)') 
 38  _data_expect_re = re.compile(r"entries\[0\]\['[^']+'\](?:\[0\]\['value'\])?\s*==\s*(.*)") 
 39  _feed_data_expect_re = re.compile(r"feed\['[^']+'\]\s*==\s*(.*)") 
 40   
def parse_content(content):
    """Extract the headers, expected data and payload from one input file.

    ``content`` is the full text of an input file in the format described in
    the module docstring.

    Returns a dict with four keys:

    ``'Description'`` / ``'Expect'``
        The text following the corresponding header.
    ``'data'``
        The evaluated right-hand side of the Expect comparison, or ``None``
        when the expression matches neither known shape.
    ``'content'``
        The text inside the first payload element found (see the tag list
        below), with ``&lt;`` and ``&amp;`` unescaped unless the element is
        ``body``.

    Raises ``ValueError`` when a header or the payload element is missing.
    """
    desc_match = _desc_re.search(content)
    if desc_match is None:
        raise ValueError('No "Description:" header found')
    expect_match = _expect_re.search(content)
    if expect_match is None:
        raise ValueError('No "Expect:" header found')
    desc = desc_match.group(1)
    expect = expect_match.group(1)

    data = None
    for regex in (_data_expect_re, _feed_data_expect_re):
        match = regex.search(expect)
        if match:
            # SECURITY: eval() executes whatever expression the file contains.
            # The original author chose to trust the test data ("Icky, but
            # I'll trust it"); never run this over files from an untrusted
            # source.
            data = eval(match.group(1).strip())
            break

    # Candidate payload elements, tried in this order; the first hit wins.
    tags = ('content', 'summary', 'title', 'copyright', 'tagline', 'info',
            'subtitle', 'fullitem', 'body', 'description', 'content:encoded')
    found_tag = None
    payload = None
    for tag in tags:
        # The opening tag is matched lazily, the element text greedily, so the
        # capture runs up to the *last* closing tag of that name.
        match = re.search(r"<%s.*?>(.*)</%s>" % (tag, tag), content, re.S)
        if match:
            found_tag = tag
            payload = match.group(1)
            break
    if payload is None:
        # An explicit raise keeps working under "python -O", unlike assert.
        raise ValueError('No content element found')

    # Seems like body isn't quoted
    if found_tag != 'body':
        # '&amp;' is replaced last so that '&amp;lt;' ends up as '&lt;'.
        payload = payload.replace('&lt;', '<')
        payload = payload.replace('&amp;', '&')
        # FIXME: I should really do more unescaping...
    return {
        'Description': desc,
        'Expect': expect,
        'data': data,
        'content': payload,
    }
71
def serialize_content(d):
    """Render a parsed test case in the output format.

    ``d`` is a mapping with the keys ``'Description'``, ``'Expect'`` and
    ``'content'``, and optionally ``'data'``.

    Returns the header lines, a blank line and the content; when ``data`` is
    present and not ``None`` a ``----------`` separator line and the data are
    appended.
    """
    # Built line by line so the trailing space after "Options:" (as shown in
    # the module docstring's output format) stays visible and is not lost to
    # whitespace-stripping editors.
    text = (
        'Description: %(Description)s\n'
        'Expect: %(Expect)s\n'
        'Options: \n'
        '\n'
        '%(content)s\n'
    ) % d
    data = d.get('data')
    if data is None:
        return text
    # Wrap in a one-element tuple: with a bare tuple on the right-hand side,
    # "%" would treat its items as separate format arguments.
    return text + '----------\n%s' % (data,)
83
def translate_file(filename):
    """Convert one input file into a sibling file with a ``.data`` extension.

    The file is read, passed through ``parse_content`` and
    ``serialize_content``, and the result is written next to the input with
    its extension replaced by ``.data``.

    If parsing or serialising raises, the file name, the raw content, the
    traceback and a separator line are printed and nothing is written.

    Always returns ``None``.
    """
    # NOTE(review): the file is read in binary mode.  On Python 3 that yields
    # bytes, which the str-pattern regexes used for parsing would reject;
    # presumably this script targets Python 2 -- confirm before porting.
    with open(filename, 'rb') as f:
        c = f.read()
    try:
        output = serialize_content(parse_content(c))
    except Exception:
        # "except Exception" rather than a bare except, so KeyboardInterrupt
        # and SystemExit still propagate.  The single-argument print(...)
        # form behaves the same as the print statement on Python 2.
        print('Bad data in %s:' % filename)
        print(c)
        traceback.print_exc()
        print('-' * 60)
        return
    new = os.path.splitext(filename)[0] + '.data'
    with open(new, 'wb') as f:
        f.write(output)
100
def translate_all(dir):
    """Run ``translate_file`` on every ``.xml`` entry directly inside ``dir``.

    Entries are taken from ``os.listdir(dir)`` (no recursion); each one is
    joined onto ``dir`` and skipped unless the resulting path ends in
    ``.xml``.  Returns ``None``.
    """
    candidates = (os.path.join(dir, entry) for entry in os.listdir(dir))
    for path in candidates:
        if not path.endswith('.xml'):
            continue
        translate_file(path)
if __name__ == '__main__':
    # Script entry point: convert the files in the 'feedparser-data'
    # directory that sits next to this module.
    # NOTE(review): sys is imported here but not used in the visible code.
    import sys
    module_dir = os.path.dirname(__file__)
    data_dir = os.path.join(module_dir, 'feedparser-data')
    translate_all(data_dir)