""" Manipulate HTML or XHTML documents. Version 1.0.2. This source code has been placed in the public domain by Connelly Barnes. Features: - Translate HTML back and forth to data structures. This allows you to read and write HTML documents programmably, with much flexibility. - Extract and modify URLs in an HTML document. """ # ------------------------------------------------------------------- # Globals # ------------------------------------------------------------------- import re import shlex import string import urllib import urlparse # Translate text between these strings as plain text (not HTML). _IGNORE_TAGS = [('script', '/script'), ('style', '/style')] _BEGIN_COMMENT = '' # ------------------------------------------------------------------- # HTML <-> Data structure # ------------------------------------------------------------------- def tagextract(doc): """ Convert HTML to data structure. Returns a list. HTML tags become (name, keyword_dict) tuples within the list, while plain text becomes strings within the list. All tag names are lowercased and stripped of whitespace. Tags which end with forward slashes have a single forward slash placed at the end of their name, to indicate that they are XML unclosed tags. Example: >>> tagextract('hifoo

') [('img', {'src': 'hi.gif', 'alt': 'hi'}), 'foo', ('br', {}), ('br/', {}), ('/body', {})] Text between '') [('script', {'type': 'a'}), 'var x; ', ('/script', {})] Text inside the comment strings '' is also rendered as plain text. Opening and closing comments are translated into ('!--', {}) and ('--', {}), respectively. Example: >>> tagextract('') ['!--', ' blah ', '--'] """ L = _full_tag_extract(doc) for i in range(len(L)): if isinstance(L[i], _TextTag): # _TextTag object. L[i] = L[i].text else: # _HTMLTag object. L[i] = (L[i].name, L[i].attrs) return L def tagjoin(L): """ Convert data structure back to HTML. This reverses the tagextract() function. More precisely, if an HTML string is turned into a data structure, then back into HTML, the resulting string will be functionally equivalent to the original HTML. >>> tagjoin(tagextract(s)) (string that is functionally equivalent to s) Three changes are made to the HTML by tagjoin(): tags are lowercased, key=value pairs are sorted, and values are placed in double-quotes. """ ans = [] for item in L: if isinstance(item, str): # Handle plain text. ans.append(item) elif item[0] == '--': # Handle closing comment. ans.append('-->') elif item[0] == '!--': # Handle opening comment. ans.append('' + \ ' end') [' blah', '', '', \ ' ', '', 'end'] """ s_lower = s.lower() L = [] i = 0 # Index of char being processed while i < len(s): c = s[i] if c == '<': # Left bracket, handle various cases. if s[i:i+len(_BEGIN_COMMENT)].startswith(_BEGIN_COMMENT): # HTML begin comment tag, ''. L.append(_BEGIN_COMMENT) i += len(_BEGIN_COMMENT) i2 = s.find(_END_COMMENT, i) if i2 < 0: # No '-->'. Append the rest as text. L.append(s[i:]) break else: # Append the comment text. L.append(s[i:i2]) # Then append the '-->' as a tag. L.append(s[i2:i2+len(_END_COMMENT)]) i = i2 + len(_END_COMMENT) else: # Regular HTML tag. Scan for '>'. orig_i = i i2 = s.find('>', i + 1) if i2 < 0: # No end '>'. Append the rest as text. L.append(s[i:]) break else: # Append the tag. L.append(s[i:i2+1]) i = i2 + 1 # Check whether we found a special ignore tag, eg '' doc3 = '\r\t< html >< tag> ' + \ '' # ----------------------------------------------------------------- # Test _html_split() # ----------------------------------------------------------------- s = doc1 assert s == ''.join(_html_split(s)) assert _html_split(s) == \ ['\n\n', '', '', 'Hi', '

', 'Ho', \ '

', '
', '
', '', \ '', '', '', '', \ '\nBye!\n'] s = doc2 assert s == ''.join(_html_split(s)) s = '

Header' + \ '

' assert s == ''.join(_html_split(s)) assert _html_split(s) == \ ['', ' ', \ '

', 'Header', '

'] s = ' blah ok whata' assert s == ''.join(_html_split(s)) assert _html_split(s) == \ ['', ' blah ok ', '', \ ' what', '', \ '', 'a'] s = '! -' + \ '' assert s == ''.join(_html_split(s)) assert _html_split(s) == \ ['', '!', '', '', \ ' ', '', ' ', '', \ ' ', '', '-', '', \ ''] # ----------------------------------------------------------------- # Test tagextract() and tagjoin() # ----------------------------------------------------------------- s = doc1 assert tagextract('') == [] assert tagextract(s) == \ ['\n\n', ('html', {}), ('body', {'bgcolor': '#ffffff'}), \ 'Hi', ('h1', {}), 'Ho', ('/h1', {}), ('br', {}), \ ('br/', {}), ('img', {'src': 'text%5f.gif'}), \ ('tag', {'noshow': None}), ('img/', {'test': '5%ff'}), \ ('/body', {}), ('/html', {}), '\nBye!\n'] s2 = '\n\nHi

Ho


' + \ '
' + \ '\nBye!\n' assert tagjoin(tagextract(s)) == s2 doc2old = doc2 doc2 = '\r' + \ '' + \ 'end '+ \ '' assert doc2old == doc2 # FIXME s = doc2 assert tagextract(s) == \ ['\r', ('html', {}), ('!--', {}), ' Comment ', \ ('--', {}), ('hiya', {}), ('foo', {}), \ ('test', {'content': '6', 'tag': '5'}), \ ('is', {'broken': 'False'}), ('yay', {}), ('style', {}), '<><>><', \ ('/style', {}), ('foo', {'bar': '5'}), 'end', ('!--', {}), \ ' !_-', {}), \ ('/script', {})] assert tagjoin(tagextract(s)) == \ '\rend ' + \ '' # ----------------------------------------------------------------- # Test _full_tag_extract() # ----------------------------------------------------------------- for s in [doc1, doc2, doc3]: L = _full_tag_extract(s) for (i, item) in enumerate(L): if isinstance(item, _HTMLTag): for key in item.attrs: assert s[item.key_pos[key][0]:item.key_pos[key][1]].lower()\ == key if item.attrs[key] != None: assert s[item.value_pos[key][0]:item.value_pos[key][1]] \ == item.attrs[key] n = 1000 doc4 = ''*n L = tagextract(doc4) assert len(L) == n for i in range(n): assert L[i] == ('tag/',{'name':'5','value':'6afdjherknc4 cdk j', \ 'a':'7', 'b':'8'}) # ------------------------------------------------------------------- # Unit Tests: URL Parsing # ------------------------------------------------------------------- def _test_urlextract(): """ Unit tests for urlextract() and urljoin(). """ doc1 = 'urlblah, url ( blah2, url( blah3) url(blah4) ' + \ 'url("blah5") hum("blah6") url)"blah7"( url ( " blah8 " );;' doc2 = 'b' + \ 'http://www.ignore.us/' + \ '\nhttp://www.nowhere.com c' # Test CSS. s = doc1 L = urlextract(s, mimetype='text/css') L2 = [x.url for x in L] assert L2 == [' blah3', 'blah4', 'blah5', ' blah8 '] # Test HTML. s = doc2 L = urlextract(s) L2 = [x.url for x in L] ans = ['a.gif', 'b.html', './c.png', \ 'http://www.abc.edu/d.tga', 'h.gif', \ 'http://www.testdomain.com/', 'a.gif', '/i.png'] assert L2 == ans for i in range(len(L)): assert s[L[i].start:L[i].end] == L[i].url # Test HTML more. n = 100 s2 = s * n L3 = urlextract(s2) L4 = [x.url for x in L3] assert L4 == L2 * n for i in range(len(L3)): assert s2[L3[i].start:L3[i].end] == L3[i].url # Test HTML w/ siteurl. base = 'http://www.python.org/~guido/' L = urlextract(s, base) L2 = [x.url for x in L] assert L2 == [urlparse.urljoin(base, x) for x in ans] # Test urljoin(). assert urljoin(doc1, urlextract(doc1, mimetype='text/css')) == doc1 assert urljoin(doc2, urlextract(doc2)) == doc2 s = doc2 L = urlextract(s) L[3].url = 'FOO' L[5].url = 'BAR' L[7].url = 'F00!' assert urljoin(s, L) == \ 'b' + \ '' + \ 'http://www.ignore.us/\nhttp://www.nowhere.com ' + \ '' + \ 'c' # ------------------------------------------------------------------- # Unit Test Main Routine # ------------------------------------------------------------------- def _test(): """ Unit test main routine. """ print 'Unit tests:' _test_shlex_split() print ' _shlex_split: OK' _test_tag_dict() print ' _tag_dict: OK' _test_tuple_replace() print ' _tuple_replace: OK' _test_tagextract() print ' tagextract: OK' print ' tagjoin: OK' _test_urlextract() print ' urlextract: OK' print ' urljoin: OK' if __name__ == '__main__': _test()