""" Manipulate HTML or XHTML documents. Version 1.0.8. This source code has been placed in the public domain by Connelly Barnes. Features: - Translate HTML back and forth to data structures. This allows you to read and write HTML documents programmably, with much flexibility. - Extract and modify URLs in an HTML document. - Compatible with Python 2.0 - 2.4. See the L{examples} for a quick start. """ __version__ = '1.0.8' __all__ = ['examples', 'tagextract', 'tagjoin', 'urlextract', 'urljoin', 'URLMatch'] # Define True and False for Python < 2.2. import sys if sys.version_info[:3] < (2, 2, 0): exec "True = 1; False = 0" # ------------------------------------------------------------------- # Globals # ------------------------------------------------------------------- import re import shlex import string import urllib import urlparse import types # Translate text between these strings as plain text (not HTML). _IGNORE_TAGS = [('script', '/script'), ('style', '/style')] # Special tags where we have to look for _END_X as part of the # HTML/XHTML parsing rules. _BEGIN_COMMENT = '' _BEGIN_CDATA = '' # ------------------------------------------------------------------- # HTML <-> Data structure # ------------------------------------------------------------------- def tagextract(doc): """ Convert HTML to data structure. Returns a list. HTML tags become C{(name, keyword_dict)} tuples within the list, while plain text becomes strings within the list. All tag names are lowercased and stripped of whitespace. Tags which end with forward slashes have a single forward slash placed at the end of their name, to indicate that they are XML unclosed tags. Example: >>> tagextract('hifoo

') [('img', {'src': 'hi.gif', 'alt': 'hi'}), 'foo', ('br', {}), ('br/', {}), ('/body', {})] Text between C{'') [('script', {'type': 'a'}), 'var x; ', ('/script', {})] Comment strings and XML directives are rendered as a single long tag with no attributes. The case of the tag "name" is not changed: >>> tagextract('') [('!-- blah --', {})] >>> tagextract('') [('?xml version="1.0" encoding="utf-8" ?', {})] >>> tagextract('') [('!DOCTYPE html PUBLIC etc...', {})] Greater-than and less-than characters occuring inside comments or CDATA blocks are correctly kept as part of the block: >>> tagextract('') [('!-- <><><><>>..> --', {})] >>> tagextract('<>><>]<> ]]>') [('!CDATA[[><>><>]<> ]]', {})] Note that if one modifies these tags, it is important to retain the C{"--"} (for comments) or C{"]]"} (for C{CDATA}) at the end of the tag name, so that output from L{tagjoin} will be correct HTML/XHTML. """ L = _full_tag_extract(doc) for i in range(len(L)): if isinstance(L[i], _TextTag): # _TextTag object. L[i] = L[i].text else: # _HTMLTag object. L[i] = (L[i].name, L[i].attrs) return L def tagjoin(L): """ Convert data structure back to HTML. This reverses the L{tagextract} function. More precisely, if an HTML string is turned into a data structure, then back into HTML, the resulting string will be functionally equivalent to the original HTML. >>> tagjoin(tagextract(s)) (string that is functionally equivalent to s) Three changes are made to the HTML by L{tagjoin}: tags are lowercased, C{key=value} pairs are sorted, and values are placed in double-quotes. """ if not isinstance(L, types.ListType): raise ValueError('expected list argument') ans = [] for item in L: if isinstance(item, types.StringType): # Handle plain text. ans.append(item) elif item[0] == '--': # Handle closing comment. ans.append('-->') elif item[0] == '!--': # Handle opening comment. ans.append('' + ... ' end') [' blah', '', '', ' ', '', 'end'] """ s_lower = s.lower() L = [] i = 0 # Index of char being processed while i < len(s): c = s[i] if c == '<': # Left bracket, handle various cases. if s[i:i+len(_BEGIN_COMMENT)].startswith(_BEGIN_COMMENT): # HTML begin comment tag, ''. i2 = s.find(_END_COMMENT, i) if i2 < 0: # No '-->'. Append the remaining malformed content and stop. L.append(s[i:]) break else: # Append the comment. L.append(s[i:i2+len(_END_COMMENT)]) i = i2 + len(_END_COMMENT) elif s[i:i+len(_BEGIN_CDATA)].startswith(_BEGIN_CDATA): # XHTML begin CDATA tag. Scan for ']]>'. i2 = s.find(_END_CDATA, i) if i2 < 0: # No ']]>'. Append the remaining malformed content and stop. L.append(s[i:]) break else: # Append the CDATA. L.append(s[i:i2+len(_END_CDATA)]) i = i2 + len(_END_CDATA) else: # Regular HTML tag. Scan for '>'. orig_i = i i2 = s.find('>', i + 1) if i2 < 0: # No end '>'. Append the rest as text. L.append(s[i:]) break else: # Append the tag. L.append(s[i:i2+1]) i = i2 + 1 # Check whether we found a special ignore tag, eg '' doc3 = '\r\t< html >< tag> ' + \ '' doc4 = '' + \ ' ] ][]]>' + \ '' # ----------------------------------------------------------------- # Test _html_split() # ----------------------------------------------------------------- s = doc1 assert s == ''.join(_html_split(s)) assert _html_split(s) == \ ['\n\n', '', '', 'Hi', '

', 'Ho', \ '

', '
', '
', '', \ '', '', '', '', \ '\nBye!\n'] s = doc2 assert s == ''.join(_html_split(s)) # Test single quotes s = doc2.replace('"', "'") assert s == ''.join(_html_split(s)) s = '

Header' + \ '

' assert s == ''.join(_html_split(s)) assert _html_split(s) == \ ['', ' ', \ '

', 'Header', '

'] s = ' blah ok whata' assert s == ''.join(_html_split(s)) assert _html_split(s) == \ ['', ' blah ok ', '', \ ' what', '', \ '', 'a'] s = '! -' + \ '' assert s == ''.join(_html_split(s)) assert _html_split(s) == \ ['', '!', '', '', \ ' ', '', ' ', '', \ ' ', '', '-', '', \ ''] s = doc4 assert s == ''.join(_html_split(s)) assert _html_split(s) == \ ['', '', '', '', \ ' ] ][]]>', '', \ '', 'Hi

Ho


' + \ '
' + \ '\nBye!\n' assert tagjoin(tagextract(s)) == s2 doc2old = doc2 doc2 = '\r' + \ '' + \ 'end '+ \ '' assert doc2old == doc2 s = doc2 assert tagextract(s) == \ ['\r', ('html', {}), ('!-- Comment
--', {}), \ ('hiya', {}), ('foo', {}), \ ('test', {'content': '6', 'tag': '5'}), \ ('is', {'broken': 'False'}), ('yay', {}), ('style', {}), '<><>><', \ ('/style', {}), ('foo', {'bar': '5'}), 'end', \ ('!-- !_-', {}), \ ('/script', {})] assert tagjoin(tagextract(s)) == \ '\rend ' + \ '' s = doc5 assert tagextract(s) == \ [('a', {'href':'foobar/ \t=', 'base':'10', 'x':'15'}), \ ('a', {'x':'9', 't':'20'})] assert tagjoin(tagextract(s)) == \ '' # ----------------------------------------------------------------- # Test _full_tag_extract() # ----------------------------------------------------------------- for s in [doc1, doc2, doc3, doc1.replace('"', "'"), doc2.replace('"', "'"), doc3.replace('"', "'")]: L = _full_tag_extract(s) for (i, item) in _enumerate(L): if isinstance(item, _HTMLTag): for key in item.attrs.keys(): assert s[item.key_pos[key][0]:item.key_pos[key][1]].lower()\ == key if item.attrs[key] != None: assert s[item.value_pos[key][0]:item.value_pos[key][1]] \ == item.attrs[key] n = 1000 doc4 = ''*n L = tagextract(doc4) assert len(L) == n for i in range(n): assert L[i] == ('tag/',{'name':'5','value':'6afdjherknc4 cdk j', \ 'a':'7', 'b':'8'}) # ----------------------------------------------------------------- # Test tagextract() and tagjoin() with XML directives. # ----------------------------------------------------------------- doc1 = \ 'a' + \ 'bc' + \ '' + \ 'zrx' + \ 'tt' doc1join = \ 'abczrxtt' ans1 = \ ['a', ('?xml version="1.0"?', {}), 'b', \ ('!DOCTYPE html' + \ 'PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"' + \ '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"', {}),\ 'c', ('html', {'a':'b'}), ('!-- Comment <><> hi! --', {}), 'z', \ ('![CDATA[ some content ]]', {}), 'rx', \ ('![C[DATA[ more and weirder ] ][]]', {}), 'tt'] assert tagextract('') == \ [('?xml version="1.0" encoding="utf-8" ?', {})] assert tagextract('') == \ [('!DOCTYPE html PUBLIC etc...', {})] assert tagextract(doc1) == ans1 assert tagjoin(tagextract(doc1)) == doc1join # ------------------------------------------------------------------- # Unit Tests: URL Parsing # ------------------------------------------------------------------- def _test_urlextract(): """ Unit tests for L{urlextract} and L{urljoin}. """ doc1 = 'urlblah, url ( blah2, url( blah3) url(blah4) ' + \ 'url("blah5") hum("blah6") url)"blah7"( url ( " blah8 " );;' doc2 = 'b' + \ 'http://www.ignore.us/' + \ '\nhttp://www.nowhere.com c' doc3 = '@import foo;\n@import bar\n@import url(\'foo2\');' + \ '@import url(\'http://bar2\')\n@import\turl("foo!");' + \ '@import \'foo3\'\n@import "bar3";\n@importfails;' + \ '@import;@import\n;url(\'howdy!\')\n@import foo5 ;' + \ '@import \'foo6\' \n@import "foo7";' doc4 = '@import foo handheld;\n@import \'bar\' handheld\n' + \ '@import url(\'foo2\') handheld; @import url(bar2) ha\n' + \ '@import url("foo3") handheld\n' doc5 = 'b' + \ '' doc6 = doc2.replace('"', "'") # Test single quotes, too. # Test CSS. s = doc1 L = urlextract(s, mimetype='text/css') L2 = [x.url for x in L] assert L2 == [' blah3', 'blah4', 'blah5', ' blah8 '] assert [s[x.start:x.end] == x.url for x in L].count(False) == 0 # Test CSS more. s = doc3 L = urlextract(s, mimetype='text/css') L2 = [x.url for x in L] assert L2 == ['foo', 'bar', 'foo2', 'http://bar2', 'foo!', \ 'foo3', 'bar3', 'howdy!', 'foo5', 'foo6', 'foo7'] assert [s[x.start:x.end] == x.url for x in L].count(False) == 0 # Test CSS even more. s = doc4 L = urlextract(s, mimetype='text/css') L2 = [x.url for x in L] assert L2 == ['foo', 'bar', 'foo2', 'bar2', 'foo3'] assert [s[x.start:x.end] == x.url for x in L].count(False) == 0 # Test HTML. s = doc2 L = urlextract(s) L2 = [x.url for x in L] L3 = [x.url for x in urlextract(doc6)] ans = ['a.gif', 'b.html', './c.png', \ 'http://www.abc.edu/d.tga', 'h.gif', \ 'http://www.testdomain.com/', 'a.gif', '/i.png'] assert L2 == L3 == ans for i in range(len(L)): assert s[L[i].start:L[i].end] == L[i].url # Test HTML more. n = 100 s2 = s * n L3 = urlextract(s2) L4 = [x.url for x in L3] assert L4 == L2 * n for i in range(len(L3)): assert s2[L3[i].start:L3[i].end] == L3[i].url # Test HTML w/ siteurl. base = 'http://www.python.org/~guido/' L = urlextract(s, base) L2 = [x.url for x in L] assert L2 == [urlparse.urljoin(base, x) for x in ans] # Test urljoin(). assert urljoin(doc1, urlextract(doc1, mimetype='text/css')) == doc1 assert urljoin(doc2, urlextract(doc2)) == doc2 s = doc2 L = urlextract(s) L[3].url = 'FOO' L[5].url = 'BAR' L[7].url = 'F00!' assert urljoin(s, L) == \ 'b' + \ '' + \ 'http://www.ignore.us/\nhttp://www.nowhere.com ' + \ '' + \ 'c' # Test HTML yet more. s = doc5 L = urlextract(s) L2 = [x.url for x in L] assert L2 == ['foo', 'a.gif', 'bar.css', 'b.html'] assert [s[x.start:x.end] == x.url for x in L].count(False) == 0 # ------------------------------------------------------------------- # Unit Test Main Routine # ------------------------------------------------------------------- def _test(): """ Unit test main routine. """ print 'Unit tests:' _test_remove_comments() print ' _remove_comments: OK' _test_shlex_split() print ' _shlex_split: OK' _test_tag_dict() print ' _tag_dict: OK' _test_tuple_replace() print ' _tuple_replace: OK' _test_tagextract() print ' tagextract: OK' print ' tagjoin: OK' _test_urlextract() print ' urlextract: OK' print ' urljoin: OK' if __name__ == '__main__': _test()