"""
Manipulate HTML documents via data structure.
Version 1.0.0. This source code has been placed in the
public domain by Connelly Barnes.
"""
import shlex
import string
import urllib
# Text between each (opening, closing) tag-name pair below is kept as
# plain text (not parsed as HTML) when a document is split into tags.
IGNORE_TAGS = [('script', '/script'),
               ('style', '/style'),
               ('pre', '/pre')]

# Delimiters of an HTML comment.  The splitter emits each delimiter as
# its own list element and keeps the text between them as plain text.
BEGIN_COMMENT = '<!--'
END_COMMENT = '-->'
def _ignore_tag_index(s, i):
    """
    Find index within IGNORE_TAGS, or -1.

    If s[i:] begins with an opening tag from IGNORE_TAGS, return the
    index of that entry.  Otherwise, return -1.

    The comparison is case-insensitive.  The tag name must be complete:
    it has to be followed by whitespace, '/', '>' or the end of the
    string, so '<pre>' matches the 'pre' entry but '<preview>' does not.
    """
    for (j, (name, _closing_name)) in enumerate(IGNORE_TAGS):
        end = i + len(name) + 1          # +1 accounts for the leading '<'
        if s[i:end].lower() != '<' + name:
            continue
        # Reject longer tag names that merely start with an ignore tag.
        following = s[end:end + 1]
        if not following or following.isspace() or following in '/>':
            return j
    return -1
def _html_split(s):
"""
Split string 's' into a list 'L' of tags and non-tags.
>>> html_split(' blah
', '<<<><>', '', '',\ ' ', '', 'end'] """ s_lower = s.lower() L = [] i = 0 # Index of char being processed while i < len(s): c = s[i] if c == '<': # Left bracket, handle various cases. if s[i:i+len(BEGIN_COMMENT)].startswith(BEGIN_COMMENT): # HTML begin comment tag, ''. L.append(BEGIN_COMMENT) i += len(BEGIN_COMMENT) i2 = s.find(END_COMMENT, i) if i2 < 0: # No '-->'. Append the rest as text. L.append(s[i:]) break else: # Append the comment text. L.append(s[i:i2]) # Then append the '-->' as a tag. L.append(s[i2:i2+len(END_COMMENT)]) i = i2 + len(END_COMMENT) else: # Regular HTML tag. Scan for '>'. orig_i = i i2 = s.find('>', i + 1) if i2 < 0: # No end '>'. Append the rest as text. L.append(s[i:]) break else: # Append the tag. L.append(s[i:i2+1]) i = i2 + 1 # Check whether we processed a special ignore tag, eg '
' tagi = _ignore_tag_index(s, orig_i) if tagi >= 0: # It's an ignore tag. Scan for the end tag. i2 = s_lower.find('<' + IGNORE_TAGS[tagi][1], i) if i2 < 0: # No end tag. Append the rest as text. L.append(s[i2:]) break else: # Append the text sandwiched between the tags. L.append(s[i:i2]) # Catch the closing tag with the next loop iteration. i = i2 else: # Not a left bracket, append text up to next left bracket. i2 = s.find('<', i) if i2 < 0: # No left brackets, append the rest as text. L.append(s[i:]) break else: L.append(s[i:i2]) i = i2 return L def _tag_dict(s): """ Extracts dict from an HTML tag string. >>> _tag_dict('bgcolor=#ffffff text="#000000" blink') {'bgcolor':'#ffffff', 'text':'#000000', 'blink': None} Encoded %XX hex codes in the values are unescaped. Names are lowercased. Raises ValueError for unmatched quotes and other errors. """ d = shlex.split(s) ans = {} for item in d: equals = item.find('=') if equals >= 0: (key, value) = (item[:equals].lower(), item[equals+1:]) value = urllib.unquote(value) ans[key] = value else: ans[item.lower()] = None return ans def loads(s): """ Load an HTML string into a data structure. Returns a list. HTML tags become (name, keyword_dict) tuples within the list, while plain text becomes strings within the list. All tag names are lowercased and stripped of whitespace. Tags which end with forward slashes have a single forward slash placed at the end of their name, to indicate that they are XML unclosed tags. Example: >>> loads('abcHi' assert s == ''.join(_html_split(s)) assert _html_split(s) == \ ['', '!', '', '', \ ' ', 'Ho
a
Bye!') ['abc', ('body', {'bgcolor': '#ffffff'}), 'Hi', ('h1', {}), 'Ho', ('/h1', {}), ('br', {}), 'a', ('br/', {}), 'Bye!'] Text between '') [('script', {'language': 'Javascript'}), 'var x; ', ('/script', {})] Text inside the comment strings '' is also rendered as plain text. The opening and closing comments are translated into ('!--', {}) and ('--', {}), respectively. Example: >>> loads('') ['!--', ' blah ', '--'] If an HTML string is turned into a data structure, then back into HTML, the resulting string will be functionally equivalent to the original HTML. >>> dumps(loads(s)) (string that is functionally equivalent to s) Three changes are made to the HTML by dumps(): tags are lowercased, key=value pairs are sorted, and values are placed in double-quotes. """ L = _html_split(s) for (i, text) in enumerate(L): try: # Is it an HTML tag? is_tag = False if len(text) >= 2 and text[0] == '<' and text[-1] == '>': # Turn HTML tag text into (name, keyword_dict) tuple. is_tag = True elif text == BEGIN_COMMENT or text == END_COMMENT: is_tag = True if is_tag: # If an HTML tag, strip brackets and handle what's left. text = text.strip('<>') if len(text) > 0 and text[-1] == '/': rslash = True text = text[:-1] else: rslash = False first_space = text.find(' ') if first_space < 0: (name, dtext) = (text, '') else: name = text[:first_space] dtext = text[first_space+1:len(text)] name = name.strip().lower() if rslash: name += '/' dtext = dtext.strip() d = _tag_dict(dtext) L[i] = (name, d) else: # Not an HTML tag. raise ValueError except ValueError: # Leave non-HTML strings as they are. pass return L def dumps(L): """ Dump an HTML data structure into an HTML string. This reverses the loads() function. """ ans = [] for item in L: if isinstance(item, str): # Handle plain text. ans.append(item) elif item[0] == '--': # Handle closing comment. ans.append('-->') elif item[0] == '!--': # Handle opening comment. 
ans.append(' ' + \ ' ' + \ ' <><>><end ' + \ '' # ----------------------------------------------------------------- # Test _html_split() # ----------------------------------------------------------------- s = doc1 assert s == ''.join(_html_split(s)) assert _html_split(s) == \ ['\n\n', '', '', 'Hi', ' ', 'Ho', \ '
', '
', '
', '', \ '', '', '', '', \ '\nBye!\n'] s = doc2 assert s == ''.join(_html_split(s)) s = ' Header' + \ '
' assert s == ''.join(_html_split(s)) assert _html_split(s) == \ ['', ' ', \ '', 'Header', '
'] s = ' blah okwhat hi' + \ '<><>>a' assert s == ''.join(_html_split(s)) assert _html_split(s) == \ ['', ' blah ok ', '', \ ' what', ' ', 'hi<><>>', '', \ '', 'a'] s = '!-
', '', \ ''] # ----------------------------------------------------------------- # Test loads() and dumps() # ----------------------------------------------------------------- s = doc1 assert loads('') == [] assert loads(s) == \ ['\n\n', ('html', {}), ('body', {'bgcolor': '#ffffff'}), \ 'Hi', ('h1', {}), 'Ho', ('/h1', {}), ('br', {}), \ ('br/', {}), ('img', {'src': 'text_.gif'}), \ ('tag', {'noshow': None}), ('img/', {'test': '5\xff'}), \ ('/body', {}), ('/html', {}), '\nBye!\n'] s2 = '\n\nHi', ' ', ' ', \ ' ', '', '-', '
<><>><
<><>><' + \ '