""" Manipulate HTML documents via data structure. Version 1.0.0. This source code has been placed in the public domain by Connelly Barnes. """ import shlex import string import urllib # Translate text between these strings as plain text (not HTML). IGNORE_TAGS = [('script', '/script'), ('style', '/style'), ('pre', '/pre')] BEGIN_COMMENT = '' def _ignore_tag_index(s, i): """ Find index within IGNORE_TAGS, or -1. If s[i:] begins with an opening tag from IGNORE_TAGS, return the index. Otherwise, return -1. """ for (j, (a, b)) in enumerate(IGNORE_TAGS): if s[i:i+len(a)+1].lower() == '<' + a: return j return -1 def _html_split(s): """ Split string 's' into a list 'L' of tags and non-tags. >>> html_split(' blah more ') [' blah ', '', ' more ', '', ' '] Tags begin with '<' and end with '>'. Also, ''.join(L) == s. Special exceptions: 'end') [' blah', '
', '<<<><>', '
', '',\ ' ', '', 'end'] """ s_lower = s.lower() L = [] i = 0 # Index of char being processed while i < len(s): c = s[i] if c == '<': # Left bracket, handle various cases. if s[i:i+len(BEGIN_COMMENT)].startswith(BEGIN_COMMENT): # HTML begin comment tag, ''. L.append(BEGIN_COMMENT) i += len(BEGIN_COMMENT) i2 = s.find(END_COMMENT, i) if i2 < 0: # No '-->'. Append the rest as text. L.append(s[i:]) break else: # Append the comment text. L.append(s[i:i2]) # Then append the '-->' as a tag. L.append(s[i2:i2+len(END_COMMENT)]) i = i2 + len(END_COMMENT) else: # Regular HTML tag. Scan for '>'. orig_i = i i2 = s.find('>', i + 1) if i2 < 0: # No end '>'. Append the rest as text. L.append(s[i:]) break else: # Append the tag. L.append(s[i:i2+1]) i = i2 + 1 # Check whether we processed a special ignore tag, eg '
'
        tagi = _ignore_tag_index(s, orig_i)
        if tagi >= 0:
          # It's an ignore tag.  Scan for the end tag.
          i2 = s_lower.find('<' + IGNORE_TAGS[tagi][1], i)
          if i2 < 0:
            # No end tag.  Append the rest as text.
            L.append(s[i2:])
            break
          else:
            # Append the text sandwiched between the tags.
            L.append(s[i:i2])
            # Catch the closing tag with the next loop iteration.
            i = i2
    else:
      # Not a left bracket, append text up to next left bracket.
      i2 = s.find('<', i)
      if i2 < 0:
        # No left brackets, append the rest as text.
        L.append(s[i:])
        break
      else:
        L.append(s[i:i2])
      i = i2

  return L

def _tag_dict(s):
  """
  Extracts dict from an HTML tag string.

  >>> _tag_dict('bgcolor=#ffffff text="#000000" blink')
  {'bgcolor':'#ffffff', 'text':'#000000', 'blink': None}

  Encoded %XX hex codes in the values are unescaped.  Names
  are lowercased.

  Raises ValueError for unmatched quotes and other errors.
  """
  d = shlex.split(s)
  ans = {}
  for item in d:
    equals = item.find('=')
    if equals >= 0:
      (key, value) = (item[:equals].lower(), item[equals+1:])
      value = urllib.unquote(value)
      ans[key] = value
    else:
      ans[item.lower()] = None
  return ans

def loads(s):
  """
  Load an HTML string into a data structure.

  Returns a list.  HTML tags become (name, keyword_dict) tuples
  within the list, while plain text becomes strings within the
  list.  All tag names are lowercased and stripped of whitespace.
  Tags which end with forward slashes have a single forward slash
  placed at the end of their name, to indicate that they are XML
  unclosed tags.

  Example:

  >>> loads('abcHi

Ho


a
Bye!') ['abc', ('body', {'bgcolor': '#ffffff'}), 'Hi', ('h1', {}), 'Ho', ('/h1', {}), ('br', {}), 'a', ('br/', {}), 'Bye!'] Text between '') [('script', {'language': 'Javascript'}), 'var x; ', ('/script', {})] Text inside the comment strings '' is also rendered as plain text. The opening and closing comments are translated into ('!--', {}) and ('--', {}), respectively. Example: >>> loads('') ['!--', ' blah ', '--'] If an HTML string is turned into a data structure, then back into HTML, the resulting string will be functionally equivalent to the original HTML. >>> dumps(loads(s)) (string that is functionally equivalent to s) Three changes are made to the HTML by dumps(): tags are lowercased, key=value pairs are sorted, and values are placed in double-quotes. """ L = _html_split(s) for (i, text) in enumerate(L): try: # Is it an HTML tag? is_tag = False if len(text) >= 2 and text[0] == '<' and text[-1] == '>': # Turn HTML tag text into (name, keyword_dict) tuple. is_tag = True elif text == BEGIN_COMMENT or text == END_COMMENT: is_tag = True if is_tag: # If an HTML tag, strip brackets and handle what's left. text = text.strip('<>') if len(text) > 0 and text[-1] == '/': rslash = True text = text[:-1] else: rslash = False first_space = text.find(' ') if first_space < 0: (name, dtext) = (text, '') else: name = text[:first_space] dtext = text[first_space+1:len(text)] name = name.strip().lower() if rslash: name += '/' dtext = dtext.strip() d = _tag_dict(dtext) L[i] = (name, d) else: # Not an HTML tag. raise ValueError except ValueError: # Leave non-HTML strings as they are. pass return L def dumps(L): """ Dump an HTML data structure into an HTML string. This reverses the loads() function. """ ans = [] for item in L: if isinstance(item, str): # Handle plain text. ans.append(item) elif item[0] == '--': # Handle closing comment. ans.append('-->') elif item[0] == '!--': # Handle opening comment. ans.append('' + \ '' + \ '
<><>><
end ' + \ '' # ----------------------------------------------------------------- # Test _html_split() # ----------------------------------------------------------------- s = doc1 assert s == ''.join(_html_split(s)) assert _html_split(s) == \ ['\n\n', '', '', 'Hi', '

', 'Ho', \ '

', '
', '
', '', \ '', '', '', '', \ '\nBye!\n'] s = doc2 assert s == ''.join(_html_split(s)) s = '

Header' + \ '

' assert s == ''.join(_html_split(s)) assert _html_split(s) == \ ['', ' ', \ '

', 'Header', '

'] s = ' blah ok what
hi' +   \
      '<><>>
a' assert s == ''.join(_html_split(s)) assert _html_split(s) == \ ['', ' blah ok ', '', \ ' what', '
', 'hi<><>>', '
', \ '', 'a'] s = '!
  -
' assert s == ''.join(_html_split(s)) assert _html_split(s) == \ ['', '!', '', '', \ ' ', '
', '', ' ', '',      \
   ' ', '', '-', '
', \ ''] # ----------------------------------------------------------------- # Test loads() and dumps() # ----------------------------------------------------------------- s = doc1 assert loads('') == [] assert loads(s) == \ ['\n\n', ('html', {}), ('body', {'bgcolor': '#ffffff'}), \ 'Hi', ('h1', {}), 'Ho', ('/h1', {}), ('br', {}), \ ('br/', {}), ('img', {'src': 'text_.gif'}), \ ('tag', {'noshow': None}), ('img/', {'test': '5\xff'}), \ ('/body', {}), ('/html', {}), '\nBye!\n'] s2 = '\n\nHi

Ho


' + \ '
' + \ '\nBye!\n' assert dumps(loads(s)) == s2 doc2 = '\r' + \ '' + \ '
<><>><
end ' + \ '' s = doc2 assert loads(s) == \ ['\r', ('html', {}), ('!--', {}), ' Comment ', \ ('--', {}), ('hiya', {}), ('foo', {}), \ ('test', {'content': '6', 'tag': '5'}), \ ('is', {'broken': 'False'}), ('yay', {}), ('pre', {}), '<><>><', \ ('/pre', {}), ('foo', {'bar': '5'}), 'end', ('!--', {}), \ ' !_-', {}), \ ('/script', {})] assert dumps(loads(s)) == \ '\r
<><>><
' + \ 'end ' + \ '' print 'Unit test passed.' if __name__ == '__main__': test()