""" Manipulate HTML documents via data structure. Version 1.0.0. This source code has been placed in the public domain by Connelly Barnes. """ import shlex import string import urllib # Translate text between these strings as plain text (not HTML). IGNORE_TAGS = [('script', '/script'), ('style', '/style'), ('pre', '/pre')] BEGIN_COMMENT = '' def _ignore_tag_index(s, i): """ Find index within IGNORE_TAGS, or -1. If s[i:] begins with an opening tag from IGNORE_TAGS, return the index. Otherwise, return -1. """ for (j, (a, b)) in enumerate(IGNORE_TAGS): if s[i:i+len(a)+1].lower() == '<' + a: return j return -1 def _html_split(s): """ Split string 's' into a list 'L' of tags and non-tags. >>> html_split(' blah more ') [' blah ', '', ' more ', '', ' '] Tags begin with '<' and end with '>'. Also, ''.join(L) == s. Special exceptions: 'end') [' blah', '

', '<<<><>', '

', '',\ ' ', '', 'end'] """ s_lower = s.lower() L = [] i = 0 # Index of char being processed while i < len(s): c = s[i] if c == '<': # Left bracket, handle various cases. if s[i:i+len(BEGIN_COMMENT)].startswith(BEGIN_COMMENT): # HTML begin comment tag, ''. L.append(BEGIN_COMMENT) i += len(BEGIN_COMMENT) i2 = s.find(END_COMMENT, i) if i2 < 0: # No '-->'. Append the rest as text. L.append(s[i:]) break else: # Append the comment text. L.append(s[i:i2]) # Then append the '-->' as a tag. L.append(s[i2:i2+len(END_COMMENT)]) i = i2 + len(END_COMMENT) else: # Regular HTML tag. Scan for '>'. orig_i = i i2 = s.find('>', i + 1) if i2 < 0: # No end '>'. Append the rest as text. L.append(s[i:]) break else: # Append the tag. L.append(s[i:i2+1]) i = i2 + 1 # Check whether we processed a special ignore tag, eg '

'
        tagi = _ignore_tag_index(s, orig_i)
        if tagi >= 0:
          # It's an ignore tag.  Scan for the end tag.
          i2 = s_lower.find('<' + IGNORE_TAGS[tagi][1], i)
          if i2 < 0:
            # No end tag.  Append the rest as text.
            L.append(s[i2:])
            break
          else:
            # Append the text sandwiched between the tags.
            L.append(s[i:i2])
            # Catch the closing tag with the next loop iteration.
            i = i2
    else:
      # Not a left bracket, append text up to next left bracket.
      i2 = s.find('<', i)
      if i2 < 0:
        # No left brackets, append the rest as text.
        L.append(s[i:])
        break
      else:
        L.append(s[i:i2])
      i = i2

  return L

def _tag_dict(s):
  """
  Extracts dict from an HTML tag string.

  >>> _tag_dict('bgcolor=#ffffff text="#000000" blink')
  {'bgcolor':'#ffffff', 'text':'#000000', 'blink': None}

  Encoded %XX hex codes in the values are unescaped.  Names
  are lowercased.

  Raises ValueError for unmatched quotes and other errors.
  """
  d = shlex.split(s)
  ans = {}
  for item in d:
    equals = item.find('=')
    if equals >= 0:
      (key, value) = (item[:equals].lower(), item[equals+1:])
      value = urllib.unquote(value)
      ans[key] = value
    else:
      ans[item.lower()] = None
  return ans

def loads(s):
  """
  Load an HTML string into a data structure.

  Returns a list.  HTML tags become (name, keyword_dict) tuples
  within the list, while plain text becomes strings within the
  list.  All tag names are lowercased and stripped of whitespace.
  Tags which end with forward slashes have a single forward slash
  placed at the end of their name, to indicate that they are XML
  unclosed tags.

  Example:

  >>> loads('abc<body bgcolor=#ffffff>Hi<h1>Ho</h1><br>a<br />Bye!')
  ['abc', ('body', {'bgcolor': '#ffffff'}), 'Hi', ('h1', {}),
  'Ho', ('/h1', {}), ('br', {}), 'a', ('br/', {}), 'Bye!']

  Text between the opening and closing tags listed in IGNORE_TAGS
  (script, style, pre) is rendered as plain text, even if it looks
  like a tag.

  Example:

  >>> loads('<script language=Javascript>var x; </script>')
  [('script', {'language': 'Javascript'}), 'var x; ',
   ('/script', {})]

  Text inside the comment strings '<!--' and '-->' is also rendered
  as plain text.  The opening and closing comments are translated
  into ('!--', {}) and ('--', {}), respectively.

  Example:

  >>> loads('<!-- blah -->')
  [('!--', {}), ' blah ', ('--', {})]

  A tag whose attributes cannot be parsed (_tag_dict raises
  ValueError, eg for unmatched quotes) is left in the list as its
  original string.

  If an HTML string is turned into a data structure, then back into
  HTML, the resulting string will be functionally equivalent to the
  original HTML.

  >>> dumps(loads(s))
  (string that is functionally equivalent to s)

  Three changes are made to the HTML by dumps(): tags are lowercased,
  key=value pairs are sorted, and values are placed in double-quotes.

  """
  L = _html_split(s)

  # True when the next item is the verbatim body of a comment or of a
  # script/style/pre element.  _html_split() emits exactly one such
  # item after each of those openers, and it must stay plain text.
  raw_next = False

  for (i, text) in enumerate(L):
    if raw_next:
      raw_next = False
      continue

    # Is it an HTML tag?
    is_comment = (text == BEGIN_COMMENT or text == END_COMMENT)
    is_tag = (len(text) >= 2 and text[0] == '<' and text[-1] == '>')
    if not (is_tag or is_comment):
      # Leave non-HTML strings as they are.
      continue

    # Decide this from the raw text, so the body is still skipped
    # even if the tag's attributes fail to parse below.
    raw_next = (text == BEGIN_COMMENT or _ignore_tag_index(text, 0) >= 0)

    # Strip brackets and handle what's left.
    text = text.strip('<>')
    rslash = text.endswith('/')
    if rslash:
      text = text[:-1]

    # Split the tag name from the attributes at the first whitespace
    # character of any kind (tags are often broken across lines).
    first_space = -1
    for (k, ch) in enumerate(text):
      if ch.isspace():
        first_space = k
        break
    if first_space < 0:
      (name, dtext) = (text, '')
    else:
      (name, dtext) = (text[:first_space], text[first_space+1:])

    name = name.strip().lower()
    if rslash:
      name += '/'

    try:
      d = _tag_dict(dtext.strip())
    except ValueError:
      # Malformed attributes.  Leave the original string in place.
      continue
    L[i] = (name, d)
  return L

def dumps(L):
  """
  Dump an HTML data structure into an HTML string.

  This reverses the loads() function.
  """
  ans = []
  for item in L:
    if isinstance(item, str):
      # Handle plain text.
      ans.append(item)
    elif item[0] == '--':
      # Handle closing comment.
      ans.append('-->')
    elif item[0] == '!--':
      # Handle opening comment.
      ans.append('' +      \
         ''     +      \
         '<><>><
end ' +    \
         ''

  # -----------------------------------------------------------------
  # Test _html_split()
  # -----------------------------------------------------------------

  s = doc1
  assert s == ''.join(_html_split(s))
  assert _html_split(s) ==                                           \
  ['\n\n', '', '', 'Hi', '', 'Ho',   \
   '
', '
', '
', '',             \
   '', '', '', '',      \
   '\nBye!\n']

  s = doc2
  assert s == ''.join(_html_split(s))

  s = ' Header' +       \
      ''
  assert s == ''.join(_html_split(s))
  assert _html_split(s) ==                                           \
  ['', ' ',         \
   '', 'Header', '']

  s = ' blah ok  whathi' +   \
      '<><>>
a'
  assert s == ''.join(_html_split(s))
  assert _html_split(s) ==                                           \
  ['', ' blah ok ', '',   \
   ' what', '', 'hi<><>>', '',                            \
   '', 'a']

  s = '!   -'
  assert s == ''.join(_html_split(s))
  assert _html_split(s) ==                                           \
  ['', '!', '', '',    \
   ' ', '', '', ' ', '',      \
   ' ', '', '-', '',     \
   '']

  # -----------------------------------------------------------------
  # Test loads() and dumps()
  # -----------------------------------------------------------------

  s = doc1
  assert loads('') == []
  assert loads(s) ==                                                 \
         ['\n\n', ('html', {}), ('body', {'bgcolor': '#ffffff'}),    \
          'Hi', ('h1', {}), 'Ho', ('/h1', {}), ('br', {}),           \
          ('br/', {}), ('img', {'src': 'text_.gif'}),                \
          ('tag', {'noshow': None}), ('img/', {'test': '5\xff'}),    \
          ('/body', {}), ('/html', {}), '\nBye!\n']
  s2 = '\n\nHiHo

' +     \
       '
' +                   \
       '\nBye!\n'
  assert dumps(loads(s)) == s2


  doc2 = '\r' +      \
         ''     +      \
         '<><>><
end ' +    \
         ''

  s = doc2
  assert loads(s) ==                                                 \
  ['\r', ('html', {}), ('!--', {}), ' Comment ',      \
  ('--', {}), ('hiya', {}), ('foo', {}),                             \
  ('test', {'content': '6', 'tag': '5'}),                            \
  ('is', {'broken': 'False'}), ('yay', {}), ('pre', {}), '<><>><',   \
  ('/pre', {}), ('foo', {'bar': '5'}), 'end', ('!--', {}),           \
  ' !_-', {}),    \
  ('/script', {})]

  assert dumps(loads(s)) ==                                          \
  '\r<><>><' +  \
  'end '                         +  \
  ''

  print 'Unit test passed.'

if __name__ == '__main__':
  test()

Ho

', 'Ho', \ '

Header' + \ '

', 'Header', '

Ho