"""
Manipulate HTML or XHTML documents.
Version 1.0.2. This source code has been placed in the
public domain by Connelly Barnes.
Features:
- Translate HTML back and forth to data structures.
This allows you to read and write HTML documents
programmatically, with much flexibility.
- Extract and modify URLs in an HTML document.
"""
# -------------------------------------------------------------------
# Globals
# -------------------------------------------------------------------
import re
import shlex
import string
import urllib
import urlparse
# Translate text between these strings as plain text (not HTML).
_IGNORE_TAGS = [('script', '/script'),
('style', '/style')]
_BEGIN_COMMENT = ''
# -------------------------------------------------------------------
# HTML <-> Data structure
# -------------------------------------------------------------------
def tagextract(doc):
"""
Convert HTML to data structure.
Returns a list. HTML tags become (name, keyword_dict) tuples
within the list, while plain text becomes strings within the
list. All tag names are lowercased and stripped of whitespace.
Tags which end with forward slashes have a single forward slash
placed at the end of their name, to indicate that they are XML
unclosed tags.
Example:
>>> tagextract('
foo