"""
Manipulate HTML or XHTML documents.
Version 1.0.7. This source code has been placed in the
public domain by Connelly Barnes.
Features:
- Translate HTML back and forth to data structures.
This allows you to read and write HTML documents
programmably, with much flexibility.
- Extract and modify URLs in an HTML document.
- Compatible with Python 2.0 - 2.4.
See the L{examples} for a quick start.
"""
__version__ = '1.0.7'
__all__ = ['examples', 'tagextract', 'tagjoin', 'urlextract',
'urljoin', 'URLMatch']
# Define True and False for Python < 2.2.
import sys
if sys.version_info[:3] < (2, 2, 0):
exec "True = 1; False = 0"
# -------------------------------------------------------------------
# Globals
# -------------------------------------------------------------------
import re
import shlex
import string
import urllib
import urlparse
import types
# Translate text between these strings as plain text (not HTML).
_IGNORE_TAGS = [('script', '/script'),
('style', '/style')]
# Special tags where we have to look for _END_X as part of the
# HTML/XHTML parsing rules.
_BEGIN_COMMENT = ''
_BEGIN_CDATA = ''
# -------------------------------------------------------------------
# HTML <-> Data structure
# -------------------------------------------------------------------
def tagextract(doc):
"""
Convert HTML to data structure.
Returns a list. HTML tags become C{(name, keyword_dict)} tuples
within the list, while plain text becomes strings within the
list. All tag names are lowercased and stripped of whitespace.
Tags which end with forward slashes have a single forward slash
placed at the end of their name, to indicate that they are XML
unclosed tags.
Example:
>>> tagextract('
foo