""" Manipulate HTML or XHTML documents. Version 1.0.7. This source code has been placed in the public domain by Connelly Barnes. Features: - Translate HTML back and forth to data structures. This allows you to read and write HTML documents programmably, with much flexibility. - Extract and modify URLs in an HTML document. - Compatible with Python 2.0 - 2.4. See the L{examples} for a quick start. """ __version__ = '1.0.7' __all__ = ['examples', 'tagextract', 'tagjoin', 'urlextract', 'urljoin', 'URLMatch'] # Define True and False for Python < 2.2. import sys if sys.version_info[:3] < (2, 2, 0): exec "True = 1; False = 0" # ------------------------------------------------------------------- # Globals # ------------------------------------------------------------------- import re import shlex import string import urllib import urlparse import types # Translate text between these strings as plain text (not HTML). _IGNORE_TAGS = [('script', '/script'), ('style', '/style')] # Special tags where we have to look for _END_X as part of the # HTML/XHTML parsing rules. _BEGIN_COMMENT = '' _BEGIN_CDATA = '' # ------------------------------------------------------------------- # HTML <-> Data structure # ------------------------------------------------------------------- def tagextract(doc): """ Convert HTML to data structure. Returns a list. HTML tags become C{(name, keyword_dict)} tuples within the list, while plain text becomes strings within the list. All tag names are lowercased and stripped of whitespace. Tags which end with forward slashes have a single forward slash placed at the end of their name, to indicate that they are XML unclosed tags. Example: >>> tagextract('

foo

') [('img', {'src': 'hi.gif', 'alt': 'hi'}), 'foo', ('br', {}), ('br/', {}), ('/body', {})] Text between C{'') [('script', {'type': 'a'}), 'var x; ', ('/script', {})] Comment strings and XML directives are rendered as a single long tag with no attributes. The case of the tag "name" is not changed: >>> tagextract('') [('!-- blah --', {})] >>> tagextract('') [('?xml version="1.0" encoding="utf-8" ?', {})] >>> tagextract('') [('!DOCTYPE html PUBLIC etc...', {})] Greater-than and less-than characters occuring inside comments or CDATA blocks are correctly kept as part of the block: >>> tagextract('') [('!-- <><><><>>..> --', {})] >>> tagextract('<>><>]<> ]]>') [('!CDATA[[><>><>]<> ]]', {})] Note that if one modifies these tags, it is important to retain the C{"--"} (for comments) or C{"]]"} (for C{CDATA}) at the end of the tag name, so that output from L{tagjoin} will be correct HTML/XHTML. """ L = _full_tag_extract(doc) for i in range(len(L)): if isinstance(L[i], _TextTag): # _TextTag object. L[i] = L[i].text else: # _HTMLTag object. L[i] = (L[i].name, L[i].attrs) return L def tagjoin(L): """ Convert data structure back to HTML. This reverses the L{tagextract} function. More precisely, if an HTML string is turned into a data structure, then back into HTML, the resulting string will be functionally equivalent to the original HTML. >>> tagjoin(tagextract(s)) (string that is functionally equivalent to s) Three changes are made to the HTML by L{tagjoin}: tags are lowercased, C{key=value} pairs are sorted, and values are placed in double-quotes. """ if not isinstance(L, types.ListType): raise ValueError('expected list argument') ans = [] for item in L: if isinstance(item, types.StringType): # Handle plain text. ans.append(item) elif item[0] == '--': # Handle closing comment. ans.append('-->') elif item[0] == '!--': # Handle opening comment. ans.append('' + ... ' end') [' blah', '', '', ' ', '', 'end'] """ s_lower = s.lower() L = [] i = 0 # Index of char being processed while i < len(s): c = s[i] if c == '<': # Left bracket, handle various cases. if s[i:i+len(_BEGIN_COMMENT)].startswith(_BEGIN_COMMENT): # HTML begin comment tag, ''. i2 = s.find(_END_COMMENT, i) if i2 < 0: # No '-->'. Append the remaining malformed content and stop. L.append(s[i:]) break else: # Append the comment. L.append(s[i:i2+len(_END_COMMENT)]) i = i2 + len(_END_COMMENT) elif s[i:i+len(_BEGIN_CDATA)].startswith(_BEGIN_CDATA): # XHTML begin CDATA tag. Scan for ']]>'. i2 = s.find(_END_CDATA, i) if i2 < 0: # No ']]>'. Append the remaining malformed content and stop. L.append(s[i:]) break else: # Append the CDATA. L.append(s[i:i2+len(_END_CDATA)]) i = i2 + len(_END_CDATA) else: # Regular HTML tag. Scan for '>'. orig_i = i i2 = s.find('>', i + 1) if i2 < 0: # No end '>'. Append the rest as text. L.append(s[i:]) break else: # Append the tag. L.append(s[i:i2+1]) i = i2 + 1 # Check whether we found a special ignore tag, eg '' doc3 = '\r\t< html >< tag> ' + \ '' doc4 = '' + \ ' ] ][]]>' + \ '' # ----------------------------------------------------------------- # Test _html_split() # ----------------------------------------------------------------- s = doc1 assert s == ''.join(_html_split(s)) assert _html_split(s) == \ ['\n\n', '', '', 'Hi', '