diff --git a/NOTICE b/NOTICE
index a2f4aa20..8506656b 100644
--- a/NOTICE
+++ b/NOTICE
@@ -82,41 +82,6 @@ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 
-doc/sitemap_gen.py
-------------------
-Under BSD License :
-
-Copyright (c) 2004, 2005, Google Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
- * Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above
-   copyright notice, this list of conditions and the following
-   disclaimer in the documentation and/or other materials provided
-   with the distribution.
-
- * Neither the name of Google Inc. nor the names of its contributors
-   may be used to endorse or promote products derived from this
-   software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
 util/unlink.py
 --------------
diff --git a/docs/site/sitemap.xml b/docs/site/sitemap.xml
index 6a8e43a0..6411a24e 100644
--- a/docs/site/sitemap.xml
+++ b/docs/site/sitemap.xml
@@ -1,112 +1,73 @@
-
-
+
+
 http://gunicorn.org/
- 2010-07-01T05:14:22Z
- 0.5000
+ 2019-11-27T00:02:48+01:00
+ 1.0
+
+
+ http://gunicorn.org/community.html
+ 2012-10-04T00:43:15+05:45
+ 0.5
 http://gunicorn.org/configuration.html
- 2010-07-01T05:14:22Z
- 0.5000
+ 2012-10-04T00:43:15+05:45
+ 0.5
 http://gunicorn.org/configure.html
- 2010-07-01T05:14:22Z
- 0.5000
-
-
- http://gunicorn.org/css/
- 2010-07-01T05:14:22Z
- 0.5000
-
-
- http://gunicorn.org/css/index.css
- 2010-07-01T05:14:22Z
- 0.5000
-
-
- http://gunicorn.org/css/style.css
- 2010-07-01T05:14:22Z
- 0.5000
+ 2012-10-04T00:43:15+05:45
+ 0.5
 http://gunicorn.org/deploy.html
- 2010-07-01T05:14:22Z
- 0.5000
+ 2012-10-04T00:43:15+05:45
+ 0.5
 http://gunicorn.org/deployment.html
- 2010-07-01T05:14:22Z
- 0.5000
+ 2012-10-04T00:43:15+05:45
+ 0.5
 http://gunicorn.org/design.html
- 2010-07-01T05:14:22Z
- 0.5000
+ 2012-10-04T00:43:15+05:45
+ 0.5
 http://gunicorn.org/faq.html
- 2010-07-01T05:14:22Z
- 0.5000
-
-
- http://gunicorn.org/images/
- 2010-07-01T05:14:22Z
- 0.5000
-
-
- http://gunicorn.org/images/gunicorn.png
- 2010-07-01T05:14:22Z
- 0.5000
-
-
- http://gunicorn.org/images/large_gunicorn.png
- 2010-07-01T05:14:22Z
- 0.5000
-
-
- http://gunicorn.org/images/logo.png
- 2010-07-01T05:14:22Z
- 0.5000
-
-
- http://gunicorn.org/index.html
- 2010-07-01T05:14:22Z
- 0.5000
+ 2012-10-04T00:43:15+05:45
+ 0.5
 http://gunicorn.org/install.html
- 2010-07-01T05:14:22Z
- 0.5000
+ 2012-10-04T00:43:15+05:45
+ 0.5
 http://gunicorn.org/installation.html
- 2010-07-01T05:14:22Z
- 0.5000
+ 2012-10-04T00:43:15+05:45
+ 0.5
 http://gunicorn.org/news.html
- 2010-07-08T19:57:19Z
- 0.5000
+ 2012-10-04T00:43:15+05:45
+ 0.5
 http://gunicorn.org/run.html
- 2010-07-01T05:14:22Z
- 0.5000
+ 2012-10-04T00:43:15+05:45
+ 0.5
 http://gunicorn.org/tuning.html
- 2010-07-01T05:14:22Z
- 0.5000
+ 2012-10-04T00:43:15+05:45
+ 0.5
 http://gunicorn.org/usage.html
- 2010-07-01T05:14:22Z
- 0.5000
+ 2012-10-04T00:43:15+05:45
+ 0.5
diff --git a/docs/sitemap_config.xml b/docs/sitemap_config.xml
deleted file mode 100644
index 513d19bb..00000000
--- a/docs/sitemap_config.xml
+++ /dev/null
@@ -1,19 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/sitemap_gen.py b/docs/sitemap_gen.py
old mode 100755
new mode 100644
index 1cfbbae1..29c7ca02
--- a/docs/sitemap_gen.py
+++ b/docs/sitemap_gen.py
@@ -1,2188 +1,41 @@
-#!/usr/bin/env python
-#
-# Copyright (c) 2004, 2005 Google Inc.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#
-# * Redistributions of source code must retain the above copyright
-#   notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright
-#   notice, this list of conditions and the following disclaimer in
-#   the documentation and/or other materials provided with the
-#   distribution.
-#
-# * Neither the name of Google nor the names of its contributors may
-#   be used to endorse or promote products derived from this software
-#   without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-#
-# The sitemap_gen.py script is written in Python 2.2 and released to
-# the open source community for continuous improvements under the BSD
-# 2.0 new license, which can be found at:
-#
-# http://www.opensource.org/licenses/bsd-license.php
-#
-
-__usage__ = \
-"""A simple script to automatically produce sitemaps for a webserver,
-in the Google Sitemap Protocol (GSP).
-
-Usage: python sitemap_gen.py --config=config.xml [--help] [--testing]
-  --config=config.xml, specifies config file location
-  --help, displays usage message
-  --testing, specified when user is experimenting
-"""
-
-import fnmatch
-import glob
-import gzip
-import hashlib
 import os
-import re
-import stat
-import time
-import types
-import urllib
-import urlparse
-import xml.sax
+import subprocess
+from xml.etree import ElementTree
+
+
+def main():
+    generate(
+        site_path=os.path.join(os.path.dirname(__file__), 'site'),
+        special_priorities={'index.html': 1.0})
+
+
+def generate(site_path, special_priorities, directory_index='index.html'):
+    urlset = ElementTree.Element('urlset', xmlns='http://www.sitemaps.org/schemas/sitemap/0.9')
+    urlset.text = '\n  '
+    for root, dirs, filenames in os.walk(site_path):
+        for filename in filenames:
+            if filename.endswith('.html'):
+                absolute_filepath = os.path.join(root, filename)
+                relative_path = os.path.relpath(absolute_filepath, site_path)
+                relative_url = os.path.dirname(relative_path) if filename == directory_index else relative_path
+                last_modification = subprocess.check_output(
+                    ['git', 'log', '-1', '--pretty="%cI"', absolute_filepath]).decode('ascii').strip('\n"')
+                url_element = ElementTree.SubElement(urlset, 'url')
+                loc_element = ElementTree.SubElement(url_element, 'loc')
+                loc_element.text = 'http://gunicorn.org/' + relative_url
+                lastmod_element = ElementTree.SubElement(url_element, 'lastmod')
+                lastmod_element.text = last_modification
+                priority_element = ElementTree.SubElement(url_element, 'priority')
+                priority_element.text = str(special_priorities.get(relative_path, 0.5))
+                url_element.tail = priority_element.tail = '\n  '
+                url_element.text = loc_element.tail = lastmod_element.tail = '\n    '
+    # We sort the url nodes instead of the filenames because
+    # filenames might be altered by the directory_index option
+    urlset[:] = sorted([url for url in urlset], key=lambda url: url[0].text)
+    urlset.tail = urlset[-1].tail = '\n'
+    with open(os.path.join(site_path, 'sitemap.xml'), 'wb') as sitemap_file:
+        ElementTree.ElementTree(urlset).write(sitemap_file, encoding='UTF-8', xml_declaration=True)
-# Text encodings
-ENC_ASCII = 'ASCII'
-ENC_UTF8 = 'UTF-8'
-ENC_IDNA = 'IDNA'
-ENC_ASCII_LIST = ['ASCII', 'US-ASCII', 'US', 'IBM367', 'CP367', 'ISO646-US'
-                  'ISO_646.IRV:1991', 'ISO-IR-6', 'ANSI_X3.4-1968',
- 'ANSI_X3.4-1986', 'CPASCII' ] -ENC_DEFAULT_LIST = ['ISO-8859-1', 'ISO-8859-2', 'ISO-8859-5'] - -# Maximum number of urls in each sitemap, before next Sitemap is created -MAXURLS_PER_SITEMAP = 50000 - -# Suffix on a Sitemap index file -SITEINDEX_SUFFIX = '_index.xml' - -# Regular expressions tried for extracting URLs from access logs. -ACCESSLOG_CLF_PATTERN = re.compile( - r'.+\s+"([^\s]+)\s+([^\s]+)\s+HTTP/\d+\.\d+"\s+200\s+.*' - ) - -# Match patterns for lastmod attributes -LASTMOD_PATTERNS = map(re.compile, [ - r'^\d\d\d\d$', - r'^\d\d\d\d-\d\d$', - r'^\d\d\d\d-\d\d-\d\d$', - r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\dZ$', - r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\d[+-]\d\d:\d\d$', - r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?Z$', - r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?[+-]\d\d:\d\d$', - ]) - -# Match patterns for changefreq attributes -CHANGEFREQ_PATTERNS = [ - 'always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never' - ] - -# XML formats -SITEINDEX_HEADER = \ - '\n' \ - '\n' -SITEINDEX_FOOTER = '\n' -SITEINDEX_ENTRY = \ - ' \n' \ - ' %(loc)s\n' \ - ' %(lastmod)s\n' \ - ' \n' -SITEMAP_HEADER = \ - '\n' \ - '\n' -SITEMAP_FOOTER = '\n' -SITEURL_XML_PREFIX = ' \n' -SITEURL_XML_SUFFIX = ' \n' - -# Search engines to notify with the updated sitemaps -# -# This list is very non-obvious in what's going on. Here's the gist: -# Each item in the list is a 6-tuple of items. The first 5 are "almost" -# the same as the input arguments to urlparse.urlunsplit(): -# 0 - schema -# 1 - netloc -# 2 - path -# 3 - query <-- EXCEPTION: specify a query map rather than a string -# 4 - fragment -# Additionally, add item 5: -# 5 - query attribute that should be set to the new Sitemap URL -# Clear as mud, I know. -NOTIFICATION_SITES = [ - ('http', 'www.google.com', 'webmasters/sitemaps/ping', {}, '', 'sitemap') - ] - - -class Error(Exception): - """ - Base exception class. In this module we tend not to use our own exception - types for very much, but they come in very handy on XML parsing with SAX. - """ - pass -#end class Error - - -class SchemaError(Error): - """Failure to process an XML file according to the schema we know.""" - pass -#end class SchemeError - - -class Encoder: - """ - Manages wide-character/narrow-character conversions for just about all - text that flows into or out of the script. - - You should always use this class for string coercion, as opposed to - letting Python handle coercions automatically. Reason: Python - usually assumes ASCII (7-bit) as a default narrow character encoding, - which is not the kind of data we generally deal with. - - General high-level methodologies used in sitemap_gen: - - [PATHS] - File system paths may be wide or narrow, depending on platform. - This works fine, just be aware of it and be very careful to not - mix them. That is, if you have to pass several file path arguments - into a library call, make sure they are all narrow or all wide. - This class has MaybeNarrowPath() which should be called on every - file system path you deal with. - - [URLS] - URL locations are stored in Narrow form, already escaped. This has the - benefit of keeping escaping and encoding as close as possible to the format - we read them in. The downside is we may end up with URLs that have - intermingled encodings -- the root path may be encoded in one way - while the filename is encoded in another. This is obviously wrong, but - it should hopefully be an issue hit by very few users. 
The workaround - from the user level (assuming they notice) is to specify a default_encoding - parameter in their config file. - - [OTHER] - Other text, such as attributes of the URL class, configuration options, - etc, are generally stored in Unicode for simplicity. - """ - - def __init__(self): - self._user = None # User-specified default encoding - self._learned = [] # Learned default encodings - self._widefiles = False # File system can be wide - - # Can the file system be Unicode? - try: - self._widefiles = os.path.supports_unicode_filenames - except AttributeError: - try: - self._widefiles = sys.getwindowsversion() == os.VER_PLATFORM_WIN32_NT - except AttributeError: - pass - - # Try to guess a working default - try: - encoding = sys.getfilesystemencoding() - if encoding and not (encoding.upper() in ENC_ASCII_LIST): - self._learned = [ encoding ] - except AttributeError: - pass - - if not self._learned: - encoding = sys.getdefaultencoding() - if encoding and not (encoding.upper() in ENC_ASCII_LIST): - self._learned = [ encoding ] - - # If we had no guesses, start with some European defaults - if not self._learned: - self._learned = ENC_DEFAULT_LIST - #end def __init__ - - def SetUserEncoding(self, encoding): - self._user = encoding - #end def SetUserEncoding - - def NarrowText(self, text, encoding): - """ Narrow a piece of arbitrary text """ - if type(text) != types.UnicodeType: - return text - - # Try the passed in preference - if encoding: - try: - result = text.encode(encoding) - if not encoding in self._learned: - self._learned.append(encoding) - return result - except UnicodeError: - pass - except LookupError: - output.Warn('Unknown encoding: %s' % encoding) - - # Try the user preference - if self._user: - try: - return text.encode(self._user) - except UnicodeError: - pass - except LookupError: - temp = self._user - self._user = None - output.Warn('Unknown default_encoding: %s' % temp) - - # Look through learned defaults, knock any failing ones out of the list - while self._learned: - try: - return text.encode(self._learned[0]) - except: - del self._learned[0] - - # When all other defaults are exhausted, use UTF-8 - try: - return text.encode(ENC_UTF8) - except UnicodeError: - pass - - # Something is seriously wrong if we get to here - return text.encode(ENC_ASCII, 'ignore') - #end def NarrowText - - def MaybeNarrowPath(self, text): - """ Paths may be allowed to stay wide """ - if self._widefiles: - return text - return self.NarrowText(text, None) - #end def MaybeNarrowPath - - def WidenText(self, text, encoding): - """ Widen a piece of arbitrary text """ - if type(text) != types.StringType: - return text - - # Try the passed in preference - if encoding: - try: - result = unicode(text, encoding) - if not encoding in self._learned: - self._learned.append(encoding) - return result - except UnicodeError: - pass - except LookupError: - output.Warn('Unknown encoding: %s' % encoding) - - # Try the user preference - if self._user: - try: - return unicode(text, self._user) - except UnicodeError: - pass - except LookupError: - temp = self._user - self._user = None - output.Warn('Unknown default_encoding: %s' % temp) - - # Look through learned defaults, knock any failing ones out of the list - while self._learned: - try: - return unicode(text, self._learned[0]) - except: - del self._learned[0] - - # When all other defaults are exhausted, use UTF-8 - try: - return unicode(text, ENC_UTF8) - except UnicodeError: - pass - - # Getting here means it wasn't UTF-8 and we had no working default. 
- # We really don't have anything "right" we can do anymore. - output.Warn('Unrecognized encoding in text: %s' % text) - if not self._user: - output.Warn('You may need to set a default_encoding in your ' - 'configuration file.') - return text.decode(ENC_ASCII, 'ignore') - #end def WidenText -#end class Encoder -encoder = Encoder() - - -class Output: - """ - Exposes logging functionality, and tracks how many errors - we have thus output. - - Logging levels should be used as thus: - Fatal -- extremely sparingly - Error -- config errors, entire blocks of user 'intention' lost - Warn -- individual URLs lost - Log(,0) -- Un-suppressable text that's not an error - Log(,1) -- touched files, major actions - Log(,2) -- parsing notes, filtered or duplicated URLs - Log(,3) -- each accepted URL - """ - - def __init__(self): - self.num_errors = 0 # Count of errors - self.num_warns = 0 # Count of warnings - - self._errors_shown = {} # Shown errors - self._warns_shown = {} # Shown warnings - self._verbose = 0 # Level of verbosity - #end def __init__ - - def Log(self, text, level): - """ Output a blurb of diagnostic text, if the verbose level allows it """ - if text: - text = encoder.NarrowText(text, None) - if self._verbose >= level: - print text - #end def Log - - def Warn(self, text): - """ Output and count a warning. Suppress duplicate warnings. """ - if text: - text = encoder.NarrowText(text, None) - hash = hashlib.md5(text).hexdigest() - if not self._warns_shown.has_key(hash): - self._warns_shown[hash] = 1 - print '[WARNING] ' + text - else: - self.Log('(suppressed) [WARNING] ' + text, 3) - self.num_warns = self.num_warns + 1 - #end def Warn - - def Error(self, text): - """ Output and count an error. Suppress duplicate errors. """ - if text: - text = encoder.NarrowText(text, None) - hash = hashlib.md5(text).hexdigest() - if not self._errors_shown.has_key(hash): - self._errors_shown[hash] = 1 - print '[ERROR] ' + text - else: - self.Log('(suppressed) [ERROR] ' + text, 3) - self.num_errors = self.num_errors + 1 - #end def Error - - def Fatal(self, text): - """ Output an error and terminate the program. """ - if text: - text = encoder.NarrowText(text, None) - print '[FATAL] ' + text - else: - print 'Fatal error.' - sys.exit(1) - #end def Fatal - - def SetVerbose(self, level): - """ Sets the verbose level. """ - try: - if type(level) != types.IntType: - level = int(level) - if (level >= 0) and (level <= 3): - self._verbose = level - return - except ValueError: - pass - self.Error('Verbose level (%s) must be between 0 and 3 inclusive.' % level) - #end def SetVerbose -#end class Output -output = Output() - - -class URL(object): - """ URL is a smart structure grouping together the properties we - care about for a single web reference. """ - __slots__ = 'loc', 'lastmod', 'changefreq', 'priority' - - def __init__(self): - self.loc = None # URL -- in Narrow characters - self.lastmod = None # ISO8601 timestamp of last modify - self.changefreq = None # Text term for update frequency - self.priority = None # Float between 0 and 1 (inc) - #end def __init__ - - def __cmp__(self, other): - if self.loc < other.loc: - return -1 - if self.loc > other.loc: - return 1 - return 0 - #end def __cmp__ - - def TrySetAttribute(self, attribute, value): - """ Attempt to set the attribute to the value, with a pretty try - block around it. 
""" - if attribute == 'loc': - self.loc = self.Canonicalize(value) - else: - try: - setattr(self, attribute, value) - except AttributeError: - output.Warn('Unknown URL attribute: %s' % attribute) - #end def TrySetAttribute - - def IsAbsolute(loc): - """ Decide if the URL is absolute or not """ - if not loc: - return False - narrow = encoder.NarrowText(loc, None) - (scheme, netloc, path, query, frag) = urlparse.urlsplit(narrow) - if (not scheme) or (not netloc): - return False - return True - #end def IsAbsolute - IsAbsolute = staticmethod(IsAbsolute) - - def Canonicalize(loc): - """ Do encoding and canonicalization on a URL string """ - if not loc: - return loc - - # Let the encoder try to narrow it - narrow = encoder.NarrowText(loc, None) - - # Escape components individually - (scheme, netloc, path, query, frag) = urlparse.urlsplit(narrow) - unr = '-._~' - sub = '!$&\'()*+,;=' - netloc = urllib.quote(netloc, unr + sub + '%:@/[]') - path = urllib.quote(path, unr + sub + '%:@/') - query = urllib.quote(query, unr + sub + '%:@/?') - frag = urllib.quote(frag, unr + sub + '%:@/?') - - # Try built-in IDNA encoding on the netloc - try: - (ignore, widenetloc, ignore, ignore, ignore) = urlparse.urlsplit(loc) - for c in widenetloc: - if c >= unichr(128): - netloc = widenetloc.encode(ENC_IDNA) - netloc = urllib.quote(netloc, unr + sub + '%:@/[]') - break - except UnicodeError: - # urlsplit must have failed, based on implementation differences in the - # library. There is not much we can do here, except ignore it. - pass - except LookupError: - output.Warn('An International Domain Name (IDN) is being used, but this ' - 'version of Python does not have support for IDNA encoding. ' - ' (IDNA support was introduced in Python 2.3) The encoding ' - 'we have used instead is wrong and will probably not yield ' - 'valid URLs.') - bad_netloc = False - if '%' in netloc: - bad_netloc = True - - # Put it all back together - narrow = urlparse.urlunsplit((scheme, netloc, path, query, frag)) - - # I let '%' through. Fix any that aren't pre-existing escapes. - HEXDIG = '0123456789abcdefABCDEF' - list = narrow.split('%') - narrow = list[0] - del list[0] - for item in list: - if (len(item) >= 2) and (item[0] in HEXDIG) and (item[1] in HEXDIG): - narrow = narrow + '%' + item - else: - narrow = narrow + '%25' + item - - # Issue a warning if this is a bad URL - if bad_netloc: - output.Warn('Invalid characters in the host or domain portion of a URL: ' - + narrow) - - return narrow - #end def Canonicalize - Canonicalize = staticmethod(Canonicalize) - - def Validate(self, base_url, allow_fragment): - """ Verify the data in this URL is well-formed, and override if not. 
""" - assert type(base_url) == types.StringType - - # Test (and normalize) the ref - if not self.loc: - output.Warn('Empty URL') - return False - if allow_fragment: - self.loc = urlparse.urljoin(base_url, self.loc) - if not self.loc.startswith(base_url): - output.Warn('Discarded URL for not starting with the base_url: %s' % - self.loc) - self.loc = None - return False - - # Test the lastmod - if self.lastmod: - match = False - self.lastmod = self.lastmod.upper() - for pattern in LASTMOD_PATTERNS: - match = pattern.match(self.lastmod) - if match: - break - if not match: - output.Warn('Lastmod "%s" does not appear to be in ISO8601 format on ' - 'URL: %s' % (self.lastmod, self.loc)) - self.lastmod = None - - # Test the changefreq - if self.changefreq: - match = False - self.changefreq = self.changefreq.lower() - for pattern in CHANGEFREQ_PATTERNS: - if self.changefreq == pattern: - match = True - break - if not match: - output.Warn('Changefreq "%s" is not a valid change frequency on URL ' - ': %s' % (self.changefreq, self.loc)) - self.changefreq = None - - # Test the priority - if self.priority: - priority = -1.0 - try: - priority = float(self.priority) - except ValueError: - pass - if (priority < 0.0) or (priority > 1.0): - output.Warn('Priority "%s" is not a number between 0 and 1 inclusive ' - 'on URL: %s' % (self.priority, self.loc)) - self.priority = None - - return True - #end def Validate - - def MakeHash(self): - """ Provides a uniform way of hashing URLs """ - if not self.loc: - return None - if self.loc.endswith('/'): - return hashlib.md5(self.loc[:-1]).hexdigest() - return hashlib.md5(self.loc).hexdigest() - #end def MakeHash - - def Log(self, prefix='URL', level=3): - """ Dump the contents, empty or not, to the log. """ - out = prefix + ':' - - for attribute in self.__slots__: - value = getattr(self, attribute) - if not value: - value = '' - out = out + (' %s=[%s]' % (attribute, value)) - - output.Log('%s' % encoder.NarrowText(out, None), level) - #end def Log - - def WriteXML(self, file): - """ Dump non-empty contents to the output file, in XML format. """ - if not self.loc: - return - out = SITEURL_XML_PREFIX - - for attribute in self.__slots__: - value = getattr(self, attribute) - if value: - if type(value) == types.UnicodeType: - value = encoder.NarrowText(value, None) - elif type(value) != types.StringType: - value = str(value) - value = xml.sax.saxutils.escape(value) - out = out + (' <%s>%s\n' % (attribute, value, attribute)) - - out = out + SITEURL_XML_SUFFIX - file.write(out) - #end def WriteXML -#end class URL - - -class Filter: - """ - A filter on the stream of URLs we find. A filter is, in essence, - a wildcard applied to the stream. You can think of this as an - operator that returns a tri-state when given a URL: - - True -- this URL is to be included in the sitemap - None -- this URL is undecided - False -- this URL is to be dropped from the sitemap - """ - - def __init__(self, attributes): - self._wildcard = None # Pattern for wildcard match - self._regexp = None # Pattern for regexp match - self._pass = False # "Drop" filter vs. 
"Pass" filter - - if not ValidateAttributes('FILTER', attributes, - ('pattern', 'type', 'action')): - return - - # Check error count on the way in - num_errors = output.num_errors - - # Fetch the attributes - pattern = attributes.get('pattern') - type = attributes.get('type', 'wildcard') - action = attributes.get('action', 'drop') - if type: - type = type.lower() - if action: - action = action.lower() - - # Verify the attributes - if not pattern: - output.Error('On a filter you must specify a "pattern" to match') - elif (not type) or ((type != 'wildcard') and (type != 'regexp')): - output.Error('On a filter you must specify either \'type="wildcard"\' ' - 'or \'type="regexp"\'') - elif (action != 'pass') and (action != 'drop'): - output.Error('If you specify a filter action, it must be either ' - '\'action="pass"\' or \'action="drop"\'') - - # Set the rule - if action == 'drop': - self._pass = False - elif action == 'pass': - self._pass = True - - if type == 'wildcard': - self._wildcard = pattern - elif type == 'regexp': - try: - self._regexp = re.compile(pattern) - except re.error: - output.Error('Bad regular expression: %s' % pattern) - - # Log the final results iff we didn't add any errors - if num_errors == output.num_errors: - output.Log('Filter: %s any URL that matches %s "%s"' % - (action, type, pattern), 2) - #end def __init__ - - def Apply(self, url): - """ Process the URL, as above. """ - if (not url) or (not url.loc): - return None - - if self._wildcard: - if fnmatch.fnmatchcase(url.loc, self._wildcard): - return self._pass - return None - - if self._regexp: - if self._regexp.search(url.loc): - return self._pass - return None - - assert False # unreachable - #end def Apply -#end class Filter - - -class InputURL: - """ - Each Input class knows how to yield a set of URLs from a data source. - - This one handles a single URL, manually specified in the config file. - """ - - def __init__(self, attributes): - self._url = None # The lonely URL - - if not ValidateAttributes('URL', attributes, - ('href', 'lastmod', 'changefreq', 'priority')): - return - - url = URL() - for attr in attributes.keys(): - if attr == 'href': - url.TrySetAttribute('loc', attributes[attr]) - else: - url.TrySetAttribute(attr, attributes[attr]) - - if not url.loc: - output.Error('Url entries must have an href attribute.') - return - - self._url = url - output.Log('Input: From URL "%s"' % self._url.loc, 2) - #end def __init__ - - def ProduceURLs(self, consumer): - """ Produces URLs from our data source, hands them in to the consumer. """ - if self._url: - consumer(self._url, True) - #end def ProduceURLs -#end class InputURL - - -class InputURLList: - """ - Each Input class knows how to yield a set of URLs from a data source. - - This one handles a text file with a list of URLs - """ - - def __init__(self, attributes): - self._path = None # The file path - self._encoding = None # Encoding of that file - - if not ValidateAttributes('URLLIST', attributes, ('path', 'encoding')): - return - - self._path = attributes.get('path') - self._encoding = attributes.get('encoding', ENC_UTF8) - if self._path: - self._path = encoder.MaybeNarrowPath(self._path) - if os.path.isfile(self._path): - output.Log('Input: From URLLIST "%s"' % self._path, 2) - else: - output.Error('Can not locate file: %s' % self._path) - self._path = None - else: - output.Error('Urllist entries must have a "path" attribute.') - #end def __init__ - - def ProduceURLs(self, consumer): - """ Produces URLs from our data source, hands them in to the consumer. 
""" - - # Open the file - (frame, file) = OpenFileForRead(self._path, 'URLLIST') - if not file: - return - - # Iterate lines - linenum = 0 - for line in file.readlines(): - linenum = linenum + 1 - - # Strip comments and empty lines - if self._encoding: - line = encoder.WidenText(line, self._encoding) - line = line.strip() - if (not line) or line[0] == '#': - continue - - # Split the line on space - url = URL() - cols = line.split(' ') - for i in range(0,len(cols)): - cols[i] = cols[i].strip() - url.TrySetAttribute('loc', cols[0]) - - # Extract attributes from the other columns - for i in range(1,len(cols)): - if cols[i]: - try: - (attr_name, attr_val) = cols[i].split('=', 1) - url.TrySetAttribute(attr_name, attr_val) - except ValueError: - output.Warn('Line %d: Unable to parse attribute: %s' % - (linenum, cols[i])) - - # Pass it on - consumer(url, False) - - file.close() - if frame: - frame.close() - #end def ProduceURLs -#end class InputURLList - - -class InputDirectory: - """ - Each Input class knows how to yield a set of URLs from a data source. - - This one handles a directory that acts as base for walking the filesystem. - """ - - def __init__(self, attributes, base_url): - self._path = None # The directory - self._url = None # The URL equivalent - self._default_file = None - - if not ValidateAttributes('DIRECTORY', attributes, ('path', 'url', - 'default_file')): - return - - # Prep the path -- it MUST end in a sep - path = attributes.get('path') - if not path: - output.Error('Directory entries must have both "path" and "url" ' - 'attributes') - return - path = encoder.MaybeNarrowPath(path) - if not path.endswith(os.sep): - path = path + os.sep - if not os.path.isdir(path): - output.Error('Can not locate directory: %s' % path) - return - - # Prep the URL -- it MUST end in a sep - url = attributes.get('url') - if not url: - output.Error('Directory entries must have both "path" and "url" ' - 'attributes') - return - url = URL.Canonicalize(url) - if not url.endswith('/'): - url = url + '/' - if not url.startswith(base_url): - url = urlparse.urljoin(base_url, url) - if not url.startswith(base_url): - output.Error('The directory URL "%s" is not relative to the ' - 'base_url: %s' % (url, base_url)) - return - - # Prep the default file -- it MUST be just a filename - file = attributes.get('default_file') - if file: - file = encoder.MaybeNarrowPath(file) - if os.sep in file: - output.Error('The default_file "%s" can not include path information.' - % file) - file = None - - self._path = path - self._url = url - self._default_file = file - if file: - output.Log('Input: From DIRECTORY "%s" (%s) with default file "%s"' - % (path, url, file), 2) - else: - output.Log('Input: From DIRECTORY "%s" (%s) with no default file' - % (path, url), 2) - #end def __init__ - - def ProduceURLs(self, consumer): - """ Produces URLs from our data source, hands them in to the consumer. """ - if not self._path: - return - - root_path = self._path - root_URL = self._url - root_file = self._default_file - - def PerFile(dirpath, name): - """ - Called once per file. 
- Note that 'name' will occasionally be None -- for a directory itself - """ - # Pull a timestamp - url = URL() - isdir = False - try: - if name: - path = os.path.join(dirpath, name) - else: - path = dirpath - isdir = os.path.isdir(path) - time = None - if isdir and root_file: - file = os.path.join(path, root_file) - try: - time = os.stat(file)[stat.ST_MTIME]; - except OSError: - pass - if not time: - time = os.stat(path)[stat.ST_MTIME]; - url.lastmod = TimestampISO8601(time) - except OSError: - pass - except ValueError: - pass - - # Build a URL - middle = dirpath[len(root_path):] - if os.sep != '/': - middle = middle.replace(os.sep, '/') - if middle: - middle = middle + '/' - if name: - middle = middle + name - if isdir: - middle = middle + '/' - url.TrySetAttribute('loc', root_URL + encoder.WidenText(middle, None)) - - # Suppress default files. (All the way down here so we can log it.) - if name and (root_file == name): - url.Log(prefix='IGNORED (default file)', level=2) - return - - consumer(url, False) - #end def PerFile - - def PerDirectory(ignore, dirpath, namelist): - """ - Called once per directory with a list of all the contained files/dirs. - """ - ignore = ignore # Avoid warnings of an unused parameter - - if not dirpath.startswith(root_path): - output.Warn('Unable to decide what the root path is for directory: ' - '%s' % dirpath) - return - - for name in namelist: - PerFile(dirpath, name) - #end def PerDirectory - - output.Log('Walking DIRECTORY "%s"' % self._path, 1) - PerFile(self._path, None) - os.path.walk(self._path, PerDirectory, None) - #end def ProduceURLs -#end class InputDirectory - - -class InputAccessLog: - """ - Each Input class knows how to yield a set of URLs from a data source. - - This one handles access logs. It's non-trivial in that we want to - auto-detect log files in the Common Logfile Format (as used by Apache, - for instance) and the Extended Log File Format (as used by IIS, for - instance). - """ - - def __init__(self, attributes): - self._path = None # The file path - self._encoding = None # Encoding of that file - self._is_elf = False # Extended Log File Format? - self._is_clf = False # Common Logfile Format? 
- self._elf_status = -1 # ELF field: '200' - self._elf_method = -1 # ELF field: 'HEAD' - self._elf_uri = -1 # ELF field: '/foo?bar=1' - self._elf_urifrag1 = -1 # ELF field: '/foo' - self._elf_urifrag2 = -1 # ELF field: 'bar=1' - - if not ValidateAttributes('ACCESSLOG', attributes, ('path', 'encoding')): - return - - self._path = attributes.get('path') - self._encoding = attributes.get('encoding', ENC_UTF8) - if self._path: - self._path = encoder.MaybeNarrowPath(self._path) - if os.path.isfile(self._path): - output.Log('Input: From ACCESSLOG "%s"' % self._path, 2) - else: - output.Error('Can not locate file: %s' % self._path) - self._path = None - else: - output.Error('Accesslog entries must have a "path" attribute.') - #end def __init__ - - def RecognizeELFLine(self, line): - """ Recognize the Fields directive that heads an ELF file """ - if not line.startswith('#Fields:'): - return False - fields = line.split(' ') - del fields[0] - for i in range(0, len(fields)): - field = fields[i].strip() - if field == 'sc-status': - self._elf_status = i - elif field == 'cs-method': - self._elf_method = i - elif field == 'cs-uri': - self._elf_uri = i - elif field == 'cs-uri-stem': - self._elf_urifrag1 = i - elif field == 'cs-uri-query': - self._elf_urifrag2 = i - output.Log('Recognized an Extended Log File Format file.', 2) - return True - #end def RecognizeELFLine - - def GetELFLine(self, line): - """ Fetch the requested URL from an ELF line """ - fields = line.split(' ') - count = len(fields) - - # Verify status was Ok - if self._elf_status >= 0: - if self._elf_status >= count: - return None - if not fields[self._elf_status].strip() == '200': - return None - - # Verify method was HEAD or GET - if self._elf_method >= 0: - if self._elf_method >= count: - return None - if not fields[self._elf_method].strip() in ('HEAD', 'GET'): - return None - - # Pull the full URL if we can - if self._elf_uri >= 0: - if self._elf_uri >= count: - return None - url = fields[self._elf_uri].strip() - if url != '-': - return url - - # Put together a fragmentary URL - if self._elf_urifrag1 >= 0: - if self._elf_urifrag1 >= count or self._elf_urifrag2 >= count: - return None - urlfrag1 = fields[self._elf_urifrag1].strip() - urlfrag2 = None - if self._elf_urifrag2 >= 0: - urlfrag2 = fields[self._elf_urifrag2] - if urlfrag1 and (urlfrag1 != '-'): - if urlfrag2 and (urlfrag2 != '-'): - urlfrag1 = urlfrag1 + '?' + urlfrag2 - return urlfrag1 - - return None - #end def GetELFLine - - def RecognizeCLFLine(self, line): - """ Try to tokenize a logfile line according to CLF pattern and see if - it works. """ - match = ACCESSLOG_CLF_PATTERN.match(line) - recognize = match and (match.group(1) in ('HEAD', 'GET')) - if recognize: - output.Log('Recognized a Common Logfile Format file.', 2) - return recognize - #end def RecognizeCLFLine - - def GetCLFLine(self, line): - """ Fetch the requested URL from a CLF line """ - match = ACCESSLOG_CLF_PATTERN.match(line) - if match: - request = match.group(1) - if request in ('HEAD', 'GET'): - return match.group(2) - return None - #end def GetCLFLine - - def ProduceURLs(self, consumer): - """ Produces URLs from our data source, hands them in to the consumer. 
""" - - # Open the file - (frame, file) = OpenFileForRead(self._path, 'ACCESSLOG') - if not file: - return - - # Iterate lines - for line in file.readlines(): - if self._encoding: - line = encoder.WidenText(line, self._encoding) - line = line.strip() - - # If we don't know the format yet, try them both - if (not self._is_clf) and (not self._is_elf): - self._is_elf = self.RecognizeELFLine(line) - self._is_clf = self.RecognizeCLFLine(line) - - # Digest the line - match = None - if self._is_elf: - match = self.GetELFLine(line) - elif self._is_clf: - match = self.GetCLFLine(line) - if not match: - continue - - # Pass it on - url = URL() - url.TrySetAttribute('loc', match) - consumer(url, True) - - file.close() - if frame: - frame.close() - #end def ProduceURLs -#end class InputAccessLog - - -class InputSitemap(xml.sax.handler.ContentHandler): - - """ - Each Input class knows how to yield a set of URLs from a data source. - - This one handles Sitemap files and Sitemap index files. For the sake - of simplicity in design (and simplicity in interfacing with the SAX - package), we do not handle these at the same time, recursively. Instead - we read an index file completely and make a list of Sitemap files, then - go back and process each Sitemap. - """ - - class _ContextBase(object): - - """Base class for context handlers in our SAX processing. A context - handler is a class that is responsible for understanding one level of - depth in the XML schema. The class knows what sub-tags are allowed, - and doing any processing specific for the tag we're in. - - This base class is the API filled in by specific context handlers, - all defined below. - """ - - def __init__(self, subtags): - """Initialize with a sequence of the sub-tags that would be valid in - this context.""" - self._allowed_tags = subtags # Sequence of sub-tags we can have - self._last_tag = None # Most recent seen sub-tag - #end def __init__ - - def AcceptTag(self, tag): - """Returns True iff opening a sub-tag is valid in this context.""" - valid = tag in self._allowed_tags - if valid: - self._last_tag = tag - else: - self._last_tag = None - return valid - #end def AcceptTag - - def AcceptText(self, text): - """Returns True iff a blurb of text is valid in this context.""" - return False - #end def AcceptText - - def Open(self): - """The context is opening. Do initialization.""" - pass - #end def Open - - def Close(self): - """The context is closing. Return our result, if any.""" - pass - #end def Close - - def Return(self, result): - """We're returning to this context after handling a sub-tag. This - method is called with the result data from the sub-tag that just - closed. 
Here in _ContextBase, if we ever see a result it means - the derived child class forgot to override this method.""" - if result: - raise NotImplementedError - #end def Return - #end class _ContextBase - - class _ContextUrlSet(_ContextBase): - - """Context handler for the document node in a Sitemap.""" - - def __init__(self): - InputSitemap._ContextBase.__init__(self, ('url',)) - #end def __init__ - #end class _ContextUrlSet - - class _ContextUrl(_ContextBase): - - """Context handler for a URL node in a Sitemap.""" - - def __init__(self, consumer): - """Initialize this context handler with the callable consumer that - wants our URLs.""" - InputSitemap._ContextBase.__init__(self, URL.__slots__) - self._url = None # The URL object we're building - self._consumer = consumer # Who wants to consume it - #end def __init__ - - def Open(self): - """Initialize the URL.""" - assert not self._url - self._url = URL() - #end def Open - - def Close(self): - """Pass the URL to the consumer and reset it to None.""" - assert self._url - self._consumer(self._url, False) - self._url = None - #end def Close - - def Return(self, result): - """A value context has closed, absorb the data it gave us.""" - assert self._url - if result: - self._url.TrySetAttribute(self._last_tag, result) - #end def Return - #end class _ContextUrl - - class _ContextSitemapIndex(_ContextBase): - - """Context handler for the document node in an index file.""" - - def __init__(self): - InputSitemap._ContextBase.__init__(self, ('sitemap',)) - self._loclist = [] # List of accumulated Sitemap URLs - #end def __init__ - - def Open(self): - """Just a quick verify of state.""" - assert not self._loclist - #end def Open - - def Close(self): - """Return our list of accumulated URLs.""" - if self._loclist: - temp = self._loclist - self._loclist = [] - return temp - #end def Close - - def Return(self, result): - """Getting a new loc URL, add it to the collection.""" - if result: - self._loclist.append(result) - #end def Return - #end class _ContextSitemapIndex - - class _ContextSitemap(_ContextBase): - - """Context handler for a Sitemap entry in an index file.""" - - def __init__(self): - InputSitemap._ContextBase.__init__(self, ('loc', 'lastmod')) - self._loc = None # The URL to the Sitemap - #end def __init__ - - def Open(self): - """Just a quick verify of state.""" - assert not self._loc - #end def Open - - def Close(self): - """Return our URL to our parent.""" - if self._loc: - temp = self._loc - self._loc = None - return temp - output.Warn('In the Sitemap index file, a "sitemap" entry had no "loc".') - #end def Close - - def Return(self, result): - """A value has closed. If it was a 'loc', absorb it.""" - if result and (self._last_tag == 'loc'): - self._loc = result - #end def Return - #end class _ContextSitemap - - class _ContextValue(_ContextBase): - - """Context handler for a single value. We return just the value. 
The - higher level context has to remember what tag led into us.""" - - def __init__(self): - InputSitemap._ContextBase.__init__(self, ()) - self._text = None - #end def __init__ - - def AcceptText(self, text): - """Allow all text, adding it to our buffer.""" - if self._text: - self._text = self._text + text - else: - self._text = text - return True - #end def AcceptText - - def Open(self): - """Initialize our buffer.""" - self._text = None - #end def Open - - def Close(self): - """Return what's in our buffer.""" - text = self._text - self._text = None - if text: - text = text.strip() - return text - #end def Close - #end class _ContextValue - - def __init__(self, attributes): - """Initialize with a dictionary of attributes from our entry in the - config file.""" - xml.sax.handler.ContentHandler.__init__(self) - self._pathlist = None # A list of files - self._current = -1 # Current context in _contexts - self._contexts = None # The stack of contexts we allow - self._contexts_idx = None # ...contexts for index files - self._contexts_stm = None # ...contexts for Sitemap files - - if not ValidateAttributes('SITEMAP', attributes, ['path']): - return - - # Init the first file path - path = attributes.get('path') - if path: - path = encoder.MaybeNarrowPath(path) - if os.path.isfile(path): - output.Log('Input: From SITEMAP "%s"' % path, 2) - self._pathlist = [path] - else: - output.Error('Can not locate file "%s"' % path) - else: - output.Error('Sitemap entries must have a "path" attribute.') - #end def __init__ - - def ProduceURLs(self, consumer): - """In general: Produces URLs from our data source, hand them to the - callable consumer. - - In specific: Iterate over our list of paths and delegate the actual - processing to helper methods. This is a complexity no other data source - needs to suffer. We are unique in that we can have files that tell us - to bring in other files. - - Note the decision to allow an index file or not is made in this method. - If we call our parser with (self._contexts == None) the parser will - grab whichever context stack can handle the file. IE: index is allowed. - If instead we set (self._contexts = ...) before parsing, the parser - will only use the stack we specify. IE: index not allowed. - """ - # Set up two stacks of contexts - self._contexts_idx = [InputSitemap._ContextSitemapIndex(), - InputSitemap._ContextSitemap(), - InputSitemap._ContextValue()] - - self._contexts_stm = [InputSitemap._ContextUrlSet(), - InputSitemap._ContextUrl(consumer), - InputSitemap._ContextValue()] - - # Process the first file - assert self._pathlist - path = self._pathlist[0] - self._contexts = None # We allow an index file here - self._ProcessFile(path) - - # Iterate over remaining files - self._contexts = self._contexts_stm # No index files allowed - for path in self._pathlist[1:]: - self._ProcessFile(path) - #end def ProduceURLs - - def _ProcessFile(self, path): - """Do per-file reading/parsing/consuming for the file path passed in.""" - assert path - - # Open our file - (frame, file) = OpenFileForRead(path, 'SITEMAP') - if not file: - return - - # Rev up the SAX engine - try: - self._current = -1 - xml.sax.parse(file, self) - except SchemaError: - output.Error('An error in file "%s" made us abort reading the Sitemap.' 
- % path) - except IOError: - output.Error('Cannot read from file "%s"' % path) - except xml.sax._exceptions.SAXParseException, e: - output.Error('XML error in the file "%s" (line %d, column %d): %s' % - (path, e._linenum, e._colnum, e.getMessage())) - - # Clean up - file.close() - if frame: - frame.close() - #end def _ProcessFile - - def _MungeLocationListIntoFiles(self, urllist): - """Given a list of URLs, munge them into our self._pathlist property. - We do this by assuming all the files live in the same directory as - the first file in the existing pathlist. That is, we assume a - Sitemap index points to Sitemaps only in the same directory. This - is not true in general, but will be true for any output produced - by this script. - """ - assert self._pathlist - path = self._pathlist[0] - path = os.path.normpath(path) - dir = os.path.dirname(path) - wide = False - if type(path) == types.UnicodeType: - wide = True - - for url in urllist: - url = URL.Canonicalize(url) - output.Log('Index points to Sitemap file at: %s' % url, 2) - (scheme, netloc, path, query, frag) = urlparse.urlsplit(url) - file = os.path.basename(path) - file = urllib.unquote(file) - if wide: - file = encoder.WidenText(file) - if dir: - file = dir + os.sep + file - if file: - self._pathlist.append(file) - output.Log('Will attempt to read Sitemap file: %s' % file, 1) - #end def _MungeLocationListIntoFiles - - def startElement(self, tag, attributes): - """SAX processing, called per node in the config stream. - As long as the new tag is legal in our current context, this - becomes an Open call on one context deeper. - """ - # If this is the document node, we may have to look for a context stack - if (self._current < 0) and not self._contexts: - assert self._contexts_idx and self._contexts_stm - if tag == 'urlset': - self._contexts = self._contexts_stm - elif tag == 'sitemapindex': - self._contexts = self._contexts_idx - output.Log('File is a Sitemap index.', 2) - else: - output.Error('The document appears to be neither a Sitemap nor a ' - 'Sitemap index.') - raise SchemaError - - # Display a kinder error on a common mistake - if (self._current < 0) and (self._contexts == self._contexts_stm) and ( - tag == 'sitemapindex'): - output.Error('A Sitemap index can not refer to another Sitemap index.') - raise SchemaError - - # Verify no unexpected attributes - if attributes: - text = '' - for attr in attributes.keys(): - # The document node will probably have namespaces - if self._current < 0: - if attr.find('xmlns') >= 0: - continue - if attr.find('xsi') >= 0: - continue - if text: - text = text + ', ' - text = text + attr - if text: - output.Warn('Did not expect any attributes on any tag, instead tag ' - '"%s" had attributes: %s' % (tag, text)) - - # Switch contexts - if (self._current < 0) or (self._contexts[self._current].AcceptTag(tag)): - self._current = self._current + 1 - assert self._current < len(self._contexts) - self._contexts[self._current].Open() - else: - output.Error('Can not accept tag "%s" where it appears.' % tag) - raise SchemaError - #end def startElement - - def endElement(self, tag): - """SAX processing, called per node in the config stream. - This becomes a call to Close on one context followed by a call - to Return on the previous. 
- """ - tag = tag # Avoid warning on unused argument - assert self._current >= 0 - retval = self._contexts[self._current].Close() - self._current = self._current - 1 - if self._current >= 0: - self._contexts[self._current].Return(retval) - elif retval and (self._contexts == self._contexts_idx): - self._MungeLocationListIntoFiles(retval) - #end def endElement - - def characters(self, text): - """SAX processing, called when text values are read. Important to - note that one single text value may be split across multiple calls - of this method. - """ - if (self._current < 0) or ( - not self._contexts[self._current].AcceptText(text)): - if text.strip(): - output.Error('Can not accept text "%s" where it appears.' % text) - raise SchemaError - #end def characters -#end class InputSitemap - - -class FilePathGenerator: - """ - This class generates filenames in a series, upon request. - You can request any iteration number at any time, you don't - have to go in order. - - Example of iterations for '/path/foo.xml.gz': - 0 --> /path/foo.xml.gz - 1 --> /path/foo1.xml.gz - 2 --> /path/foo2.xml.gz - _index.xml --> /path/foo_index.xml - """ - - def __init__(self): - self.is_gzip = False # Is this a GZIP file? - - self._path = None # '/path/' - self._prefix = None # 'foo' - self._suffix = None # '.xml.gz' - #end def __init__ - - def Preload(self, path): - """ Splits up a path into forms ready for recombination. """ - path = encoder.MaybeNarrowPath(path) - - # Get down to a base name - path = os.path.normpath(path) - base = os.path.basename(path).lower() - if not base: - output.Error('Couldn\'t parse the file path: %s' % path) - return False - lenbase = len(base) - - # Recognize extension - lensuffix = 0 - compare_suffix = ['.xml', '.xml.gz', '.gz'] - for suffix in compare_suffix: - if base.endswith(suffix): - lensuffix = len(suffix) - break - if not lensuffix: - output.Error('The path "%s" doesn\'t end in a supported file ' - 'extension.' % path) - return False - self.is_gzip = suffix.endswith('.gz') - - # Split the original path - lenpath = len(path) - self._path = path[:lenpath-lenbase] - self._prefix = path[lenpath-lenbase:lenpath-lensuffix] - self._suffix = path[lenpath-lensuffix:] - - return True - #end def Preload - - def GeneratePath(self, instance): - """ Generates the iterations, as described above. """ - prefix = self._path + self._prefix - if type(instance) == types.IntType: - if instance: - return '%s%d%s' % (prefix, instance, self._suffix) - return prefix + self._suffix - return prefix + instance - #end def GeneratePath - - def GenerateURL(self, instance, root_url): - """ Generates iterations, but as a URL instead of a path. """ - prefix = root_url + self._prefix - retval = None - if type(instance) == types.IntType: - if instance: - retval = '%s%d%s' % (prefix, instance, self._suffix) - else: - retval = prefix + self._suffix - else: - retval = prefix + instance - return URL.Canonicalize(retval) - #end def GenerateURL - - def GenerateWildURL(self, root_url): - """ Generates a wildcard that should match all our iterations """ - prefix = URL.Canonicalize(root_url + self._prefix) - temp = URL.Canonicalize(prefix + self._suffix) - suffix = temp[len(prefix):] - return prefix + '*' + suffix - #end def GenerateURL -#end class FilePathGenerator - - -class PerURLStatistics: - """ Keep track of some simple per-URL statistics, like file extension. 
""" - - def __init__(self): - self._extensions = {} # Count of extension instances - #end def __init__ - - def Consume(self, url): - """ Log some stats for the URL. At the moment, that means extension. """ - if url and url.loc: - (scheme, netloc, path, query, frag) = urlparse.urlsplit(url.loc) - if not path: - return - - # Recognize directories - if path.endswith('/'): - if self._extensions.has_key('/'): - self._extensions['/'] = self._extensions['/'] + 1 - else: - self._extensions['/'] = 1 - return - - # Strip to a filename - i = path.rfind('/') - if i >= 0: - assert i < len(path) - path = path[i:] - - # Find extension - i = path.rfind('.') - if i > 0: - assert i < len(path) - ext = path[i:].lower() - if self._extensions.has_key(ext): - self._extensions[ext] = self._extensions[ext] + 1 - else: - self._extensions[ext] = 1 - else: - if self._extensions.has_key('(no extension)'): - self._extensions['(no extension)'] = self._extensions[ - '(no extension)'] + 1 - else: - self._extensions['(no extension)'] = 1 - #end def Consume - - def Log(self): - """ Dump out stats to the output. """ - if len(self._extensions): - output.Log('Count of file extensions on URLs:', 1) - set = self._extensions.keys() - set.sort() - for ext in set: - output.Log(' %7d %s' % (self._extensions[ext], ext), 1) - #end def Log - -class Sitemap(xml.sax.handler.ContentHandler): - """ - This is the big workhorse class that processes your inputs and spits - out sitemap files. It is built as a SAX handler for set up purposes. - That is, it processes an XML stream to bring itself up. - """ - - def __init__(self, suppress_notify): - xml.sax.handler.ContentHandler.__init__(self) - self._filters = [] # Filter objects - self._inputs = [] # Input objects - self._urls = {} # Maps URLs to count of dups - self._set = [] # Current set of URLs - self._filegen = None # Path generator for output files - self._wildurl1 = None # Sitemap URLs to filter out - self._wildurl2 = None # Sitemap URLs to filter out - self._sitemaps = 0 # Number of output files - # We init _dup_max to 2 so the default priority is 0.5 instead of 1.0 - self._dup_max = 2 # Max number of duplicate URLs - self._stat = PerURLStatistics() # Some simple stats - self._in_site = False # SAX: are we in a Site node? - self._in_Site_ever = False # SAX: were we ever in a Site? - - self._default_enc = None # Best encoding to try on URLs - self._base_url = None # Prefix to all valid URLs - self._store_into = None # Output filepath - self._suppress = suppress_notify # Suppress notify of servers - #end def __init__ - - def ValidateBasicConfig(self): - """ Verifies (and cleans up) the basic user-configurable options. 
""" - all_good = True - - if self._default_enc: - encoder.SetUserEncoding(self._default_enc) - - # Canonicalize the base_url - if all_good and not self._base_url: - output.Error('A site needs a "base_url" attribute.') - all_good = False - if all_good and not URL.IsAbsolute(self._base_url): - output.Error('The "base_url" must be absolute, not relative: %s' % - self._base_url) - all_good = False - if all_good: - self._base_url = URL.Canonicalize(self._base_url) - if not self._base_url.endswith('/'): - self._base_url = self._base_url + '/' - output.Log('BaseURL is set to: %s' % self._base_url, 2) - - # Load store_into into a generator - if all_good: - if self._store_into: - self._filegen = FilePathGenerator() - if not self._filegen.Preload(self._store_into): - all_good = False - else: - output.Error('A site needs a "store_into" attribute.') - all_good = False - - # Ask the generator for patterns on what its output will look like - if all_good: - self._wildurl1 = self._filegen.GenerateWildURL(self._base_url) - self._wildurl2 = self._filegen.GenerateURL(SITEINDEX_SUFFIX, - self._base_url) - - # Unify various forms of False - if all_good: - if self._suppress: - if (type(self._suppress) == types.StringType) or (type(self._suppress) - == types.UnicodeType): - if (self._suppress == '0') or (self._suppress.lower() == 'false'): - self._suppress = False - - # Done - if not all_good: - output.Log('See "example_config.xml" for more information.', 0) - return all_good - #end def ValidateBasicConfig - - def Generate(self): - """ Run over all the Inputs and ask them to Produce """ - # Run the inputs - for input in self._inputs: - input.ProduceURLs(self.ConsumeURL) - - # Do last flushes - if len(self._set): - self.FlushSet() - if not self._sitemaps: - output.Warn('No URLs were recorded, writing an empty sitemap.') - self.FlushSet() - - # Write an index as needed - if self._sitemaps > 1: - self.WriteIndex() - - # Notify - self.NotifySearch() - - # Dump stats - self._stat.Log() - #end def Generate - - def ConsumeURL(self, url, allow_fragment): - """ - All per-URL processing comes together here, regardless of Input. - Here we run filters, remove duplicates, spill to disk as needed, etc. - """ - if not url: - return - - # Validate - if not url.Validate(self._base_url, allow_fragment): - return - - # Run filters - accept = None - for filter in self._filters: - accept = filter.Apply(url) - if accept != None: - break - if not (accept or (accept == None)): - url.Log(prefix='FILTERED', level=2) - return - - # Ignore our out output URLs - if fnmatch.fnmatchcase(url.loc, self._wildurl1) or fnmatch.fnmatchcase( - url.loc, self._wildurl2): - url.Log(prefix='IGNORED (output file)', level=2) - return - - # Note the sighting - hash = url.MakeHash() - if self._urls.has_key(hash): - dup = self._urls[hash] - if dup > 0: - dup = dup + 1 - self._urls[hash] = dup - if self._dup_max < dup: - self._dup_max = dup - url.Log(prefix='DUPLICATE') - return - - # Acceptance -- add to set - self._urls[hash] = 1 - self._set.append(url) - self._stat.Consume(url) - url.Log() - - # Flush the set if needed - if len(self._set) >= MAXURLS_PER_SITEMAP: - self.FlushSet() - #end def ConsumeURL - - def FlushSet(self): - """ - Flush the current set of URLs to the output. This is a little - slow because we like to sort them all and normalize the priorities - before dumping. 
- """ - - # Sort and normalize - output.Log('Sorting and normalizing collected URLs.', 1) - self._set.sort() - for url in self._set: - hash = url.MakeHash() - dup = self._urls[hash] - if dup > 0: - self._urls[hash] = -1 - if not url.priority: - url.priority = '%.4f' % (float(dup) / float(self._dup_max)) - - # Get the filename we're going to write to - filename = self._filegen.GeneratePath(self._sitemaps) - if not filename: - output.Fatal('Unexpected: Couldn\'t generate output filename.') - self._sitemaps = self._sitemaps + 1 - output.Log('Writing Sitemap file "%s" with %d URLs' % - (filename, len(self._set)), 1) - - # Write to it - frame = None - file = None - - try: - if self._filegen.is_gzip: - basename = os.path.basename(filename); - frame = open(filename, 'wb') - file = gzip.GzipFile(fileobj=frame, filename=basename, mode='wt') - else: - file = open(filename, 'wt') - - file.write(SITEMAP_HEADER) - for url in self._set: - url.WriteXML(file) - file.write(SITEMAP_FOOTER) - - file.close() - if frame: - frame.close() - - frame = None - file = None - except IOError: - output.Fatal('Couldn\'t write out to file: %s' % filename) - os.chmod(filename, 0644) - - # Flush - self._set = [] - #end def FlushSet - - def WriteIndex(self): - """ Write the master index of all Sitemap files """ - # Make a filename - filename = self._filegen.GeneratePath(SITEINDEX_SUFFIX) - if not filename: - output.Fatal('Unexpected: Couldn\'t generate output index filename.') - output.Log('Writing index file "%s" with %d Sitemaps' % - (filename, self._sitemaps), 1) - - # Make a lastmod time - lastmod = TimestampISO8601(time.time()) - - # Write to it - try: - fd = open(filename, 'wt') - fd.write(SITEINDEX_HEADER) - - for mapnumber in range(0,self._sitemaps): - # Write the entry - mapurl = self._filegen.GenerateURL(mapnumber, self._base_url) - mapattributes = { 'loc' : mapurl, 'lastmod' : lastmod } - fd.write(SITEINDEX_ENTRY % mapattributes) - - fd.write(SITEINDEX_FOOTER) - - fd.close() - fd = None - except IOError: - output.Fatal('Couldn\'t write out to file: %s' % filename) - os.chmod(filename, 0644) - #end def WriteIndex - - def NotifySearch(self): - """ Send notification of the new Sitemap(s) to the search engines. """ - if self._suppress: - output.Log('Search engine notification is suppressed.', 1) - return - - output.Log('Notifying search engines.', 1) - - # Override the urllib's opener class with one that doesn't ignore 404s - class ExceptionURLopener(urllib.FancyURLopener): - def http_error_default(self, url, fp, errcode, errmsg, headers): - output.Log('HTTP error %d: %s' % (errcode, errmsg), 2) - raise IOError - #end def http_error_default - #end class ExceptionURLOpener - old_opener = urllib._urlopener - urllib._urlopener = ExceptionURLopener() - - # Build the URL we want to send in - if self._sitemaps > 1: - url = self._filegen.GenerateURL(SITEINDEX_SUFFIX, self._base_url) - else: - url = self._filegen.GenerateURL(0, self._base_url) - - # Test if we can hit it ourselves - try: - u = urllib.urlopen(url) - u.close() - except IOError: - output.Error('When attempting to access our generated Sitemap at the ' - 'following URL:\n %s\n we failed to read it. Please ' - 'verify the store_into path you specified in\n' - ' your configuration file is web-accessable. Consult ' - 'the FAQ for more\n information.' 
-                   % url)
-      output.Warn('Proceeding to notify with an unverifyable URL.')
-
-    # Cycle through notifications
-    # To understand this, see the comment near the NOTIFICATION_SITES comment
-    for ping in NOTIFICATION_SITES:
-      query_map = ping[3]
-      query_attr = ping[5]
-      query_map[query_attr] = url
-      query = urllib.urlencode(query_map)
-      notify = urlparse.urlunsplit((ping[0], ping[1], ping[2], query, ping[4]))
-
-      # Send the notification
-      output.Log('Notifying: %s' % ping[1], 1)
-      output.Log('Notification URL: %s' % notify, 2)
-      try:
-        u = urllib.urlopen(notify)
-        u.read()
-        u.close()
-      except IOError:
-        output.Warn('Cannot contact: %s' % ping[1])
-
-    if old_opener:
-      urllib._urlopener = old_opener
-  #end def NotifySearch
-
-  def startElement(self, tag, attributes):
-    """ SAX processing, called per node in the config stream. """
-
-    if tag == 'site':
-      if self._in_site:
-        output.Error('Can not nest Site entries in the configuration.')
-      else:
-        self._in_site = True
-
-        if not ValidateAttributes('SITE', attributes,
-          ('verbose', 'default_encoding', 'base_url', 'store_into',
-           'suppress_search_engine_notify')):
-          return
-
-        verbose = attributes.get('verbose', 0)
-        if verbose:
-          output.SetVerbose(verbose)
-
-        self._default_enc = attributes.get('default_encoding')
-        self._base_url = attributes.get('base_url')
-        self._store_into = attributes.get('store_into')
-        if not self._suppress:
-          self._suppress = attributes.get('suppress_search_engine_notify',
-                                          False)
-        self.ValidateBasicConfig()
-
-    elif tag == 'filter':
-      self._filters.append(Filter(attributes))
-
-    elif tag == 'url':
-      self._inputs.append(InputURL(attributes))
-
-    elif tag == 'urllist':
-      for attributeset in ExpandPathAttribute(attributes, 'path'):
-        self._inputs.append(InputURLList(attributeset))
-
-    elif tag == 'directory':
-      self._inputs.append(InputDirectory(attributes, self._base_url))
-
-    elif tag == 'accesslog':
-      for attributeset in ExpandPathAttribute(attributes, 'path'):
-        self._inputs.append(InputAccessLog(attributeset))
-
-    elif tag == 'sitemap':
-      for attributeset in ExpandPathAttribute(attributes, 'path'):
-        self._inputs.append(InputSitemap(attributeset))
-
-    else:
-      output.Error('Unrecognized tag in the configuration: %s' % tag)
-  #end def startElement
-
-  def endElement(self, tag):
-    """ SAX processing, called per node in the config stream. """
-    if tag == 'site':
-      assert self._in_site
-      self._in_site = False
-      self._in_site_ever = True
-  #end def endElement
-
-  def endDocument(self):
-    """ End of SAX, verify we can proceed. """
-    if not self._in_site_ever:
-      output.Error('The configuration must specify a "site" element.')
-    else:
-      if not self._inputs:
-        output.Warn('There were no inputs to generate a sitemap from.')
-  #end def endDocument
-#end class Sitemap
-
-
-def ValidateAttributes(tag, attributes, goodattributes):
-  """ Makes sure 'attributes' does not contain any attribute not
-  listed in 'goodattributes' """
-  all_good = True
-  for attr in attributes.keys():
-    if not attr in goodattributes:
-      output.Error('Unknown %s attribute: %s' % (tag, attr))
-      all_good = False
-  return all_good
-#end def ValidateAttributes
-
-def ExpandPathAttribute(src, attrib):
-  """ Given a dictionary of attributes, return a list of dictionaries
-  with all the same attributes except for the one named attrib.
-  That one, we treat as a file path and expand into all its possible
-  variations. """
-  # Do the path expansion. On any error, just return the source dictionary.
-  path = src.get(attrib)
-  if not path:
-    return [src]
-  path = encoder.MaybeNarrowPath(path);
-  pathlist = glob.glob(path)
-  if not pathlist:
-    return [src]
-
-  # If this isn't actually a dictionary, make it one
-  if type(src) != types.DictionaryType:
-    tmp = {}
-    for key in src.keys():
-      tmp[key] = src[key]
-    src = tmp
-
-  # Create N new dictionaries
-  retval = []
-  for path in pathlist:
-    dst = src.copy()
-    dst[attrib] = path
-    retval.append(dst)
-
-  return retval
-#end def ExpandPathAttribute
-
-def OpenFileForRead(path, logtext):
-  """ Opens a text file, be it GZip or plain """
-
-  frame = None
-  file = None
-
-  if not path:
-    return (frame, file)
-
-  try:
-    if path.endswith('.gz'):
-      frame = open(path, 'rb')
-      file = gzip.GzipFile(fileobj=frame, mode='rt')
-    else:
-      file = open(path, 'rt')
-
-    if logtext:
-      output.Log('Opened %s file: %s' % (logtext, path), 1)
-    else:
-      output.Log('Opened file: %s' % path, 1)
-  except IOError:
-    output.Error('Can not open file: %s' % path)
-
-  return (frame, file)
-#end def OpenFileForRead
-
-def TimestampISO8601(t):
-  """Seconds since epoch (1970-01-01) --> ISO 8601 time string."""
-  return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(t))
-#end def TimestampISO8601
-
-def CreateSitemapFromFile(configpath, suppress_notify):
-  """ Sets up a new Sitemap object from the specified configuration file. """
-
-  # Remember error count on the way in
-  num_errors = output.num_errors
-
-  # Rev up SAX to parse the config
-  sitemap = Sitemap(suppress_notify)
-  try:
-    output.Log('Reading configuration file: %s' % configpath, 0)
-    xml.sax.parse(configpath, sitemap)
-  except IOError:
-    output.Error('Cannot read configuration file: %s' % configpath)
-  except xml.sax._exceptions.SAXParseException, e:
-    output.Error('XML error in the config file (line %d, column %d): %s' %
-                 (e._linenum, e._colnum, e.getMessage()))
-  except xml.sax._exceptions.SAXReaderNotAvailable:
-    output.Error('Some installs of Python 2.2 did not include complete support'
-                 ' for XML.\n Please try upgrading your version of Python'
-                 ' and re-running the script.')
-
-  # If we added any errors, return no sitemap
-  if num_errors == output.num_errors:
-    return sitemap
-  return None
-#end def CreateSitemapFromFile
-
-def ProcessCommandFlags(args):
-  """
-  Parse command line flags per specified usage, pick off key, value pairs
-  All flags of type "--key=value" will be processed as __flags[key] = value,
-  "--option" will be processed as __flags[option] = option
-  """
-
-  flags = {}
-  rkeyval = '--(?P<key>\S*)[=](?P<value>\S*)' # --key=val
-  roption = '--(?P