gunicorn/docs/sitemap_gen.py
Kevin Michel 93bcf5a41e Replace and run the sitemap generator
This replaces the very old sitemap generator which was over 2kloc and
only compatible with Python 2.

According to the stored lastmod, the generator has not been used since 2010.

The minimal replacement script scans the static site for HTML files and
uses git to deduce the last modification date of each page.

The sitemap xmlns version was updated to the latest 0.9 from
sitemaps.org.

The index page was given a higher priority since the other pages
are just redirects to the index with anchors.

The output file is pretty printed to help with diffs.

Static assets (css, images...) aren't listed in the sitemap anymore.
2020-01-31 14:21:00 +01:00

42 lines
2.0 KiB
Python

import os
import subprocess
from xml.etree import ElementTree
def main():
    """Generate the sitemap for the ``site`` directory next to this script.

    ``index.html`` is given a priority of 1.0; every other page gets the
    default priority applied by ``generate``.
    """
    script_directory = os.path.dirname(__file__)
    site_directory = os.path.join(script_directory, 'site')
    priorities = {'index.html': 1.0}
    generate(site_path=site_directory, special_priorities=priorities)
def _git_last_modified(directory, filename):
    """Return the committer date of the last commit that touched a file.

    Git is run from ``directory`` so the lookup does not depend on the
    working directory of the calling process.

    :param directory: directory containing the file.
    :param filename: name of the file inside ``directory``.
    :returns: the strict ISO 8601 date printed by git (``%cI``), or an empty
        string when git prints nothing for the file.
    :raises subprocess.CalledProcessError: if git exits with an error.
    """
    # '--' keeps the file name from being parsed as a revision or an option.
    output = subprocess.check_output(
        ['git', 'log', '-1', '--pretty=format:%cI', '--', filename],
        cwd=directory)
    return output.decode('ascii').strip()


def generate(site_path, special_priorities, directory_index='index.html',
             base_url='http://gunicorn.org/'):
    """Write ``sitemap.xml`` listing every HTML page found under a site.

    The directory tree rooted at ``site_path`` is walked and each file whose
    name ends with ``.html`` becomes one ``<url>`` entry holding its location,
    the date of its last git commit and its priority. Entries are sorted by
    location and the result is written to ``<site_path>/sitemap.xml``.

    :param site_path: root directory of the static site.
    :param special_priorities: mapping from a page path relative to
        ``site_path`` to its priority; pages not listed get ``0.5``.
    :param directory_index: file name served for a directory; such pages are
        listed under the URL of their directory instead of the file itself.
    :param base_url: prefix prepended to every relative URL.
    :raises subprocess.CalledProcessError: if git fails for one of the pages.
    """
    entries = []
    for root, _dirs, filenames in os.walk(site_path):
        for filename in filenames:
            if not filename.endswith('.html'):
                continue
            relative_path = os.path.relpath(os.path.join(root, filename), site_path)
            if filename == directory_index:
                relative_url = os.path.dirname(relative_path)
            else:
                relative_url = relative_path
            # URLs always use forward slashes, whatever the OS separator is.
            relative_url = relative_url.replace(os.sep, '/')
            entries.append((
                base_url + relative_url,
                _git_last_modified(root, filename),
                str(special_priorities.get(relative_path, 0.5)),
            ))

    # We sort on the final URL instead of the filenames because
    # filenames might be altered by the directory_index option
    entries.sort(key=lambda entry: entry[0])

    urlset = ElementTree.Element('urlset', xmlns='http://www.sitemaps.org/schemas/sitemap/0.9')
    # The text/tail whitespace below pretty prints the output to help with diffs.
    urlset.text = '\n ' if entries else '\n'
    for location, last_modification, priority in entries:
        url_element = ElementTree.SubElement(urlset, 'url')
        url_element.text = url_element.tail = '\n '
        children = [('loc', location)]
        # A page without any commit has no date; skip the element rather
        # than emit an empty <lastmod>.
        if last_modification:
            children.append(('lastmod', last_modification))
        children.append(('priority', priority))
        for tag, text in children:
            child_element = ElementTree.SubElement(url_element, tag)
            child_element.text = text
            child_element.tail = '\n '

    # The closing </urlset> tag goes on its own line; a site without any
    # HTML page has no last entry to adjust.
    if len(urlset):
        urlset[-1].tail = '\n'
    urlset.tail = '\n'

    with open(os.path.join(site_path, 'sitemap.xml'), 'wb') as sitemap_file:
        ElementTree.ElementTree(urlset).write(sitemap_file, encoding='UTF-8', xml_declaration=True)
# Only generate the sitemap when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()