Commit 58bd0a3d authored by Marcus M. Scheunemann's avatar Marcus M. Scheunemann

manually adding the sitemap plugin

parent 8238058c
Sitemap
-------
This plugin generates plain-text or XML sitemaps. You can use the ``SITEMAP``
variable in your settings file to configure the behavior of the plugin.
The ``SITEMAP`` variable must be a Python dictionary and can contain these keys:
- ``format``, which sets the output format of the plugin (``xml`` or ``txt``)
- ``priorities``, which is a dictionary with three keys:
- ``articles``, the priority for the URLs of the articles and their
translations
- ``pages``, the priority for the URLs of the static pages
- ``indexes``, the priority for the URLs of the index pages, such as tags,
author pages, categories indexes, archives, etc...
All the values of this dictionary must be decimal numbers between ``0`` and ``1``.
- ``changefreqs``, which is a dictionary with three items:
- ``articles``, the update frequency of the articles
- ``pages``, the update frequency of the pages
- ``indexes``, the update frequency of the index pages
Valid frequency values are ``always``, ``hourly``, ``daily``, ``weekly``, ``monthly``,
``yearly`` and ``never``.
You can exclude URLs from being included in the sitemap via regular expressions.
For example, to exclude all URLs containing ``tag/`` or ``category/`` you can
use the following ``SITEMAP`` setting.
.. code-block:: python
SITEMAP = {
'exclude': ['tag/', 'category/']
}
If a key is missing or a value is incorrect, it will be replaced with the
default value.
You can also exclude an individual URL by adding metadata to it setting ``private``
to ``True``.
The sitemap is saved in ``<output_path>/sitemap.<format>``.
.. note::
``priorities`` and ``changefreqs`` are information for search engines.
They are only used in the XML sitemaps.
For more information: <http://www.sitemaps.org/protocol.html#xmlTagDefinitions>
**Example**
Here is an example configuration (it's also the default settings):
.. code-block:: python
# Where your plug-ins reside
PLUGIN_PATHS = ['/where/you/cloned/it/pelican-plugins/', ]
PLUGINS=['sitemap',]
SITEMAP = {
'format': 'xml',
'priorities': {
'articles': 0.5,
'indexes': 0.5,
'pages': 0.5
},
'changefreqs': {
'articles': 'monthly',
'indexes': 'daily',
'pages': 'monthly'
}
}
from .sitemap import *
\ No newline at end of file
# -*- coding: utf-8 -*-
'''
Sitemap
-------
The sitemap plugin generates plain-text or XML sitemaps.
'''
from __future__ import unicode_literals
import re
import collections
import os.path
from datetime import datetime
from logging import warning, info
from codecs import open
from pytz import timezone
from pelican import signals, contents
from pelican.utils import get_date
TXT_HEADER = """{0}/index.html
{0}/archives.html
{0}/tags.html
{0}/categories.html
"""
XML_HEADER = """<?xml version="1.0" encoding="utf-8"?>
<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
"""
XML_URL = """
<url>
<loc>{0}/{1}</loc>
<lastmod>{2}</lastmod>
<changefreq>{3}</changefreq>
<priority>{4}</priority>
</url>
"""
XML_FOOTER = """
</urlset>
"""
def format_date(date):
if date.tzinfo:
tz = date.strftime('%z')
tz = tz[:-2] + ':' + tz[-2:]
else:
tz = "-00:00"
return date.strftime("%Y-%m-%dT%H:%M:%S") + tz
class SitemapGenerator(object):
def __init__(self, context, settings, path, theme, output_path, *null):
self.output_path = output_path
self.context = context
self.now = datetime.now()
self.siteurl = settings.get('SITEURL')
self.default_timezone = settings.get('TIMEZONE', 'UTC')
self.timezone = getattr(self, 'timezone', self.default_timezone)
self.timezone = timezone(self.timezone)
self.format = 'xml'
self.changefreqs = {
'articles': 'monthly',
'indexes': 'daily',
'pages': 'monthly'
}
self.priorities = {
'articles': 0.5,
'indexes': 0.5,
'pages': 0.5
}
self.sitemapExclude = []
config = settings.get('SITEMAP', {})
if not isinstance(config, dict):
warning("sitemap plugin: the SITEMAP setting must be a dict")
else:
fmt = config.get('format')
pris = config.get('priorities')
chfreqs = config.get('changefreqs')
self.sitemapExclude = config.get('exclude', [])
if fmt not in ('xml', 'txt'):
warning("sitemap plugin: SITEMAP['format'] must be `txt' or `xml'")
warning("sitemap plugin: Setting SITEMAP['format'] on `xml'")
elif fmt == 'txt':
self.format = fmt
return
valid_keys = ('articles', 'indexes', 'pages')
valid_chfreqs = ('always', 'hourly', 'daily', 'weekly', 'monthly',
'yearly', 'never')
if isinstance(pris, dict):
# We use items for Py3k compat. .iteritems() otherwise
for k, v in pris.items():
if k in valid_keys and not isinstance(v, (int, float)):
default = self.priorities[k]
warning("sitemap plugin: priorities must be numbers")
warning("sitemap plugin: setting SITEMAP['priorities']"
"['{0}'] on {1}".format(k, default))
pris[k] = default
self.priorities.update(pris)
elif pris is not None:
warning("sitemap plugin: SITEMAP['priorities'] must be a dict")
warning("sitemap plugin: using the default values")
if isinstance(chfreqs, dict):
# .items() for py3k compat.
for k, v in chfreqs.items():
if k in valid_keys and v not in valid_chfreqs:
default = self.changefreqs[k]
warning("sitemap plugin: invalid changefreq `{0}'".format(v))
warning("sitemap plugin: setting SITEMAP['changefreqs']"
"['{0}'] on '{1}'".format(k, default))
chfreqs[k] = default
self.changefreqs.update(chfreqs)
elif chfreqs is not None:
warning("sitemap plugin: SITEMAP['changefreqs'] must be a dict")
warning("sitemap plugin: using the default values")
def write_url(self, page, fd):
if getattr(page, 'status', 'published') != 'published':
return
if getattr(page, 'private', 'False') == 'True':
return
# We can disable categories/authors/etc by using False instead of ''
if not page.save_as:
return
page_path = os.path.join(self.output_path, page.save_as)
if not os.path.exists(page_path):
return
lastdate = getattr(page, 'date', self.now)
try:
lastdate = self.get_date_modified(page, lastdate)
except ValueError:
warning("sitemap plugin: " + page.save_as + " has invalid modification date,")
warning("sitemap plugin: using date value as lastmod.")
lastmod = format_date(lastdate)
if isinstance(page, contents.Article):
pri = self.priorities['articles']
chfreq = self.changefreqs['articles']
elif isinstance(page, contents.Page):
pri = self.priorities['pages']
chfreq = self.changefreqs['pages']
else:
pri = self.priorities['indexes']
chfreq = self.changefreqs['indexes']
pageurl = '' if page.url == 'index.html' else page.url
#Exclude URLs from the sitemap:
if self.format == 'xml':
flag = False
for regstr in self.sitemapExclude:
if re.match(regstr, pageurl):
flag = True
break
if not flag:
fd.write(XML_URL.format(self.siteurl, pageurl, lastmod, chfreq, pri))
else:
fd.write(self.siteurl + '/' + pageurl + '\n')
def get_date_modified(self, page, default):
if hasattr(page, 'modified'):
if isinstance(page.modified, datetime):
return page.modified
return get_date(page.modified)
else:
return default
def set_url_wrappers_modification_date(self, wrappers):
for (wrapper, articles) in wrappers:
lastmod = datetime.min.replace(tzinfo=self.timezone)
for article in articles:
lastmod = max(lastmod, article.date.replace(tzinfo=self.timezone))
try:
modified = self.get_date_modified(article, datetime.min).replace(tzinfo=self.timezone)
lastmod = max(lastmod, modified)
except ValueError:
# Supressed: user will be notified.
pass
setattr(wrapper, 'modified', str(lastmod))
def generate_output(self, writer):
path = os.path.join(self.output_path, 'sitemap.{0}'.format(self.format))
pages = self.context['pages'] + self.context['articles'] \
+ [ c for (c, a) in self.context['categories']] \
+ [ t for (t, a) in self.context['tags']] \
+ [ a for (a, b) in self.context['authors']]
self.set_url_wrappers_modification_date(self.context['categories'])
self.set_url_wrappers_modification_date(self.context['tags'])
self.set_url_wrappers_modification_date(self.context['authors'])
for article in self.context['articles']:
pages += article.translations
info('writing {0}'.format(path))
with open(path, 'w', encoding='utf-8') as fd:
if self.format == 'xml':
fd.write(XML_HEADER)
else:
fd.write(TXT_HEADER.format(self.siteurl))
FakePage = collections.namedtuple('FakePage',
['status',
'date',
'url',
'save_as'])
for standard_page_url in ['index.html',
'archives.html',
'tags.html',
'categories.html']:
fake = FakePage(status='published',
date=self.now,
url=standard_page_url,
save_as=standard_page_url)
self.write_url(fake, fd)
# add template pages
# We use items for Py3k compat. .iteritems() otherwise
for path, template_page_url in self.context['TEMPLATE_PAGES'].items():
# don't add duplicate entry for index page
if template_page_url == 'index.html':
continue
fake = FakePage(status='published',
date=self.now,
url=template_page_url,
save_as=template_page_url)
self.write_url(fake, fd)
for page in pages:
self.write_url(page, fd)
if self.format == 'xml':
fd.write(XML_FOOTER)
def get_generators(generators):
return SitemapGenerator
def register():
signals.get_generators.connect(get_generators)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment