Commit 77d15d22 authored by Sam Ruby's avatar Sam Ruby

xml_base overrides

parent 631dd44f
...@@ -95,6 +95,13 @@ attributes on these elements.</li> ...@@ -95,6 +95,13 @@ attributes on these elements.</li>
<li>Anything else (i.e., the default) will leave the date as is, causing the entries that contain these dates to sort to the top of the planet until the time passes.</li> <li>Anything else (i.e., the default) will leave the date as is, causing the entries that contain these dates to sort to the top of the planet until the time passes.</li>
</ul> </ul>
</li> </li>
<li><code>xml_base</code> will adjust the <code>xml:base</code> values in effect for each of the text constructs in the feed (things like <code>title</code>, <code>summary</code>, and <code>content</code>). Other elements in the feed (most notably, <code>link</code>) are not affected by this value.
<ul style="margin:0">
<li><code>feed_alternate</code> will replace the <code>xml:base</code> in effect with the value of the <code>alternate</code> <code>link</code> found either in the enclosed <code>source</code> or enclosing <code>feed</code> element.</li>
<li><code>entry_alternate</code> will replace the <code>xml:base</code> in effect with the value of the <code>alternate</code> <code>link</code> found in this entry.</li>
<li>Any other value will be treated as a <a href="http://www.ietf.org/rfc/rfc3986.txt">URI reference</a>. These values may be relative or absolute. If relative, the <code>xml:base</code> values in each text construct will each be adjusted separately using the specified value.</li>
</ul>
</li>
</ul> </ul>
</body> </body>
</html> </html>
...@@ -30,5 +30,7 @@ def getLogger(level, format): ...@@ -30,5 +30,7 @@ def getLogger(level, format):
return logger return logger
# Configure feed parser: disable feedparser's automatic HTML
# sanitization and relative-URI resolution at parse time.  Both are
# applied explicitly by the scrub stage instead (see planet/scrub.py),
# after any configured xml_base override has been applied, so running
# them here would resolve against the wrong base.
from planet import feedparser
feedparser.SANITIZE_HTML=0
feedparser.RESOLVE_RELATIVE_URIS=0
...@@ -125,6 +125,7 @@ def __init__(): ...@@ -125,6 +125,7 @@ def __init__():
define_tmpl('summary_type', '') define_tmpl('summary_type', '')
define_tmpl('content_type', '') define_tmpl('content_type', '')
define_tmpl('future_dates', 'keep') define_tmpl('future_dates', 'keep')
define_tmpl('xml_base', '')
def load(config_file): def load(config_file):
""" initialize and load a configuration""" """ initialize and load a configuration"""
......
...@@ -11,7 +11,7 @@ Recommended: Python 2.3 or later ...@@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/> Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
""" """
__version__ = "4.2-pre-" + "$Revision: 1.147 $"[11:16] + "-cvs" __version__ = "4.2-pre-" + "$Revision: 1.149 $"[11:16] + "-cvs"
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
Redistribution and use in source and binary forms, with or without modification, Redistribution and use in source and binary forms, with or without modification,
...@@ -65,6 +65,14 @@ TIDY_MARKUP = 0 ...@@ -65,6 +65,14 @@ TIDY_MARKUP = 0
# if TIDY_MARKUP = 1 # if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
# If you want feedparser to automatically resolve all relative URIs, set this
# to 1.  (Both flags default to on; callers may set them to 0 to defer
# resolution/sanitization to a later processing stage.)
RESOLVE_RELATIVE_URIS = 1
# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
SANITIZE_HTML = 1
# ---------- required modules (should come with any Python distribution) ---------- # ---------- required modules (should come with any Python distribution) ----------
import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2 import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
try: try:
...@@ -732,7 +740,7 @@ class _FeedParserMixin: ...@@ -732,7 +740,7 @@ class _FeedParserMixin:
is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
# resolve relative URIs within embedded markup # resolve relative URIs within embedded markup
if is_htmlish: if is_htmlish and RESOLVE_RELATIVE_URIS:
if element in self.can_contain_relative_uris: if element in self.can_contain_relative_uris:
output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html')) output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
...@@ -753,7 +761,7 @@ class _FeedParserMixin: ...@@ -753,7 +761,7 @@ class _FeedParserMixin:
self._getContext()['vcard'] = vcard self._getContext()['vcard'] = vcard
# sanitize embedded markup # sanitize embedded markup
if is_htmlish: if is_htmlish and SANITIZE_HTML:
if element in self.can_contain_dangerous_markup: if element in self.can_contain_dangerous_markup:
output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html')) output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))
......
...@@ -6,6 +6,7 @@ Process a set of configuration defined sanitations on a given feed. ...@@ -6,6 +6,7 @@ Process a set of configuration defined sanitations on a given feed.
import time import time
# Planet modules # Planet modules
import planet, config, shell import planet, config, shell
from planet import feedparser
type_map = {'text': 'text/plain', 'html': 'text/html', type_map = {'text': 'text/plain', 'html': 'text/html',
'xhtml': 'application/xhtml+xml'} 'xhtml': 'application/xhtml+xml'}
...@@ -92,3 +93,40 @@ def scrub(feed_uri, data): ...@@ -92,3 +93,40 @@ def scrub(feed_uri, data):
or entry['published_parsed'] <= now) and or entry['published_parsed'] <= now) and
(not entry.has_key('updated_parsed') or not entry['updated_parsed'] (not entry.has_key('updated_parsed') or not entry['updated_parsed']
or entry['updated_parsed'] <= now)] or entry['updated_parsed'] <= now)]
# Apply any configured xml_base override, then resolve relative URIs
# and sanitize the HTML of every text construct in the feed (both the
# entries and the feed element itself).
scrub_xmlbase = config.xml_base(feed_uri)
# resolve relative URIs and sanitize
for entry in data.entries + [data.feed]:
for key in entry.keys():
# Only text constructs carry base/type/value triples: 'content'
# (NOTE(review): only the first content element is examined —
# confirm multi-content entries are not expected here) and the
# '*_detail' variants (title_detail, summary_detail, ...).
if key == 'content':
node = entry.content[0]
elif key.endswith('_detail'):
node = entry[key]
else:
continue
# Skip constructs that are not HTML-ish or have no value/base info.
if not node.has_key('type'): continue
if not 'html' in node['type']: continue
if not node.has_key('value'): continue
if node.has_key('base'):
if scrub_xmlbase:
# 'feed_alternate': replace the base with the alternate
# link of the enclosed <source>, falling back to the
# enclosing feed's link.
if scrub_xmlbase == 'feed_alternate':
if entry.has_key('source') and \
entry.source.has_key('link'):
node['base'] = entry.source.link
elif data.feed.has_key('link'):
node['base'] = data.feed.link
# 'entry_alternate': replace the base with this entry's
# alternate link, when present.
elif scrub_xmlbase == 'entry_alternate':
if entry.has_key('link'):
node['base'] = entry.link
# Any other value is treated as a URI reference and joined
# against the node's current base (relative values adjust
# each construct's base in place; absolute values replace it).
else:
node['base'] = feedparser._urljoin(
node['base'], scrub_xmlbase)
# Resolution and sanitization are disabled in feedparser at parse
# time, so both passes run here against the (possibly overridden)
# base for each construct.
node['value'] = feedparser._resolveRelativeURIs(
node.value, node.base, 'utf-8', node.type)
node['value'] = feedparser._sanitizeHTML(
node.value, 'utf-8', node.type)
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
import unittest, os, sys, glob, new, re, StringIO, time import unittest, os, sys, glob, new, re, StringIO, time
from planet import feedparser from planet import feedparser
from planet.reconstitute import reconstitute from planet.reconstitute import reconstitute
from planet.scrub import scrub
testfiles = 'tests/data/reconstitute/%s.xml' testfiles = 'tests/data/reconstitute/%s.xml'
...@@ -23,6 +24,7 @@ class ReconstituteTest(unittest.TestCase): ...@@ -23,6 +24,7 @@ class ReconstituteTest(unittest.TestCase):
# parse and reconstitute to a string # parse and reconstitute to a string
work = StringIO.StringIO() work = StringIO.StringIO()
results = feedparser.parse(data) results = feedparser.parse(data)
scrub(testfiles%name, results)
reconstitute(results, results.entries[0]).writexml(work) reconstitute(results, results.entries[0]).writexml(work)
# verify the results # verify the results
......
...@@ -6,7 +6,7 @@ from planet.scrub import scrub ...@@ -6,7 +6,7 @@ from planet.scrub import scrub
from planet import feedparser, config from planet import feedparser, config
feed = ''' feed = '''
<feed xmlns='http://www.w3.org/2005/Atom'> <feed xmlns='http://www.w3.org/2005/Atom' xml:base="http://example.com/">
<author><name>F&amp;ouml;o</name></author> <author><name>F&amp;ouml;o</name></author>
<entry xml:lang="en"> <entry xml:lang="en">
<id>ignoreme</id> <id>ignoreme</id>
...@@ -15,7 +15,9 @@ feed = ''' ...@@ -15,7 +15,9 @@ feed = '''
<title>F&amp;ouml;o</title> <title>F&amp;ouml;o</title>
<summary>F&amp;ouml;o</summary> <summary>F&amp;ouml;o</summary>
<content>F&amp;ouml;o</content> <content>F&amp;ouml;o</content>
<link href="http://example.com/entry/1/"/>
<source> <source>
<link href="http://example.com/feed/"/>
<author><name>F&amp;ouml;o</name></author> <author><name>F&amp;ouml;o</name></author>
</source> </source>
</entry> </entry>
...@@ -82,3 +84,33 @@ class ScrubTest(unittest.TestCase): ...@@ -82,3 +84,33 @@ class ScrubTest(unittest.TestCase):
data = deepcopy(base) data = deepcopy(base)
scrub('testfeed', data) scrub('testfeed', data)
self.assertEqual(0, len(data.entries)) self.assertEqual(0, len(data.entries))
def test_scrub_xmlbase(self):
"""Each xml_base override mode rewrites the text-construct bases."""
# Unscrubbed: the base comes from the feed's own xml:base attribute.
base = feedparser.parse(feed)
self.assertEqual('http://example.com/',
base.entries[0].title_detail.base)
config.parser.readfp(StringIO.StringIO(configData))
# 'feed_alternate': base becomes the alternate link of the entry's
# enclosed <source> element (see the test feed above).
config.parser.set('testfeed', 'xml_base', 'feed_alternate')
data = deepcopy(base)
scrub('testfeed', data)
self.assertEqual('http://example.com/feed/',
data.entries[0].title_detail.base)
# 'entry_alternate': base becomes this entry's own alternate link.
config.parser.set('testfeed', 'xml_base', 'entry_alternate')
data = deepcopy(base)
scrub('testfeed', data)
self.assertEqual('http://example.com/entry/1/',
data.entries[0].title_detail.base)
# A relative URI reference is joined against the existing base.
config.parser.set('testfeed', 'xml_base', 'base/')
data = deepcopy(base)
scrub('testfeed', data)
self.assertEqual('http://example.com/base/',
data.entries[0].title_detail.base)
# An absolute URI reference replaces the existing base outright.
config.parser.set('testfeed', 'xml_base', 'http://example.org/data/')
data = deepcopy(base)
scrub('testfeed', data)
self.assertEqual('http://example.org/data/',
data.entries[0].title_detail.base)
...@@ -35,7 +35,7 @@ ...@@ -35,7 +35,7 @@
<th>Name</th> <th>Name</th>
<th>Format</th> <th>Format</th>
<xsl:if test="//planet:ignore_in_feed | //planet:filters | <xsl:if test="//planet:ignore_in_feed | //planet:filters |
//planet:*[contains(local-name(),'_type')]"> //planet:xml_base | //planet:*[contains(local-name(),'_type')]">
<th>Notes</th> <th>Notes</th>
</xsl:if> </xsl:if>
</tr> </tr>
...@@ -128,12 +128,12 @@ ...@@ -128,12 +128,12 @@
</a> </a>
</td> </td>
<td><xsl:value-of select="planet:format"/></td> <td><xsl:value-of select="planet:format"/></td>
<xsl:if test="planet:ignore_in_feed | planet:filters | <xsl:if test="planet:ignore_in_feed | planet:filters | planet:xml_base |
planet:*[contains(local-name(),'_type')]"> planet:*[contains(local-name(),'_type')]">
<td> <td>
<dl> <dl>
<xsl:for-each select="planet:ignore_in_feed | planet:filters | <xsl:for-each select="planet:ignore_in_feed | planet:filters |
planet:*[contains(local-name(),'_type')]"> planet:xml_base | planet:*[contains(local-name(),'_type')]">
<xsl:sort select="local-name()"/> <xsl:sort select="local-name()"/>
<dt><xsl:value-of select="local-name()"/></dt> <dt><xsl:value-of select="local-name()"/></dt>
<dd><xsl:value-of select="."/></dd> <dd><xsl:value-of select="."/></dd>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment