Commit aad57f79 authored by Paolo Greppi's avatar Paolo Greppi

Add script to rewrite all external urls found in content and content_original...

Add script to rewrite all external urls found in content and content_original that exist as articles in the aggregator to internal links
parent 85b0d970
#!/usr/bin/env python3
# coding=utf-8
# Rewrites all external urls found in content and content_original that exist as articles in the aggregator to internal links
#
# sample invocation
# ./rewire.py 1143355
#
# This file is part of calo.news: A news platform
# Copyright (C) 2017-2019 Paolo Greppi
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
""" change the language of an article """
import time
import argparse
from bs4 import BeautifulSoup
import database as db
# strftime pattern for the UTC timestamps that prefix the log lines printed by this script
time_format = "%Y-%m-%dT%H:%M:%SZ"
def lookup(url):
    """Return the id of the article stored under *url*, or None if there is none.

    Each call obtains its own connection and cursor through ``db.connect()``
    and runs a single SELECT against the ``articles`` table.
    """
    query = "SELECT id FROM articles WHERE url = %(url)s"
    with db.connect() as conn, conn.cursor() as cur:
        cur.execute(query, {'url': url})
        row = cur.fetchone()
    # fetchone() yields nothing when no article matches the url.
    if not row:
        return None
    return row[0]
def rewire(txt):
    """Point links to known articles at their internal ``/article/<id>`` url.

    Parses *txt* as HTML and passes the ``href`` of every ``<a>`` element to
    ``lookup``. When an article is stored under that url, the ``href`` is
    replaced with the internal link and one log line is printed.

    Returns the parsed document re-serialized with ``prettify``, whether or
    not any link was changed.
    """
    soup = BeautifulSoup(txt, 'lxml')
    # The same url may appear several times in one article and every lookup
    # is a database round-trip, so remember the answer per distinct url.
    known = {}
    for a in soup.find_all('a', href=True):
        url = a['href']
        if url not in known:
            known[url] = lookup(url)
        article_id = known[url]
        # Compare against None rather than relying on truthiness, so a valid
        # id is never skipped merely because it is falsy (0).
        if article_id is None:
            continue
        new_url = '/article/%d' % article_id
        print('%s -- rewiring link %s to %s' % (time.strftime(time_format, time.gmtime()), url, new_url))
        a['href'] = new_url
    # NOTE(review): the 'lxml' parser plus prettify() may add wrapper markup
    # and whitespace to fragments - confirm this is wanted for stored content.
    return soup.prettify(formatter="html5")
if __name__ == '__main__':
    # Command line entry point: rewire the links of the single article whose
    # id is given as the positional argument.
    parser = argparse.ArgumentParser(description='Rewrite urls as internal links if possible')
    parser.add_argument('id', type=int, help='article id')
    args = parser.parse_args()
    print('%s - rewiring article %d' % (time.strftime(time_format, time.gmtime()), args.id))
    data = {'id': args.id}
    with db.connect() as conn, conn.cursor() as cur:
        cur.execute("""
            SELECT
                content,
                content_original
            FROM
                articles
            WHERE
                id = %(id)s""", data)
        res = cur.fetchone()
        # Without this guard an unknown id crashes below with an opaque
        # "'NoneType' object is not subscriptable" TypeError.
        if res is None:
            raise SystemExit('article %d not found' % args.id)
        content, content_original = res
        # Each column is rewritten and stored independently; empty or NULL
        # columns are left untouched.
        if content:
            data['content'] = rewire(content)
            cur.execute("UPDATE articles SET content=%(content)s WHERE id=%(id)s", data)
        if content_original:
            data['content_original'] = rewire(content_original)
            cur.execute("UPDATE articles SET content_original=%(content_original)s WHERE id=%(id)s", data)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment