Commit 0116e95a authored by Malcolm Blaney

Remove duplicate images from content in Reader->SaveItem. Added
support for Media RSS and multiple enclosures in RSS feeds.
parent 8715b049
Pipeline #36357138 passed with stage
in 1 minute and 11 seconds
......@@ -475,6 +475,7 @@ class Reader extends Base {
'"",".reader-discovered-feed-url","color","#aaaaaa"',
'"",".reader-discovered-feed-url","font-size","0.8em"',
'"",".reader-reposted-by","font-size","0.8em"',
'"",".reader-reposted-by","margin-left","5px"',
'"",".h-card .ui-icon.ui-icon-person","display",' .
'"inline-block"'];
$this->AddSiteStyle($site_style);
......@@ -1229,10 +1230,6 @@ class Reader extends Base {
list($repost_name, $repost_url, $repost_photo) =
$this->LookupNickname($repost_author, $permalink);
$author .= '<span class="reader-reposted-by">reposted by ';
if ($repost_photo !== '') {
$author .= '<a href="' . $repost_url . '">' .
'<img class="thumb" src="' . $repost_photo . '"></a> ';
}
$author .= '<a href="' . $repost_url . '">' . $repost_name .'</a></span>';
}
......@@ -1600,6 +1597,39 @@ class Reader extends Base {
return $registered;
}
// Strip repeated images, and anchors that link to an embedded video, from an
// html fragment.
//
// $content is the html to clean up. It is parsed with DOMDocument only to
// collect the src values of img and video elements; the removals themselves
// are done on the original string so the rest of the markup is left exactly
// as it was given.
//
// Returns $content with:
//  - one <img> tag removed for every repeat of an image src (the earliest
//    matching tags are removed, so the last copy is the one that stays).
//  - for every video src, the first anchor whose opening tag contains that
//    src removed, up to its own closing </a>.
// Empty or non-string content is returned unchanged.
private function RemoveDuplicateImages($content) {
  // DOMDocument::loadHTML rejects an empty source (on PHP 8 this is a
  // ValueError, which the @ operator does not silence), and there is nothing
  // to remove from it anyway.
  if (!is_string($content) || $content === '') return $content;

  $all_video_list = [];
  $all_image_list = [];
  $duplicate_image_list = [];
  $doc = new DOMDocument();
  // Parser warnings (html5 tags, unescaped ampersands) are not of interest.
  @$doc->loadHTML($content);
  $xpath = new DOMXpath($doc);
  foreach ($xpath->query('//img[@src]') as $img) {
    $src = $img->getAttribute('src');
    // An empty src would produce a pattern that matches any image tag.
    if ($src === '') continue;

    // Strict comparison is required here, otherwise numeric looking values
    // such as "1e3" and "1000" are treated as the same image.
    if (in_array($src, $all_image_list, true)) {
      $duplicate_image_list[] = $src;
    }
    else {
      $all_image_list[] = $src;
    }
  }
  foreach ($xpath->query('//video[@src]') as $video) {
    $src = $video->getAttribute('src');
    // As above, an empty src would match the first anchor in the content.
    if ($src !== '') {
      $all_video_list[] = $src;
    }
  }

  // getAttribute returns the entity decoded value, but the raw content may
  // contain the encoded form (ie &amp; in a query string), so accept both.
  $src_pattern = function ($src) {
    $decoded = preg_quote($src, '/');
    $encoded = preg_quote(htmlspecialchars($src), '/');
    if ($decoded === $encoded) return $decoded;
    return '(?:' . $decoded . '|' . $encoded . ')';
  };

  // Remove duplicate images from content.
  foreach ($duplicate_image_list as $remove) {
    $regex = '/<img[^>]+' . $src_pattern($remove) . '[^>]+>/';
    $result = preg_replace($regex, '', $content, 1);
    // preg_replace returns null on error, keep the content in that case.
    if ($result !== null) {
      $content = $result;
    }
  }
  // Remove anchors from content that match videos. These contain an image
  // from the video, which is considered to be a duplicate image.
  foreach ($all_video_list as $remove) {
    // \s stops the pattern matching other tags that start with 'a' (audio,
    // article...). The lazy quantifier stops at the anchor's own closing tag
    // rather than the last one in the content, and the s modifier lets the
    // anchor span more than one line.
    $regex = '/<a\s[^>]*' . $src_pattern($remove) . '.+?<\/a>/s';
    $result = preg_replace($regex, '', $content, 1);
    if ($result !== null) {
      $content = $result;
    }
  }
  return $content;
}
private function RemoveFeed($us_xml_url, $force = false) {
$error_count = 0;
$mysqli = connect_db();
......@@ -1915,7 +1945,7 @@ class Reader extends Base {
if (strpos($xml_url, 'https://twitter-atom.appspot.com') !== 0) {
$us_title = strip_tags($item->get_title());
}
$us_content = $item->get_description();
$us_content = $this->RemoveDuplicateImages($item->get_description());
// If title is the same as the start of the stripped version of content,
// set it to the empty string. Also need to check for newlines and extra
// spaces created because of the removed tags possibly having their own
......
<?php
// Dobrado Content Management System
// Copyright (C) 2017 Malcolm Blaney
// Copyright (C) 2018 Malcolm Blaney
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
......@@ -18,8 +18,9 @@
function page_modified($username, $page, $action) {
$timestamp = 0;
$mysqli = connect_db();
$query = 'SELECT timestamp FROM page_updates WHERE user = "'.$username.'" '.
'AND page = "'.$page.'" AND action = "'.$action.'"';
$query = 'SELECT timestamp FROM page_updates WHERE ' .
'user = "' . $username . '" AND page = "' . $page . '" AND ' .
'action = "' . $action . '"';
if ($result = $mysqli->query($query)) {
if ($page_updates = $result->fetch_assoc()) {
$timestamp = $page_updates['timestamp'];
......@@ -27,11 +28,11 @@ function page_modified($username, $page, $action) {
$result->close();
}
else {
log_db('page_modified: '.$mysqli->error);
log_db('page_modified: ' . $mysqli->error);
}
$mysqli->close();
header('Last-Modified: '.gmdate('D, d M Y H:i:s T', $timestamp));
header('Last-Modified: ' . gmdate('D, d M Y H:i:s T', $timestamp));
// Always report that the page has been modified if the header isn't set.
if (!isset($_SERVER['HTTP_IF_MODIFIED_SINCE'])) return true;
return $timestamp > strtotime($_SERVER['HTTP_IF_MODIFIED_SINCE']);
......@@ -57,34 +58,36 @@ function rss($username) {
$fancy_url = $user->config->FancyUrl();
$title = $user->config->TitleIncludesPage() ?
$user->config->Title().' - '.$page : $user->config->Title();
$user->config->Title() . ' - ' . $page : $user->config->Title();
$server = $user->config->ServerName();
$scheme = $user->config->Secure() ? 'https://' : 'http://';
$port = $user->config->Secure() ? '443' : '80';
$link = $scheme.$server;
$link = $scheme . $server;
if ($user->name !== 'admin') {
$link .= '/'.$user->name;
$link .= '/' . $user->name;
}
if ($page !== 'index') {
$link .= $fancy_url ? '/'.$page : '/index.php?page='.$page;
$link .= $fancy_url ? '/' . $page : '/index.php?page=' . $page;
}
echo '<?xml version="1.0" ?>'."\n".
'<rss version="2.0">'."\n".
"<channel>\n".
'<title>'.htmlspecialchars($title)."</title>\n".
'<link>'.$link."</link>\n".
"<description>Syndicated by Dobrado.</description>\n".
'<cloud domain="'.$server.'" port="'.$port.'" path="/php/cloud.php" '.
'registerProcedure="" protocol="http-post" />'."\n";
echo '<?xml version="1.0" ?>' . "\n" .
'<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">' . "\n" .
"<channel>\n" .
'<title>' . htmlspecialchars($title) . "</title>\n" .
'<link>' . $link . "</link>\n" .
"<description>Syndicated by Dobrado.</description>\n" .
'<cloud domain="' . $server . '" port="' . $port . '" ' .
'path="/php/cloud.php" registerProcedure="" protocol="http-post" />' .
"\n";
// The notify table specifies which modules have placed themselves in a feed.
// The feed list is intentionally left short, so that aggregators subscribing
// to updates have less to parse (it is up to them to store older posts).
// 24 hours is used to allow previously syndicated items to be updated.
$query = 'SELECT box_id, label FROM notify WHERE user = "'.$user->name.'" '.
'AND page = "'.$page.'" AND action = "'.$action.'" AND public = 1 AND '.
'timestamp > '.strtotime('-24 hours').' ORDER BY timestamp DESC';
$query = 'SELECT box_id, label FROM notify WHERE ' .
'user = "' . $user->name . '" AND page = "' . $page . '" AND ' .
'action = "' . $action . '" AND public = 1 AND ' .
'timestamp > ' . strtotime('-24 hours') . ' ORDER BY timestamp DESC';
$feed = [];
if ($result = $mysqli->query($query)) {
while ($notify = $result->fetch_assoc()) {
......@@ -93,14 +96,14 @@ function rss($username) {
$result->close();
}
else {
log_db('rss 1: '.$mysqli->error);
log_db('rss 1: ' . $mysqli->error);
}
for ($i = 0; $i < count($feed); $i++) {
// Modules that want to produce feeds need to match this table structure,
// and produce one feed item per box_id.
$query = 'SELECT title, description, author, category, enclosure, '.
'permalink, timestamp FROM '.$feed[$i]['label'].' WHERE '.
'user = "'.$user->name.'" AND box_id = '.$feed[$i]['box_id'];
$query = 'SELECT title, description, author, category, enclosure, ' .
'permalink, timestamp FROM ' . $feed[$i]['label'] . ' WHERE ' .
'user = "' . $user->name . '" AND box_id = ' . $feed[$i]['box_id'];
if ($result = $mysqli->query($query)) {
if ($item = $result->fetch_assoc()) {
$title = $item['title'];
......@@ -111,37 +114,46 @@ function rss($username) {
echo "<item>\n";
if ($title !== '') {
echo '<title>'.htmlspecialchars($title)."</title>\n";
echo '<title>' . htmlspecialchars($title) . "</title>\n";
}
$url = $item['permalink'];
if ($url !== '') {
if (!$fancy_url) {
$url = 'index.php?page='.$url;
$url = 'index.php?page=' . $url;
}
$permalink = $scheme.$server.'/';
$permalink .= $user->name === 'admin' ? $url : $user->name.'/'.$url;
echo '<link>'.$permalink."</link>\n";
$permalink = $scheme . $server . '/';
$permalink .= $user->name === 'admin' ? $url : $user->name . '/'.$url;
echo '<link>' . $permalink . "</link>\n";
}
if ($description !== '') {
echo '<description>'.htmlspecialchars($description).
echo '<description>' . htmlspecialchars($description) .
"</description>\n";
}
if ($item['author'] !== '') {
echo '<author>'.$item['author']."</author>\n";
echo '<author>' . $item['author'] . "</author>\n";
}
if ($item['category'] !== '') {
echo '<category>'.htmlspecialchars($item['category'])."</category>\n";
if (strpos($item['category'], ',') !== false) {
foreach (explode(',', $item['category']) as $category) {
echo '<category>' . htmlspecialchars($category) . "</category>\n";
}
}
else if ($item['category'] !== '') {
echo '<category>' . htmlspecialchars($item['category']) .
"</category>\n";
}
if ($item['enclosure'] !== '') {
echo '<enclosure>'.$item['enclosure']."</enclosure>\n";
$enclosure_list = json_decode($item['enclosure'], true);
if (is_array($enclosure_list)) {
foreach ($enclosure_list as $enclosure) {
echo '<media:content url="' . $enclosure . '"/>' . "\n";
}
}
echo '<pubDate>'.gmdate('F j Y g:ia T', $item['timestamp']).
echo '<pubDate>' . gmdate('F j Y g:ia T', $item['timestamp']) .
"</pubDate>\n</item>\n";
}
$result->close();
}
else {
log_db('rss 2: '.$mysqli->error);
log_db('rss 2: ' . $mysqli->error);
}
}
$mysqli->close();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment