Improved article image detection
The current approach to detecting article images is very simple, but it cannot detect images in `enclosure` or `media:content` tags whose file name does not contain `.jpg` or `.png`.
I'd like to suggest the following code, which I have been using for a few days and which significantly improves the image detection. Additionally, I haven't had any grey article images with it. It took about 2 weeks to fine-tune this, and I hope you like it.
"Ignore case" is set for all regular expressions; further comments are inline:
// Best-effort detection of an article cover image.
// - Images disabled for the feed: the cover is cleared.
// - Images enabled and a cover already set: nothing is changed.
// - Images enabled and no cover yet: try, in order,
//     1. an image-carrying <enclosure>, <media:content> or <media:thumbnail> element,
//     2. the first <img> tag inside 'content:encoded' (or the whole item),
//     3. any .jpg/.jpeg/.png URL anywhere in the item.
// Relies on `feed`, `art`, `item`, `serializer` and `decode` from the enclosing scope.
if (!feed.noImages) {
  if (art.cover === undefined) {
    try {
      // All patterns are case-insensitive.
      const imageExtensionPattern = /\.(?:(?:jpe?g)|(?:png))/i;
      // An <img ...> tag, either as real markup or entity-encoded (&lt;img ... &gt;).
      // Both forms are accepted in one pattern because feeds may mix tags and entities,
      // which is also why the content is not entity-decoded before matching.
      const imgTagPattern = /(?:<|&lt;)img[\w\W]+?\/?(?:>|&gt;)/i;
      // Everything from 'src=' up to the end of the tag.
      const srcAttributePattern = /(src=[\w\W]+?)\/?(?:>|&gt;)/i;
      // The absolute URL inside the 'src' attribute, terminated by a quote, '<' or newline.
      const srcUrlPattern = /(https?:\/\/[^<>"']+?)[\n"'<]/i;
      // Any .jpg/.png URL; a ':' is only allowed before the first slash (e.g. a port).
      // A ':' later in the path would match links such as
      // https://commons.wikimedia.org/wiki/File:Flag_of_Ukraine.jpg, which are pages, not images.
      const looseImageUrlPattern = /(https?:\/\/[^<>"'\/]+\/+[^<>"':]+?\.(?:(?:jpe?g)|(?:png)).*?)[\n"'<]/i;

      // True when the element has a usable url and is declared as an image through
      // its 'type' or 'medium' attribute, or its url carries an image extension.
      const isImageElement = (element) => {
        const url = element.getAttribute("url");
        if (!url) {
          return false;
        }
        const type = element.getAttribute("type") || "";
        return type.includes("image") || element.getAttribute("medium") === "image" || imageExtensionPattern.test(url);
      };

      // Step 1: scan every candidate element in priority order. Looking only at the
      // first element would let e.g. a podcast's audio <enclosure> hide an image
      // offered in <media:content> or <media:thumbnail>.
      const mediaTagNames = ["enclosure", "media:content", "media:thumbnail"];
      for (const tagName of mediaTagNames) {
        const elements = item.getElementsByTagName(tagName);
        for (let i = 0; i < elements.length; i += 1) {
          if (isImageElement(elements[i])) {
            art.cover = elements[i].getAttribute("url");
            break;
          }
        }
        if (art.cover !== undefined) {
          break;
        }
      }

      // Step 2/3: fall back to scanning serialized markup. 'content:encoded' is
      // preferred over the whole item because some feeds carry advertisement images
      // outside of 'content:encoded' that would otherwise be picked up.
      if (art.cover === undefined) {
        const encodedContent = item.getElementsByTagName("content:encoded")[0];
        const markup = serializer.serializeToString(encodedContent || item);
        const imgTagMatch = markup.match(imgTagPattern);
        if (imgTagMatch) {
          // First <img> found: take the URL of its 'src' attribute, if it has one.
          const srcMatch = imgTagMatch[0].match(srcAttributePattern);
          const urlMatch = srcMatch && srcMatch[1].match(srcUrlPattern);
          if (urlMatch) {
            art.cover = urlMatch[1];
          }
        } else {
          // No <img> at all: accept any image-looking URL anywhere in the item.
          const looseMatch = serializer.serializeToString(item).match(looseImageUrlPattern);
          if (looseMatch) {
            art.cover = looseMatch[1];
          }
        }
      }

      // Decode the URL (it may come from entity-encoded markup) and upgrade the scheme
      // to https, since the app apparently cannot load content over plain http.
      // The replacement is anchored so that an 'http://' embedded later in the URL
      // (e.g. inside a query parameter) is left alone.
      if (art.cover !== undefined) {
        art.cover = decode(art.cover, { scope: "strict" })
          .trim()
          .replace(/^http:\/\//i, "https://");
      }
    } catch {
      // Deliberately best effort: a feed item that cannot be serialized or decoded
      // simply ends up without a detected cover.
    }
  }
} else {
  art.cover = undefined;
}