Commit 65e50513 authored by Alex Castaño

Improve metadata scraping

parent 07cd23ac
defmodule MoodleNet.MetadataScraper do
@doc """
Scrapes metadata for the page at `url`.

The page is fetched with `Furlex.unfurl/2` (following redirects). On
`{:ok, data}` the result is `{:ok, metadata}`, where `metadata` is the map
built by `format_data/2`. Any other value returned by `Furlex.unfurl/2` is
passed through unchanged.
"""
def fetch(url) when is_binary(url) do
  with {:ok, data} <- Furlex.unfurl(url, follow_redirect: true) do
    # The original URL is forwarded so relative image paths can be resolved.
    {:ok, format_data(data, url)}
  end
end
defp format_data(data) do
defp format_data(data, url) do
%{
title: title(data),
summary: summary(data),
image: image(data),
image: image(data, url),
embed_code: embed_code(data),
language: language(data),
author: author(data),
......@@ -20,20 +20,21 @@ defmodule MoodleNet.MetadataScraper do
# Picks the title from the scraped data. Sources are tried in order
# (:facebook, :twitter, :oembed, :html) and the first truthy value wins.
# `only_first/1` reduces a list of candidates to its first element.
defp title(data) do
  (get(data, :facebook, "title") || get(data, :twitter, "title") || get(data, :oembed, "title") ||
     get(data, :html, "title"))
  |> only_first()
end
# Picks the description from the scraped data. Sources are tried in order
# (:facebook, :twitter, :html) and the first truthy value wins.
# `only_first/1` reduces a list of candidates to its first element.
defp summary(data) do
  (get(data, :facebook, "description") || get(data, :twitter, "description") ||
     get(data, :html, "description"))
  |> only_first()
end
# Picks the image URL from the scraped data. Sources are tried in order
# (:facebook "image", :twitter "image", :other "thumbnail_url") and the first
# truthy value wins. A list of candidates is reduced to its first element,
# and the result is then resolved against `original_url` by
# `fix_relative_url/2`.
defp image(data, original_url) do
  (get(data, :facebook, "image") || get(data, :twitter, "image") ||
     get(data, :other, "thumbnail_url"))
  |> only_first()
  |> fix_relative_url(original_url)
end
defp embed_code(data) do
......@@ -74,9 +75,21 @@ defmodule MoodleNet.MetadataScraper do
# Looks up `key` in the :oembed section; only matches when the scraped data
# has an :oembed entry.
defp get(%{oembed: oembed_section}, :oembed, key) do
  Map.get(oembed_section, key)
end

# Looks up `key` in the :html section of the scraped data.
defp get(scraped, :html, key) do
  Map.get(scraped.html, key)
end

# Looks up `key` in the :other section of the scraped data.
defp get(scraped, :other, key) do
  Map.get(scraped.other, key)
end
# Returns the first element of a non-empty list; any other value
# (including the empty list and nil) is returned untouched.
defp only_first(value) do
  case value do
    [first | _rest] -> first
    other -> other
  end
end
# Turns a scraped image reference into an absolute URL.
#
#   * `url`          - the value scraped from the page; may be nil, "" or,
#                      when nothing usable was found, a non-string value.
#   * `original_url` - the absolute URL of the scraped page, used as the base
#                      for resolution.
#
# Returns the absolute URL as a string, or nil when there is no usable URL.
defp fix_relative_url(url, original_url) when is_binary(url) and url != "" do
  case URI.parse(url) do
    # No scheme means the reference is relative ("/img/a.png", "img/a.png")
    # or protocol-relative ("//cdn.host/a.png"). Both must be resolved against
    # the page URL; matching on the host alone would leave protocol-relative
    # references without a scheme.
    %URI{scheme: nil} -> original_url |> URI.merge(url) |> to_string()
    _ -> url
  end
end

# nil, "" and any non-string value (e.g. an empty list of candidates) carry
# no usable URL.
defp fix_relative_url(_url, _original_url), do: nil
end
......@@ -29,7 +29,7 @@
"faker": {:hex, :faker, "0.11.1", "0dcf151bef21cb27e289ae7418fd15c6a278dad676d5996b75d1d309b155c205", [:mix], [], "hexpm"},
"file_system": {:hex, :file_system, "0.2.6", "fd4dc3af89b9ab1dc8ccbcc214a0e60c41f34be251d9307920748a14bf41f1d3", [:mix], [], "hexpm"},
"floki": {:hex, :floki, "0.20.4", "be42ac911fece24b4c72f3b5846774b6e61b83fe685c2fc9d62093277fb3bc86", [:mix], [{:html_entities, "~> 0.4.0", [hex: :html_entities, repo: "hexpm", optional: false]}, {:mochiweb, "~> 2.15", [hex: :mochiweb, repo: "hexpm", optional: false]}], "hexpm"},
"furlex": {:git, "https://github.com/alexcastano/furlex", "3b77bb7b19f3cee5b2e03d37a997dcec1225d47a", []},
"furlex": {:git, "https://github.com/alexcastano/furlex", "30ddad592f0284ad99ba3ed294c08c15b488f923", []},
"gettext": {:hex, :gettext, "0.16.1", "e2130b25eebcbe02bb343b119a07ae2c7e28bd4b146c4a154da2ffb2b3507af2", [:mix], [], "hexpm"},
"hackney": {:hex, :hackney, "1.14.3", "b5f6f5dcc4f1fba340762738759209e21914516df6be440d85772542d4a5e412", [:rebar3], [{:certifi, "2.4.2", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.4", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
"html_entities": {:hex, :html_entities, "0.4.0", "f2fee876858cf6aaa9db608820a3209e45a087c5177332799592142b50e89a6b", [:mix], [], "hexpm"},
......
......@@ -17,4 +17,18 @@ defmodule MoodleNet.MetadataScraperTest do
assert data.summary
assert data.title == "¿Por qué la música de Harry Potter suena tan MÁGICA?"
end
# Hits the live site, hence the :external tag.
@tag :external
test "get title" do
  assert {:ok, metadata} = Subject.fetch("https://en.wikibooks.org/wiki/Spanish")
  assert metadata.title == "Spanish - Wikibooks, open books for an open world"
end
# Hits the live site, hence the :external tag. The expected image is an
# absolute URL on the same host as the fetched page.
@tag :external
test "fix relative image urls" do
  assert {:ok, metadata} = Subject.fetch("https://graphql.org/learn/schema/#interfaces")
  assert metadata.image == "https://graphql.org/img/og_image.png"
end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment