Commit 314168c5 authored by Ricardo J. Mendez's avatar Ricardo J. Mendez

New accumulate-root-times function

parent dd8d442f
(ns relevance.data
(:require [relevance.utils :refer [url-key is-http? host-key hostname]]
"Contains functions related to data tracking and accumulation.
It does not account for the actual ordering based on this data. You should
see `relevance.order` for that."
(:require [relevance.utils :refer [url-key is-http? host-key hostname root]]
[khroma.log :as console]))
......@@ -7,7 +10,14 @@
"Accumulates the total time for a site from a hashmap of URL times.
Returns a hashmap with the URL ID as the key, and the :time, :icon and
:host string on its value."
:host string on its value.
This function differentiates between different hostnames on the same root
domain. This means that docs.gitlab.com and gitlab.com are accumulated
separately. While we could lump them together into one at this point, keeping
them separate will allow us to apply the same weight to pages on the same
hostname, which will lead to more natural ordering."
[url-times]
(->>
(group-by #(hostname (:url %)) (vals url-times))
......@@ -19,6 +29,19 @@
(into {})))
(defn accumulate-root-times
  "Takes the hashmap produced by `accumulate-site-times` and rolls it up by
  root domain.

  Every value of `site-times` is read for its :host and :time. Values are
  grouped by `(root host)`, and the :time of each group is summed. Groups
  whose root is empty are left out of the result.

  Returns a hashmap of root domain string -> total time. This lets pages on
  the same root domain be prioritized together while the per-site ordering
  is still kept elsewhere."
  [site-times]
  (let [sites-by-root (group-by (comp root :host) (vals site-times))]
    (into {}
          (for [[root-name sites] sites-by-root
                ;; An empty root means there was no usable host; skip it.
                :when (seq root-name)]
            [root-name (reduce + (map :time sites))]))))
(defn clean-up-by-time
......
......@@ -35,11 +35,11 @@
is-penalized? (max url-time 1)
;; ... otherwise we just go with the raw URL time
:else url-time)
site-time (or (:time (get site-times (host-key (hostname url)))) 0)
total (+ tab-time site-time)
host-time (or (:time (get site-times (host-key (hostname url)))) 0)
total (+ tab-time host-time)
score (if is-penalized? (* total non-http-penalty) total)]
(or (when (pos? tab-time) score)
(- site-time idx))))
(- host-time idx))))
(defn score-tabs
......
......@@ -52,6 +52,14 @@
lower-case
trim)))
(defn root
  "Returns the root domain for a host.

  The host is lower-cased and reduced to its last two dot-separated labels,
  so \"www.google.com\" becomes \"google.com\". A host with a single label
  (e.g. \"localhost\") is returned as is. A nil or empty host yields \"\".

  An IPv4 literal is returned whole: keeping only its last two octets would
  make unrelated addresses such as 192.168.1.1 and 10.0.1.1 share the
  meaningless root \"1.1\".

  NOTE: multi-label public suffixes (e.g. \"co.uk\") are not recognized, so
  \"www.example.co.uk\" still yields \"co.uk\"."
  [host]
  (let [host (lower-case (or host ""))]
    (if (re-matches #"\d{1,3}(\.\d{1,3}){3}" host)
      ;; IP addresses have no root domain; the full address is the identity.
      host
      (->> (string/split host #"\.")
           (take-last 2)
           (string/join ".")))))
(defn protocol
"Returns the protocol for a URL"
[url]
......@@ -67,7 +75,9 @@
(and (some? url)
(some? (re-find #"\bhttps?:" (protocol url)))))
(defn host-key [host]
(defn host-key
  "Computes the lookup key for a hostname.

  A nil or empty `host` maps to 0. Any other host is lower-cased, trimmed
  and then passed through `hash-string`, so hosts differing only in case or
  surrounding whitespace share the same key."
  [host]
  (if (empty? host)
    0
    (-> host
        lower-case
        trim
        hash-string)))
......
......@@ -437,15 +437,22 @@
{:url "http://www.kitco.com/market/"
:time 4
:ts 1446051494575
:title "New York spot price Gold..."}
-24505671
{:url "http://kitco.com/market/"
:time 2
:ts 1446051494575
:title "New York spot price Gold..."}}
acc (data/accumulate-site-times data)]
;; There should be no empty hostnames
;; We check (get acc 0) because the result is indexed by the host-key,
;; which returns 0 on nil or empty.
(is (nil? (get acc 0)))
;; Let's verify we got the right data
(is (= {971841386 {:time 39, :icon nil, :host "numergent.com"},
-915908674 {:time 4, :icon nil, :host "www.kitco.com"}}
;; Let's verify we got the right data. Notice that accumulate-site-times
;; does not take into account differences in the root domain.
(is (= {971841386 {:time 39 :icon nil :host "numergent.com"}
-915908674 {:time 4 :icon nil :host "www.kitco.com"}
996869973 {:time 2 :icon nil :host "kitco.com"}}
acc))))
(testing "Accumulate site times disregards the port for the URL when accumulating"
(let [data {2080624698
......@@ -482,7 +489,41 @@
(is (= {971841386 {:time 27 :icon nil :host "numergent.com"}
-915908674 {:time 4 :icon nil :host "www.kitco.com"}
-1536293812 {:time 37 :icon nil :host "google.com"}}
acc))))
acc)))))
(deftest test-accumulate-root-times
  ;; Feeds URL times through `accumulate-site-times` and then
  ;; `accumulate-root-times`, and checks the totals per root domain.
  (let [url-times {;; No host in this URL, so it is expected to be absent
                   ;; from the root totals below.
                   2080624698  {:url   "/tags/khroma/"
                                :time  117
                                :ts    1445964037798
                                :title "Khroma articles"}
                   ;; numergent.com and www.numergent.com: 27 + 12 = 39
                   -526558523  {:url   "https://numergent.com/opensource/"
                                :time  27
                                :ts    1445964037798
                                :title "Open source projects"}
                   -327774960  {:url   "https://www.numergent.com/tags/khroma/"
                                :time  12
                                :ts    1445964037798
                                :title "Khroma articles"}
                   ;; www.kitco.com and KITCO.com: 4 + 3 = 7
                   1917381154  {:url   "http://www.kitco.com/market/"
                                :time  4
                                :ts    1446051494575
                                :title "New York spot price Gold..."}
                   -24505671   {:url   "http://KITCO.com/market/"
                                :time  3
                                :ts    1446051494575
                                :title "New York spot price Gold..."}}
        root-times (-> url-times
                       data/accumulate-site-times
                       data/accumulate-root-times)]
    (testing "Accumulate root times takes into lumps together pages on the same root"
      (is (= {"numergent.com" 39
              "kitco.com" 7}
             root-times)))))
......
......@@ -99,6 +99,17 @@
))
(deftest test-root
  ;; Each row pairs a host with the value expected from `utils/root`.
  (are [host expected] (= (utils/root host) expected)
    ;; Sub-domains collapse to the last two labels, regardless of case.
    "www.google.com"      "google.com"
    "WWW.google.COM"      "google.com"
    "some.sub.domain.com" "domain.com"
    ;; A single label is returned unchanged.
    "localhost"           "localhost"
    ;; Empty and nil hosts both produce an empty string.
    ""                    ""
    nil                   ""))
(deftest test-protocol
(are [url name] (= (utils/protocol url) name)
"https://www.google.com" "https:"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment