Commit 92049755 authored by Ricardo J. Mendez's avatar Ricardo J. Mendez

Merge branch 'feature/account-for-root-domain' into develop

Closes #5
parents 4eac6eab dd5a6697
Pipeline #13327384 passed with stage
in 1 minute and 20 seconds
......@@ -4,9 +4,9 @@ Relevance is a smart tab organizer for Chrome, written in ClojureScript.
It’ll create a natural arrangement where the tabs you have spent the longest on, which are expected to be the most relevant, are placed first, and the ones you haven’t read at all are shunted to the end of your list.
[You can read more about it here](https://numergent.com/relevance/).
[You can read more about it here](https://numergent.com/relevance/), which includes a changelog.
This is Relevance 1.0.11-SNAPSHOT.
This is Relevance 1.1.0-SNAPSHOT.
# Building
......
(defproject relevance-chrome "1.0.11-SNAPSHOT"
(defproject relevance-chrome "1.1.0-SNAPSHOT"
:license {:name "MIT License"
:url "https://tldrlegal.com/license/mit-license"}
:dependencies [[org.clojure/clojure "1.8.0"]
......
......@@ -4,7 +4,7 @@
[relevance.data :as data]
[relevance.io :as io]
[relevance.migrations :as migrations]
[relevance.order :refer [time-score score-tabs]]
[relevance.order :refer [time-score sort-by-root]]
[relevance.utils :refer [on-channel url-key host-key hostname is-http? ms-day]]
[relevance.settings :refer [default-settings]]
[khroma.alarms :as alarms]
......@@ -101,10 +101,10 @@
(go
(let [{:keys [settings data]} app-state
{:keys [url-times site-times]} data
tabs (score-tabs (:tabs (<! (windows/get window-id)))
url-times
site-times
settings)]
tabs (sort-by-root (:tabs (<! (windows/get window-id)))
url-times
site-times
settings)]
(doseq [tab tabs]
(tabs/move (:id tab) {:index (:index tab)})))))
......
(ns relevance.data
(:require [relevance.utils :refer [url-key is-http? host-key hostname]]
"Contains functions related to data tracking and accumulation.
It does not account for the actual ordering based on this data. You should
see `relevance.order` for that."
(:require [relevance.utils :refer [url-key is-http? host-key hostname root]]
[khroma.log :as console]))
......@@ -7,7 +10,14 @@
"Accumulates the total time for a site from a hashmap of URL times.
Returns a hashmap with the URL ID as the key, and the :time, :icon and
:host string on its value."
:host string on its value.
This function differentiates between hostnames on the same root domain.
This means that docs.gitlab.com and gitlab.com are accumulated separately.
While we could lump them together into one at this point, keeping them
separate allows us to apply the same weight to pages on the same
hostname, which will lead to a more natural ordering."
[url-times]
(->>
(group-by #(hostname (:url %)) (vals url-times))
......@@ -19,6 +29,19 @@
(into {})))
(defn accumulate-root-times
  "Takes the hashmap produced by `accumulate-site-times` and returns a new
  hashmap keyed by root domain name, where each value is the sum of the
  `:time` of every site that falls under that root.

  Sites whose root name is empty are left out of the result. Having the
  per-root totals lets callers keep pages from the same root domain
  together while the per-site ordering is still decided elsewhere."
  [site-times]
  (into {}
        (for [[root-name sites] (group-by (comp root :host) (vals site-times))
              ;; `root` always yields a string; an empty one means no usable host
              :when (seq root-name)]
          [root-name (reduce + (map :time sites))])))
(defn clean-up-by-time
......
(ns relevance.order
(:require [relevance.utils :refer [on-channel url-key host-key hostname is-http? ms-day]]))
(:require [relevance.utils :refer [on-channel url-key host-key hostname root is-http? ms-day]]
[relevance.data :refer [accumulate-root-times]]))
;;;;------------------------------------
......@@ -15,44 +16,77 @@
;;;;------------------------------------
(defn time-score
"Returns a score for a tab based on the total time spent at both a URL and
the site the URL belongs to."
"Returns map containing a score for a tab based on the total time spent at
both a URL and the site the URL belongs to, as well as a flag indicating
if it's a priority tab."
[tab url-times site-times settings]
(let [url (:url tab)
idx (:index tab)
url-time (or (:time (get url-times (url-key url)))
0)
is-priority? (and (:sound-to-left? settings)
(:audible tab))
is-penalized? (and (not (is-http? url))
(not is-priority?))
tab-time (cond
;; Add an extra score if it's a priority URL
is-priority? (+ sound-extra-score idx)
;; If a URL is penalized, we want it to at least have a
;; value of 1, otherwise the tab time gets ignored and
;; we'd default to using the raw site time
is-penalized? (max url-time 1)
;; ... otherwise we just go with the raw URL time
:else url-time)
site-time (or (:time (get site-times (host-key (hostname url)))) 0)
total (+ tab-time site-time)
score (if is-penalized? (* total non-http-penalty) total)]
(or (when (pos? tab-time) score)
(- site-time idx))))
(defn score-tabs
"Returns a hashmap of the new tab ids and their indexes, based on a tab list and
the score function for time spent on urls and sites."
[tabs url-times site-times settings]
(->> tabs
(map #(assoc % :time (time-score % url-times site-times settings)))
(sort-by #(* -1 (:time %)))
(map-indexed #(hash-map :index %1
:id (:id %2)))))
(let [url (:url tab)
idx (:index tab)
url-time (or (:time (get url-times (url-key url)))
0)
priority? (and (:sound-to-left? settings)
(:audible tab))
penalized? (and (not (is-http? url))
(not priority?))
tab-time (cond
;; Add an extra score if it's a priority URL
priority? (+ sound-extra-score idx)
;; If a URL is penalized, disregard the time
;; and use its index (it'll get penalized later)
penalized? idx
;; ... otherwise we just go with the raw URL time
:else url-time)
host-time (or (:time (get site-times (host-key (hostname url)))) 0)
total (+ tab-time host-time)
score (if penalized? (* total non-http-penalty) total)]
;; A tab without positive time in it will use its index as a small
;; offset to the host time. That way pages from the same host are
;; lumped together, and new tabs are sorted by index.
{:score (or (when (pos? tab-time) score)
(+ host-time (* 0.1 idx)))
:priority? (not (or (nil? priority?)
(false? priority?)))}))
(defn score-and-sort-simple
  "Merges the result of `time-score` into every tab of the group and returns
  the tabs sorted by `:score`, highest first."
  [tabs url-times site-times settings]
  (let [with-score (fn [tab]
                     (merge tab (time-score tab url-times site-times settings)))]
    ;; Negating the score turns the ascending sort into a descending one
    (sort-by (comp - :score)
             (map with-score tabs))))
(defn sort-by-root
  "Returns a sequence of `{:index :id}` hashmaps giving the new position for
  every tab in `tabs`.

  Tabs are grouped by the root domain of their URL. Each group is scored and
  sorted with `score-and-sort-simple`, and the groups are then ordered by the
  time accumulated on their root domain (most time first). A group whose root
  has no positive accumulated time is ordered by the sum of its tab scores
  instead. Finally, priority tabs break away from their group and are sorted
  ahead by score."
  [tabs url-times site-times settings]
  (let [root-times (accumulate-root-times site-times)]
    (->> tabs
         ;; We first group them by root hostname, sort the tab subgroups,
         ;; and then sort the groups by the time spent on the root.
         (group-by #(root (hostname (:url %))))
         (map (fn [[root-name group]]
                (let [scored    (score-and-sort-simple group url-times site-times settings)
                      ;; A root that was never tracked (or was dropped for
                      ;; being empty) has no entry in root-times. Default to 0
                      ;; explicitly instead of handing nil to `pos?`.
                      root-time (get root-times root-name 0)
                      ;; We may get a bunch of pages where the root time is 0.
                      ;; In that case, let's sort them by their accumulated score.
                      weight    (if (pos? root-time)
                                  root-time
                                  (apply + (map :score scored)))]
                  [(- weight) scored])))
         (sort-by first)
         ;; Discard the sort weights and concatenate the groups, one level only
         (mapcat second)
         ;; Once we have done this, we will need to take priority tabs into
         ;; account. Those will break from their main group and appear first.
         (map-indexed #(assoc %2 :index %1))
         (sort-by #(if (:priority? %)
                     (- (:score %))
                     (:index %)))
         ;; Ready!
         (map-indexed #(hash-map :index %1
                                 :id (:id %2))))))
\ No newline at end of file
......@@ -52,6 +52,14 @@
lower-case
trim)))
(defn root
  "Returns the root domain for a host: its last two dot-separated labels,
  lower-cased, so that docs.gitlab.com yields gitlab.com. A host with a
  single label is returned as is, and a nil host yields an empty string.

  An IPv4 address is returned whole. It has no root domain, and keeping only
  its last two octets would lump unrelated addresses together."
  [host]
  (let [host (lower-case (or host ""))]
    (if (re-matches #"\d{1,3}(?:\.\d{1,3}){3}" host)
      host
      (->> (string/split host #"\.")
           (take-last 2)
           (string/join ".")))))
(defn protocol
"Returns the protocol for a URL"
[url]
......@@ -67,7 +75,9 @@
(and (some? url)
(some? (re-find #"\bhttps?:" (protocol url)))))
(defn host-key [host]
(defn host-key
  "Computes the lookup key for a hostname: the hash of its lower-cased,
  trimmed form. Yields 0 when the hostname is nil or empty."
  [host]
  (if (empty? host)
    0
    (-> host lower-case trim hash-string)))
......@@ -90,12 +100,11 @@
"Split a string using commas, semi-colons or new lines, trims the resulting
elements, and returns them as a set"
[s]
(->>
(string/split (or s "") #",|\n|;| ")
(map string/trim)
(remove empty?)
(map string/lower-case)
(into #{})))
(->> (string/split (lower-case (or s ""))
#",|\n|;| ")
(map string/trim)
(remove empty?)
(into #{})))
(defn time-display
"Returns a display string for a number of seconds"
......
......@@ -437,15 +437,22 @@
{:url "http://www.kitco.com/market/"
:time 4
:ts 1446051494575
:title "New York spot price Gold..."}
-24505671
{:url "http://kitco.com/market/"
:time 2
:ts 1446051494575
:title "New York spot price Gold..."}}
acc (data/accumulate-site-times data)]
;; There should be no empty hostnames
;; We check (get acc 0) because the result is indexed by the host-key,
;; which returns 0 on nil or empty.
(is (nil? (get acc 0)))
;; Let's verify we got the right data
(is (= {971841386 {:time 39, :icon nil, :host "numergent.com"},
-915908674 {:time 4, :icon nil, :host "www.kitco.com"}}
;; Let's verify we got the right data. Notice that accumulate-site-times
;; does not merge hosts that share a root domain: www.kitco.com and
;; kitco.com are accumulated separately.
(is (= {971841386 {:time 39 :icon nil :host "numergent.com"}
-915908674 {:time 4 :icon nil :host "www.kitco.com"}
996869973 {:time 2 :icon nil :host "kitco.com"}}
acc))))
(testing "Accumulate site times disregards the port for the URL when accumulating"
(let [data {2080624698
......@@ -482,8 +489,40 @@
(is (= {971841386 {:time 27 :icon nil :host "numergent.com"}
-915908674 {:time 4 :icon nil :host "www.kitco.com"}
-1536293812 {:time 37 :icon nil :host "google.com"}}
acc))))
)
acc)))))
;; Verifies that `accumulate-root-times`, fed with the output of
;; `accumulate-site-times`, sums the per-host times under their root domain.
(deftest test-accumulate-root-times
;; Fixture: URL times keyed by URL id
(let [data {2080624698
;; Relative URL without a host. Its 117 does not appear in any of the
;; expected totals below.
{:url "/tags/khroma/"
:time 117
:ts 1445964037798
:title "Khroma articles"}
-526558523
;; numergent.com and www.numergent.com are different hosts with the same root
{:url "https://numergent.com/opensource/"
:time 27
:ts 1445964037798
:title "Open source projects"}
-327774960
{:url "https://www.numergent.com/tags/khroma/"
:time 12
:ts 1445964037798
:title "Khroma articles"}
1917381154
{:url "http://www.kitco.com/market/"
:time 4
:ts 1446051494575
:title "New York spot price Gold..."}
-24505671
;; Mixed-case host: expected to be counted under the lower-case kitco.com root
{:url "http://KITCO.com/market/"
:time 3
:ts 1446051494575
:title "New York spot price Gold..."}}
site-times (data/accumulate-site-times data)
root-times (data/accumulate-root-times site-times)]
(testing "Accumulate root times takes into lumps together pages on the same root"
;; numergent.com: 27 + 12 = 39; kitco.com: 4 + 3 = 7
(is (= {"numergent.com" 39
"kitco.com" 7}
root-times)))))
(deftest test-accumulate-after-clean-up
......
......@@ -6,82 +6,104 @@
(deftest test-time-score
;; A score for an unknown URL with no tab index is zero
(is (= 0 (order/time-score {:url "http://google.com"}
{}
{}
{})))
;; A score for an unknown URL with no tab index is the complement of its index
(is (= -20 (order/time-score {:url "http://google.com"
:index 20}
{}
{}
{})))
(is (= {:score 0 :priority? false}
(order/time-score {:url "http://google.com"}
{}
{}
{})))
;; A score for an unknown URL with no site time is 10% its index
(is (= {:score 2 :priority? false}
(order/time-score {:url "http://google.com"
:index 20}
{}
{}
{})))
;; A score for an unknown URL with a known site time gets 10% of its index
;; added to the domain time
(is (= {:score 125 :priority? false}
(order/time-score {:url "http://google.com/about"
:index 20}
{}
{(utils/host-key "google.com") {:time 123}
(utils/host-key "apple.com") {:time 987}}
{})))
;; A score for a known URL equals its time value
(is (= 291 (order/time-score {:url "http://google.com"}
{(utils/url-key "http://google.com")
{:time 291}}
{}
{})))
(is (= {:score 291 :priority? false}
(order/time-score {:url "http://google.com"}
{(utils/url-key "http://google.com")
{:time 291}}
{}
{})))
;; A score for a known URL equals its time value, even if it includes a port
(is (= 291 (order/time-score {:url "http://google.com:80"}
{(utils/url-key "http://google.com")
{:time 291}}
{}
{})))
(is (= {:score 291 :priority? false}
(order/time-score {:url "http://google.com:80"}
{(utils/url-key "http://google.com")
{:time 291}}
{}
{})))
;; A score for a known URL gets added the time for its site
(is (= 468 (order/time-score {:url "http://google.com/somepage"}
{(utils/url-key "http://google.com") {:time 291}
(utils/url-key "http://google.com/somepage") {:time 345}}
{(utils/host-key "google.com") {:time 123}
(utils/host-key "apple.com") {:time 987}}
{})))
(is (= {:score 468 :priority? false}
(order/time-score {:url "http://google.com/somepage"}
{(utils/url-key "http://google.com") {:time 291}
(utils/url-key "http://google.com/somepage") {:time 345}}
{(utils/host-key "google.com") {:time 123}
(utils/host-key "apple.com") {:time 987}}
{})))
;; A score for a known URL gets added the time for its host, even if it's using
;; a non-standard port
(is (= 123 (order/time-score {:url "http://google.com:9090/somepage"}
{(utils/url-key "http://google.com") {:time 291}
(utils/url-key "http://google.com/somepage") {:time 345}}
{(utils/host-key "google.com") {:time 123}
(utils/host-key "apple.com") {:time 987}}
{})))
(is (= 448 (order/time-score {:url "http://google.com:9090/somepage"}
{(utils/url-key "http://google.com") {:time 291}
(utils/url-key "http://google.com/somepage") {:time 345}
(utils/url-key "http://google.com:9090/somepage") {:time 325}}
{(utils/host-key "google.com") {:time 123}
(utils/host-key "apple.com") {:time 987}}
{})))
(is (= {:score 123 :priority? false}
(order/time-score {:url "http://google.com:9090/somepage"}
{(utils/url-key "http://google.com") {:time 291}
(utils/url-key "http://google.com/somepage") {:time 345}}
{(utils/host-key "google.com") {:time 123}
(utils/host-key "apple.com") {:time 987}}
{})))
(is (= {:score 448 :priority? false}
(order/time-score {:url "http://google.com:9090/somepage"}
{(utils/url-key "http://google.com") {:time 291}
(utils/url-key "http://google.com/somepage") {:time 345}
(utils/url-key "http://google.com:9090/somepage") {:time 325}}
{(utils/host-key "google.com") {:time 123}
(utils/host-key "apple.com") {:time 987}}
{})))
;; A score for a known URL is not affected by the score of other URLs for the same site
(is (= 414 (order/time-score {:url "http://google.com/"}
{(utils/url-key "http://google.com") {:time 291}
(utils/url-key "http://google.com/somepage") {:time 345}}
{(utils/host-key "google.com") {:time 123}
(utils/host-key "apple.com") {:time 987}}
{})))
(is (= {:score 414 :priority? false}
(order/time-score {:url "http://google.com/"}
{(utils/url-key "http://google.com") {:time 291}
(utils/url-key "http://google.com/somepage") {:time 345}}
{(utils/host-key "google.com") {:time 123}
(utils/host-key "apple.com") {:time 987}}
{})))
;; When there are no known URLs
;; A page inherits its site's score even if the page is unknown
(is (= 987 (order/time-score {:url "http://apple.com/mac"}
{(utils/url-key "http://google.com") {:time 291}
(utils/url-key "http://google.com/somepage") {:time 345}}
{(utils/host-key "google.com") {:time 123}
(utils/host-key "apple.com") {:time 987}}
{})))
(is (= {:score 987 :priority? false}
(order/time-score {:url "http://apple.com/mac"}
{(utils/url-key "http://google.com") {:time 291}
(utils/url-key "http://google.com/somepage") {:time 345}}
{(utils/host-key "google.com") {:time 123}
(utils/host-key "apple.com") {:time 987}}
{})))
;; A page inherits its site's score even if the page is unknown, but
;; substracts the index so that they are placed at the end.
(is (= 975 (order/time-score {:url "http://apple.com/mac" :index 12}
{(utils/url-key "http://google.com") {:time 291}
(utils/url-key "http://google.com/somepage") {:time 345}}
{(utils/host-key "google.com") {:time 123}
(utils/host-key "apple.com") {:time 987}}
{})))
;; adds a percent of its index so that they are placed at the end.
(is (= {:score 990 :priority? false}
(order/time-score {:url "http://apple.com/mac" :index 30}
{(utils/url-key "http://google.com") {:time 291}
(utils/url-key "http://google.com/somepage") {:time 345}}
{(utils/host-key "google.com") {:time 123}
(utils/host-key "apple.com") {:time 987}}
{})))
;; A page that has sound gets no extra score if the :sound-to-left? key isn't on settings
(is (= 291 (order/time-score {:url "http://google.com"
:audible true}
{(utils/url-key "http://google.com")
{:time 291}}
{}
{})))
(is (= {:score 291 :priority? false}
(order/time-score {:url "http://google.com"
:audible true}
{(utils/url-key "http://google.com")
{:time 291}}
{}
{})))
;; An audible page gets an extra score if the :sound-to-left? key is set to true on the settings,
;; but based on the index, not the time spent
(is (= (+ 5 123 order/sound-extra-score)
(is (= {:score (+ 5 123 order/sound-extra-score)
:priority? true}
(order/time-score {:url "http://google.com/translate"
:index 5
:audible true}
......@@ -90,12 +112,13 @@
{(utils/host-key "google.com") {:time 123}
(utils/host-key "apple.com") {:time 987}}
{:sound-to-left? true})))
;; Non-http URLs are penalized, but get a minimum score of 1 (they will
;; likely have a time of 0 to begin with since they aren't tracked)
(is (= (* 124 order/non-http-penalty)
;; Non-http URLs are penalized, using their index as the score even if they had time tracked
(is (= {:score (* (+ 123 5) order/non-http-penalty)
:priority? false}
(order/time-score {:url "chrome://google.com/translate"
:index 5}
{}
{(utils/url-key "chrome://google.com/translate")
{:time 9001}}
{(utils/host-key "google.com") {:time 123}
(utils/host-key "apple.com") {:time 987}}
{}))))
......@@ -106,91 +129,152 @@
(utils/url-key "http://google.com/somepage") {:time 345}
(utils/url-key "http://apple.com/osx") {:time 10101}
(utils/url-key "http://apple.com/") {:time 2120}}
site-times {(utils/host-key "google.com") {:time 4295} ; Includes time from tabs which we have deleted
(utils/host-key "apple.com") {:time 12221}}]
site-times {(utils/host-key "google.com") {:time 4295 :host "google.com"} ; Includes time from tabs which we have deleted
(utils/host-key "apple.com") {:time 12221 :host "apple.com"}
(utils/host-key "support.apple.com") {:time 90 :host "support.apple.com"}}]
;; We have spent the longest at Apple, so it gets prioritized
;; The extension tab ends up at the end because it's not http
(is (= [{:index 0 :id 23} {:index 1 :id 9} {:index 2 :id 1}]
(order/score-tabs [{:url "http://google.com"
:id 9
:index 15}
{:url "chrome://extensions/"
:id 1
:index 1}
{:url "https://apple.com/macbook"
:id 23
:index 912}]
url-times
site-times
{})))
;; An unknown page ends up at the end
(is (= [{:index 0 :id 23} {:index 1 :id 9} {:index 2 :id 1} {:index 3 :id 2}]
(order/score-tabs [{:url "http://google.com"
:id 9
:index 15}
{:url "http://youtube.com/"
:id 2
:index 27}
{:url "chrome://extensions/"
:id 1
:index 1}
{:url "https://apple.com/macbook"
:id 23
:index 912}]
url-times
site-times
{})))
;; Two unknown pages get sorted by their index order
(is (= [{:index 0 :id 23} {:index 1 :id 9} {:index 2 :id 1} {:index 3 :id 123} {:index 4 :id 2}]
(order/score-tabs [{:url "http://google.com"
:id 9
:index 15}
{:url "http://youtube.com/"
:id 2
:index 27}
{:url "http://vimeo.com/"
:id 123
:index 26}
{:url "chrome://extensions/"
:id 1
:index 1}
{:url "https://apple.com/macbook"
:id 23
:index 912}]
url-times
site-times
{})))
;; An unknown page that is playing sound gets prioritized according to the settings
(is (= [{:index 0 :id 2} {:index 1 :id 23} {:index 2 :id 9} {:index 3 :id 1}]
(order/score-tabs [{:url "http://google.com"
:id 9
:index 15}
{:url "http://youtube.com/"
:id 2
:audible true
:index 27}
{:url "chrome://extensions/"
:id 1
:index 1}
{:url "https://apple.com/macbook"
:id 23
:index 912}]
url-times
site-times
{:sound-to-left? true})))
;; If for any reason the Apple URL is not http, it gets de-prioritized
(is (= [{:index 0 :id 9} {:index 1 :id 1} {:index 2 :id 23}]
(order/score-tabs [{:url "http://google.com"
:id 9
:index 15}
{:url "chrome://extensions/"
:id 1
:index 1}
{:url "apple.com/macbook"
:id 23
:index 912}]
url-times
site-times
{})))
(testing "Basic prioritization"
(is (= [{:index 0 :id 23} {:index 1 :id 9} {:index 2 :id 1}]
(order/sort-by-root [{:url "http://google.com"
:id 9
:index 15}
{:url "chrome://extensions/"
:id 1
:index 1}
{:url "https://apple.com/macbook"
:id 23
:index 912}]
url-times
site-times
{}))))
;; Even though we have not spent any time at the support.apple.com page,
;; it will get prioritized right after apple.com because they share a root domain
(testing "Pages get prioritized based on the root domain"
(is (= [{:index 0 :id 23} {:index 1 :id 3} {:index 2 :id 9} {:index 3 :id 1}]
(order/sort-by-root [{:url "http://google.com"
:id 9
:index 15}
{:url "chrome://extensions/"
:id 1
:index 1}
{:url "https://apple.com/macbook"
:id 23
:index 912}
{:url "https://support.apple.com/en-us/"
:id 3
:index 0}]
url-times
site-times
{}))))
;; All pages on the apple domain will end up together, even if one isn't known
(is (= [{:index 0 :id 23} {:index 1 :id 3} {:index 2 :id 300} {:index 3 :id 9} {:index 4 :id 1}]
(order/sort-by-root [{:url "http://google.com"
:id 9
:index 15}
{:url "chrome://extensions/"
:id 1
:index 1}
{:url "https://apple.com/macbook"
:id 23
:index 912}
{:url "https://support.apple.com/en-us/"
:id 3
:index 0}
{:url "https://icloud.apple.com/"
:id 300
:index 91}]
url-times
site-times
{})))
(testing "All pages on the apple domain will end up together, even if no pages whatsoever are known"
(is (= [{:index 0 :id 23} {:index 1 :id 3} {:index 2 :id 300} {:index 3 :id 9} {:index 4 :id 1}]
(order/sort-by-root [{:url "http://google.com"
:id 9
:index 15}
{:url "chrome://extensions/"
:id 1
:index 1}
{:url "https://apple.com/macbook"
:id 23
:index 912}
{:url "https://support.apple.com/en-us/"
:id 3
:index 0}
{:url "https://icloud.apple.com/"
:id 300
:index 91}]
{}
site-times
{}))))
(testing "An unknown page ends up at the end"
(is (= [{:index 0 :id 23} {:index 1 :id 9} {:index 2 :id 2} {:index 3 :id 1}]
(order/sort-by-root [{:url "http://google.com"
:id 9
:index 15}
{:url "http://youtube.com/"
:id 2
:index 27}
{:url "chrome://extensions/"
:id 1