Commit 92049755 authored by Ricardo J. Mendez

Merge branch 'feature/account-for-root-domain' into develop

Closes #5
parents 4eac6eab dd5a6697
Pipeline #13327384 passed in 1 minute and 20 seconds
@@ -4,9 +4,9 @@ Relevance is a smart tab organizer for Chrome, written in ClojureScript.
It’ll create a natural arrangement where the tabs you have spent the longest on, which are expected to be the most relevant, are placed first, and the ones you haven’t read at all are shunted to the end of your list.
-[You can read more about it here](https://numergent.com/relevance/).
+[You can read more about it here](https://numergent.com/relevance/), which includes a changelog.
-This is Relevance 1.0.11-SNAPSHOT.
+This is Relevance 1.1.0-SNAPSHOT.
# Building
...
-(defproject relevance-chrome "1.0.11-SNAPSHOT"
+(defproject relevance-chrome "1.1.0-SNAPSHOT"
  :license {:name "MIT License"
            :url "https://tldrlegal.com/license/mit-license"}
  :dependencies [[org.clojure/clojure "1.8.0"]
...
@@ -4,7 +4,7 @@
            [relevance.data :as data]
            [relevance.io :as io]
            [relevance.migrations :as migrations]
-           [relevance.order :refer [time-score score-tabs]]
+           [relevance.order :refer [time-score sort-by-root]]
            [relevance.utils :refer [on-channel url-key host-key hostname is-http? ms-day]]
            [relevance.settings :refer [default-settings]]
            [khroma.alarms :as alarms]
@@ -101,10 +101,10 @@
  (go
    (let [{:keys [settings data]} app-state
          {:keys [url-times site-times]} data
-         tabs (score-tabs (:tabs (<! (windows/get window-id)))
+         tabs (sort-by-root (:tabs (<! (windows/get window-id)))
                url-times
                site-times
                settings)]
      (doseq [tab tabs]
        (tabs/move (:id tab) {:index (:index tab)})))))
...
(ns relevance.data
+  "Contains functions related to data tracking and accumulation.
+  It does not account for the actual ordering based on this data. You should
+  see `relevance.order` for that."
-  (:require [relevance.utils :refer [url-key is-http? host-key hostname]]
+  (:require [relevance.utils :refer [url-key is-http? host-key hostname root]]
            [khroma.log :as console]))
@@ -7,7 +10,14 @@
  "Accumulates the total time for a site from a hashmap of URL times.
  Returns a hashmap with the URL ID as the key, and the :time, :icon and
-  :host string on its value."
+  :host string on its value.
+
+  This function differentiates between hostnames even when they share a root
+  domain. This means that docs.gitlab.com and gitlab.com are accumulated
+  separately. While we could lump them together into one at this point,
+  keeping them separate allows us to apply the same weight to pages on the
+  same hostname, which will lead to more natural ordering."
  [url-times]
  (->>
    (group-by #(hostname (:url %)) (vals url-times))
@@ -19,6 +29,19 @@
    (into {})))
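;; Illustrative sketch (not part of this commit): with two hostnames that share
;; a root domain, accumulate-site-times keeps a separate entry for each host.
;; The URLs, times, and keys below are made up; real keys come from the
;; url-key / host-key hashes.
(comment
  (accumulate-site-times
    {(url-key "https://gitlab.com/")         {:url "https://gitlab.com/" :time 10}
     (url-key "https://docs.gitlab.com/ce/") {:url "https://docs.gitlab.com/ce/" :time 5}})
  ;; => {(host-key "gitlab.com")      {:time 10 :icon nil :host "gitlab.com"}
  ;;     (host-key "docs.gitlab.com") {:time 5  :icon nil :host "docs.gitlab.com"}}
  )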
+(defn accumulate-root-times
+  "Expects the hashmap resulting from `accumulate-site-times`, and returns a
+  new hashmap where times are accumulated by the root name.
+  This will let us prioritize the pages in the same root domain together,
+  while still keeping the per-site ordering."
+  [site-times]
+  (->> (group-by #(root (:host %)) (vals site-times))
+       (remove #(empty? (key %)))
+       (map #(vector (key %)
+                     (apply + (map :time (val %)))))
+       (into {})))
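;; Illustrative sketch (not part of this commit) of the new root accumulation,
;; using made-up site entries; only the :host and :time values matter here.
(comment
  (accumulate-root-times
    {(host-key "gitlab.com")      {:time 10 :icon nil :host "gitlab.com"}
     (host-key "docs.gitlab.com") {:time 5  :icon nil :host "docs.gitlab.com"}
     (host-key "kitco.com")       {:time 2  :icon nil :host "kitco.com"}})
  ;; => {"gitlab.com" 15, "kitco.com" 2}
  )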
(defn clean-up-by-time
...
(ns relevance.order
-  (:require [relevance.utils :refer [on-channel url-key host-key hostname is-http? ms-day]]))
+  (:require [relevance.utils :refer [on-channel url-key host-key hostname root is-http? ms-day]]
+            [relevance.data :refer [accumulate-root-times]]))
;;;;------------------------------------
@@ -15,44 +16,77 @@
;;;;------------------------------------
(defn time-score
-  "Returns a score for a tab based on the total time spent at both a URL and
-  the site the URL belongs to."
+  "Returns a map containing a score for a tab based on the total time spent at
+  both a URL and the site the URL belongs to, as well as a flag indicating
+  if it's a priority tab."
  [tab url-times site-times settings]
  (let [url (:url tab)
        idx (:index tab)
        url-time (or (:time (get url-times (url-key url)))
                     0)
-        is-priority? (and (:sound-to-left? settings)
+        priority? (and (:sound-to-left? settings)
                   (:audible tab))
-        is-penalized? (and (not (is-http? url))
-                           (not is-priority?))
+        penalized? (and (not (is-http? url))
+                        (not priority?))
        tab-time (cond
                   ;; Add an extra score if it's a priority URL
-                   is-priority? (+ sound-extra-score idx)
+                   priority? (+ sound-extra-score idx)
-                   ;; If a URL is penalized, we want it to at least have a
-                   ;; value of 1, otherwise the tab time gets ignored and
-                   ;; we'd default to using the raw site time
-                   is-penalized? (max url-time 1)
+                   ;; If a URL is penalized, disregard the time
+                   ;; and use its index (it'll get penalized later)
+                   penalized? idx
                   ;; ... otherwise we just go with the raw URL time
                  :else url-time)
-        site-time (or (:time (get site-times (host-key (hostname url)))) 0)
-        total (+ tab-time site-time)
-        score (if is-penalized? (* total non-http-penalty) total)]
-    (or (when (pos? tab-time) score)
-        (- site-time idx))))
+        host-time (or (:time (get site-times (host-key (hostname url)))) 0)
+        total (+ tab-time host-time)
+        score (if penalized? (* total non-http-penalty) total)]
+    ;; A tab without positive time in it will use its index as a small
+    ;; offset to the host time. That way pages from the same host are
+    ;; lumped together, and new tabs are sorted by index.
+    {:score (or (when (pos? tab-time) score)
+                (+ host-time (* 0.1 idx)))
+     :priority? (not (or (nil? priority?)
+                         (false? priority?)))}))

-(defn score-tabs
-  "Returns a hashmap of the new tab ids and their indexes, based on a tab list and
-  the score function for time spent on urls and sites."
-  [tabs url-times site-times settings]
-  (->> tabs
-       (map #(assoc % :time (time-score % url-times site-times settings)))
-       (sort-by #(* -1 (:time %)))
-       (map-indexed #(hash-map :index %1
-                               :id (:id %2)))))
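;; Illustrative sketch (not from the diff) of the shape time-score now returns.
;; `example-url-times`, `example-site-times`, and the tab map are made-up
;; placeholders; the actual numbers depend on the accumulated data.
(comment
  (time-score {:url "https://gitlab.com/" :index 3 :audible false}
              example-url-times
              example-site-times
              {:sound-to-left? true})
  ;; => {:score <url-time + host-time>, :priority? false}
  ;; A tab with no positive URL time of its own scores (+ host-time (* 0.1 3)),
  ;; which keeps it grouped with other tabs from the same host.
  )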
+(defn score-and-sort-simple
+  "Expects a group of tabs and associates a time score with each."
+  [tabs url-times site-times settings]
+  (->> tabs
+       (map #(merge % (time-score % url-times site-times settings)))
+       (sort-by #(- (:score %)))))

+(defn sort-by-root
+  "Returns hashmaps of the new tab ids and their indexes, based on a tab list
+  and the score function for time spent on urls and sites."
+  [tabs url-times site-times settings]
+  (let [root-times (accumulate-root-times site-times)]
+    ; (cljs.pprint/pprint tabs)
+    ; (cljs.pprint/pprint root-times)
+    (->> tabs
+         ;; We first group them by root hostname, sort the tab subgroups,
+         ;; and then sort the groups by the time spent on the root.
+         (group-by #(root (hostname (:url %))))
+         (map (fn [[k v]]
+                (let [scored (score-and-sort-simple v url-times site-times settings)
+                      root-time (get root-times k)
+                      ;; We may get a bunch of pages where the root time is 0.
+                      ;; In that case, let's sort them by their accumulated score.
+                      idx (if (pos? root-time)
+                            root-time
+                            (apply + (map :score scored)))]
+                  [(- idx)
+                   scored])))
+         (sort-by first)
+         ;; Discard the root names and just flatten the list
+         (map second)
+         flatten
+         ;; Once we have done this, we will need to take priority tabs into
+         ;; account. Those will break from their main group and appear first.
+         (map-indexed #(assoc %2 :index %1))
+         (sort-by #(if (:priority? %)
+                     (- (:score %))
+                     (:index %)))
+         ;; Ready!
+         (map-indexed #(hash-map :index %1
+                                 :id (:id %2))))))
\ No newline at end of file
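;; Illustrative sketch (not from the diff) of how the background page is
;; expected to use the new ordering; the tab maps and accumulated times are
;; made-up placeholders.
(comment
  (sort-by-root [{:id 11 :index 0 :url "https://docs.gitlab.com/ce/"}
                 {:id 12 :index 1 :url "https://news.ycombinator.com/"}
                 {:id 13 :index 2 :url "https://gitlab.com/some/project"}]
                example-url-times
                example-site-times
                {:sound-to-left? false})
  ;; => e.g. ({:index 0 :id 11} {:index 1 :id 13} {:index 2 :id 12})
  ;; Tabs sharing the gitlab.com root stay adjacent (ordered by their own
  ;; scores) ahead of the lone news.ycombinator.com tab, assuming more time
  ;; was accumulated on the gitlab.com root.
  )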
@@ -52,6 +52,14 @@
      lower-case
      trim)))
+(defn root
+  "Returns the root domain for a host"
+  [host]
+  (->> (string/split (lower-case (or host ""))
+                     #"\.")
+       (take-last 2)
+       (string/join ".")))
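;; Usage sketch, with values taken from the new test-root cases further below:
(comment
  (root "www.google.com")      ;; => "google.com"
  (root "some.sub.domain.com") ;; => "domain.com"
  (root "localhost")           ;; => "localhost"
  (root nil))                  ;; => ""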
(defn protocol
  "Returns the protocol for a URL"
  [url]
@@ -67,7 +75,9 @@
  (and (some? url)
       (some? (re-find #"\bhttps?:" (protocol url)))))
-(defn host-key [host]
+(defn host-key
+  "Returns a key for a hostname, or 0 if the hostname is empty"
+  [host]
  (if (not-empty host)
    (hash-string (trim (lower-case host)))
    0))
@@ -90,12 +100,11 @@
  "Splits a string using commas, semi-colons or new lines, trims the resulting
  elements, and returns them as a set"
  [s]
-  (->>
-    (string/split (or s "") #",|\n|;| ")
+  (->> (string/split (lower-case (or s ""))
+                     #",|\n|;| ")
       (map string/trim)
       (remove empty?)
-       (map string/lower-case)
       (into #{})))
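;; Usage sketch (not from the diff): the input is lower-cased and split on
;; commas, semicolons, spaces, or newlines, with empty entries dropped.
(comment
  (to-string-set "Reddit,Twitter; Facebook\nYouTube")
  ;; => #{"reddit" "twitter" "facebook" "youtube"}
  )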
(defn time-display
  "Returns a display string for a number of seconds"
...
@@ -437,15 +437,22 @@
             {:url "http://www.kitco.com/market/"
              :time 4
              :ts 1446051494575
+             :title "New York spot price Gold..."}
+            -24505671
+            {:url "http://kitco.com/market/"
+             :time 2
+             :ts 1446051494575
              :title "New York spot price Gold..."}}
        acc (data/accumulate-site-times data)]
    ;; There should be no empty hostnames
    ;; We check (get acc 0) because the result is indexed by the host-key,
    ;; which returns 0 on nil or empty.
    (is (nil? (get acc 0)))
-    ;; Let's verify we got the right data
-    (is (= {971841386 {:time 39, :icon nil, :host "numergent.com"},
-            -915908674 {:time 4, :icon nil, :host "www.kitco.com"}}
+    ;; Let's verify we got the right data. Notice that accumulate-site-times
+    ;; does not take into account differences in the root domain.
+    (is (= {971841386 {:time 39 :icon nil :host "numergent.com"}
+            -915908674 {:time 4 :icon nil :host "www.kitco.com"}
+            996869973 {:time 2 :icon nil :host "kitco.com"}}
           acc))))
  (testing "Accumulate site times disregards the port for the URL when accumulating"
    (let [data {2080624698
@@ -482,8 +489,40 @@
    (is (= {971841386 {:time 27 :icon nil :host "numergent.com"}
            -915908674 {:time 4 :icon nil :host "www.kitco.com"}
            -1536293812 {:time 37 :icon nil :host "google.com"}}
-          acc))))
-  )
+          acc)))))
+(deftest test-accumulate-root-times
+  (let [data {2080624698
+              {:url "/tags/khroma/"
+               :time 117
+               :ts 1445964037798
+               :title "Khroma articles"}
+              -526558523
+              {:url "https://numergent.com/opensource/"
+               :time 27
+               :ts 1445964037798
+               :title "Open source projects"}
+              -327774960
+              {:url "https://www.numergent.com/tags/khroma/"
+               :time 12
+               :ts 1445964037798
+               :title "Khroma articles"}
+              1917381154
+              {:url "http://www.kitco.com/market/"
+               :time 4
+               :ts 1446051494575
+               :title "New York spot price Gold..."}
+              -24505671
+              {:url "http://KITCO.com/market/"
+               :time 3
+               :ts 1446051494575
+               :title "New York spot price Gold..."}}
+        site-times (data/accumulate-site-times data)
+        root-times (data/accumulate-root-times site-times)]
+    (testing "Accumulate root times lumps together pages on the same root"
+      (is (= {"numergent.com" 39
+              "kitco.com" 7}
+             root-times)))))
(deftest test-accumulate-after-clean-up
...
@@ -99,6 +99,17 @@
    ))
+(deftest test-root
+  (are [host name] (= (utils/root host) name)
+       "www.google.com" "google.com"
+       "WWW.google.COM" "google.com"
+       "some.sub.domain.com" "domain.com"
+       "localhost" "localhost"
+       "" ""
+       nil ""
+       ))
(deftest test-protocol
  (are [url name] (= (utils/protocol url) name)
       "https://www.google.com" "https:"
...