Commit a61a15f8 authored by Ricardo J. Mendez

Filtering out ignored sites on data import

parent a47d86c9
......@@ -29,8 +29,9 @@
;; Keys of a tab map that `select-tab-keys` keeps.
(def relevant-tab-keys [:windowId :id :active :url :start-time :title :favIconUrl])
;; One-argument function that narrows a tab map down to `relevant-tab-keys`.
(def select-tab-keys #(select-keys % relevant-tab-keys))
;; Returns the current time as reported by js/Date's `now`.
(defn now [] (.now js/Date))
;; Fallback ignore set, used on data load when the migrated data
;; carries no :ignore-set of its own.
(def default-ignore-set #{"localhost" "newtab"})
;; NOTE(review): `now` is defined twice in this view; this looks like the
;; removed/added pair of a diff rather than an intentional redefinition — confirm.
(defn now [] (.now js/Date))
;;;;-------------------------------------
......@@ -139,11 +140,13 @@
(fn [app-state [_ loaded]]
(let [migrated (migrations/migrate-to-latest loaded)
t (now)
ignore-set (or (:ignore-set migrated) default-ignore-set)
new-urls (->
(:url-times migrated)
(data/time-clean-up (- t (* 7 ms-day)) 30)
(data/time-clean-up (- t (* 14 ms-day)) 90)
(data/time-clean-up (- t (* 30 ms-day)) 300))
(data/clean-up-by-time (- t (* 7 ms-day)) 30)
(data/clean-up-by-time (- t (* 14 ms-day)) 90)
(data/clean-up-by-time (- t (* 30 ms-day)) 300)
(data/clean-up-ignored ignore-set))
site-data (:site-times migrated)
new-sites (if (not= new-urls (:url-times migrated))
(->>
......@@ -153,7 +156,7 @@
(assoc (val %) :icon (get-in site-data [(key %) :icon]))))
(into {}))
site-data)
new-data (assoc migrated :url-times new-urls :site-times new-sites)]
new-data (assoc migrated :url-times new-urls :site-times new-sites :ignore-set ignore-set)]
; (console/trace "Data load" loaded "migrated" new-data)
;; Save the migrated data we just received
(io/save new-data)
......
......@@ -18,7 +18,7 @@
)
(defn time-clean-up
(defn clean-up-by-time
"Removes from url-times all the items that are older than cut-off-ts
and which were viewed for less than min-seconds"
[url-times cut-off-ts min-seconds]
......@@ -27,6 +27,14 @@
(< (:time (val %)) min-seconds))
url-times)))
(defn clean-up-ignored
  "Returns a map with the entries of url-times, minus every entry for
  which `hostname` applied to the entry's :url is a member of ignore-set.
  Entries that do not match are carried over unchanged."
  [url-times ignore-set]
  (let [ignored? (fn [entry]
                   ;; entry is a key/value pair; the :url lives in the value
                   (let [host (hostname (:url (val entry)))]
                     (contains? ignore-set host)))]
    (into {} (remove ignored?) url-times)))
(defn track-url-time
"Receives a url time database, a tab record and a time to track, and returns
new time database which is the result of adding the time to the URL. It also
......
......@@ -338,10 +338,10 @@
)
(deftest test-time-clean-up
(deftest test-clean-up-by-time
(testing "Clean up date and minimum time are respected"
(let [min-date 1446028215913
pruned (data/time-clean-up (:url-times test-db) min-date 30)]
pruned (data/clean-up-by-time (:url-times test-db) min-date 30)]
(is pruned)
(is (= 5 (count pruned)))
;; We removed the right elements
......@@ -352,7 +352,7 @@
))
(testing "Timestamp filtering is only on strictly greater than"
(let [min-date 1446114615912
pruned (data/time-clean-up (:url-times test-db) min-date 30)]
pruned (data/clean-up-by-time (:url-times test-db) min-date 30)]
(is pruned)
(is (= 3 (count pruned)))
;; getprismatic is still there
......@@ -363,7 +363,7 @@
))
(testing "Cut-off seconds are respected when filtering"
(let [min-date 1446114615912
pruned (data/time-clean-up (:url-times test-db) min-date 28)]
pruned (data/clean-up-by-time (:url-times test-db) min-date 28)]
(is pruned)
(is (= 4 (count pruned)))
;; getprismatic is still there
......@@ -371,13 +371,34 @@
;; ... and we didn't lose splunk
(is (get pruned (utils/url-key "http://splunk.com/"))))
(let [min-date 1446114615913
pruned (data/time-clean-up (:url-times test-db) min-date 50)]
pruned (data/clean-up-by-time (:url-times test-db) min-date 50)]
(is pruned)
(is (= 2 (count pruned)))
(is (= #{-327774960 -327358142}
(into #{} (keys pruned)))))
))
;; Exercises data/clean-up-ignored against the shared test-db fixture:
;; ignore sets that match nothing must leave url-times untouched, while
;; matching domains must drop exactly the entries for those domains.
(deftest test-clean-up-ignored
  (let [url-times (:url-times test-db)]
    (testing "Ignore sets that match nothing leave the data untouched"
      (is (= url-times
             (data/clean-up-ignored url-times #{}))
          "Passing an empty set should not change things")
      (is (= url-times
             (data/clean-up-ignored url-times #{"localhost" "somedomain.com"}))
          "Passing a set of not-matching domain does not change things"))
    (testing "Removing a single domain"
      ;; The two dissoc'ed keys are expected to be the numergent.com entries
      ;; of test-db (fixture is defined elsewhere in the file).
      (let [result (data/clean-up-ignored url-times #{"localhost" "numergent.com"})]
        (is (= result (dissoc url-times -327774960 -526558523))
            "We should have removed the numergent-associated urls")
        (is (= 5 (count result)))))
    (testing "Removing multiple domains"
      ;; Same as above plus one more key, presumably the getprismatic.com
      ;; entry — verify against test-db.
      (let [result (data/clean-up-ignored url-times #{"localhost" "getprismatic.com" "numergent.com"})]
        (is (= result (dissoc url-times -327774960 -526558523 1609181525))
            "We should have removed the numergent- and getprismatic-associated urls")
        (is (= 4 (count result)))))))
(deftest test-accumulate-site-times
(testing "Accumulate site times creates a total but doesn't add favicons"
(is (= (into {} (map #(vector (key %) (assoc (val %) :icon nil))
......@@ -419,7 +440,7 @@
(deftest test-accumulate-after-clean-up
(testing "We get a value accumulation per site time after clean up"
(let [min-date 1446114615912
pruned (data/time-clean-up (:url-times test-db) min-date 30)
pruned (data/clean-up-by-time (:url-times test-db) min-date 30)
site-times (data/accumulate-site-times pruned)]
(is pruned)
(is (= {971841386 {:icon nil
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment