Commit a61a15f8 authored by Ricardo J. Mendez

Filtering out ignored sites on data import

parent a47d86c9
...@@ -29,8 +29,9 @@ ...@@ -29,8 +29,9 @@
(def relevant-tab-keys [:windowId :id :active :url :start-time :title :favIconUrl]) (def relevant-tab-keys [:windowId :id :active :url :start-time :title :favIconUrl])
(def select-tab-keys #(select-keys % relevant-tab-keys)) (def select-tab-keys #(select-keys % relevant-tab-keys))
(defn now [] (.now js/Date)) (def default-ignore-set #{"localhost" "newtab"})
(defn now [] (.now js/Date))
;;;;------------------------------------- ;;;;-------------------------------------
...@@ -137,23 +138,25 @@ ...@@ -137,23 +138,25 @@
(register-handler (register-handler
:data-load :data-load
(fn [app-state [_ loaded]] (fn [app-state [_ loaded]]
(let [migrated (migrations/migrate-to-latest loaded) (let [migrated (migrations/migrate-to-latest loaded)
t (now) t (now)
new-urls (-> ignore-set (or (:ignore-set migrated) default-ignore-set)
(:url-times migrated) new-urls (->
(data/time-clean-up (- t (* 7 ms-day)) 30) (:url-times migrated)
(data/time-clean-up (- t (* 14 ms-day)) 90) (data/clean-up-by-time (- t (* 7 ms-day)) 30)
(data/time-clean-up (- t (* 30 ms-day)) 300)) (data/clean-up-by-time (- t (* 14 ms-day)) 90)
site-data (:site-times migrated) (data/clean-up-by-time (- t (* 30 ms-day)) 300)
new-sites (if (not= new-urls (:url-times migrated)) (data/clean-up-ignored ignore-set))
(->> site-data (:site-times migrated)
;; Accumulate site times but preserve the icons we had before new-sites (if (not= new-urls (:url-times migrated))
(data/accumulate-site-times new-urls) (->>
(map #(vector (key %) ;; Accumulate site times but preserve the icons we had before
(assoc (val %) :icon (get-in site-data [(key %) :icon])))) (data/accumulate-site-times new-urls)
(into {})) (map #(vector (key %)
site-data) (assoc (val %) :icon (get-in site-data [(key %) :icon]))))
new-data (assoc migrated :url-times new-urls :site-times new-sites)] (into {}))
site-data)
new-data (assoc migrated :url-times new-urls :site-times new-sites :ignore-set ignore-set)]
; (console/trace "Data load" loaded "migrated" new-data) ; (console/trace "Data load" loaded "migrated" new-data)
;; Save the migrated data we just received ;; Save the migrated data we just received
(io/save new-data) (io/save new-data)
......
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
) )
(defn time-clean-up (defn clean-up-by-time
"Removes from url-times all the items that are older than cut-off-ts "Removes from url-times all the items that are older than cut-off-ts
and which were viewed for less than min-seconds" and which were viewed for less than min-seconds"
[url-times cut-off-ts min-seconds] [url-times cut-off-ts min-seconds]
...@@ -27,6 +27,14 @@ ...@@ -27,6 +27,14 @@
(< (:time (val %)) min-seconds)) (< (:time (val %)) min-seconds))
url-times))) url-times)))
(defn clean-up-ignored
  "Returns a map holding only the url-times entries whose URL hostname
  is not a member of ignore-set."
  [url-times ignore-set]
  (let [ignored? (fn [entry]
                   ;; An entry is ignored when the hostname of its :url is in the set
                   (contains? ignore-set (hostname (:url (val entry)))))]
    (->> url-times
         (remove ignored?)
         (into {}))))
(defn track-url-time (defn track-url-time
"Receives a url time database, a tab record and a time to track, and returns "Receives a url time database, a tab record and a time to track, and returns
new time database which is the result of adding the time to the URL. It also new time database which is the result of adding the time to the URL. It also
......
...@@ -338,10 +338,10 @@ ...@@ -338,10 +338,10 @@
) )
(deftest test-time-clean-up (deftest test-clean-up-by-time
(testing "Clean up date and minimum time are respected" (testing "Clean up date and minimum time are respected"
(let [min-date 1446028215913 (let [min-date 1446028215913
pruned (data/time-clean-up (:url-times test-db) min-date 30)] pruned (data/clean-up-by-time (:url-times test-db) min-date 30)]
(is pruned) (is pruned)
(is (= 5 (count pruned))) (is (= 5 (count pruned)))
;; We removed the right elements ;; We removed the right elements
...@@ -352,7 +352,7 @@ ...@@ -352,7 +352,7 @@
)) ))
(testing "Timestamp filtering is only on strictly greater than" (testing "Timestamp filtering is only on strictly greater than"
(let [min-date 1446114615912 (let [min-date 1446114615912
pruned (data/time-clean-up (:url-times test-db) min-date 30)] pruned (data/clean-up-by-time (:url-times test-db) min-date 30)]
(is pruned) (is pruned)
(is (= 3 (count pruned))) (is (= 3 (count pruned)))
;; getprismatic is still there ;; getprismatic is still there
...@@ -363,7 +363,7 @@ ...@@ -363,7 +363,7 @@
)) ))
(testing "Cut-off seconds are respected when filtering" (testing "Cut-off seconds are respected when filtering"
(let [min-date 1446114615912 (let [min-date 1446114615912
pruned (data/time-clean-up (:url-times test-db) min-date 28)] pruned (data/clean-up-by-time (:url-times test-db) min-date 28)]
(is pruned) (is pruned)
(is (= 4 (count pruned))) (is (= 4 (count pruned)))
;; getprismatic is still there ;; getprismatic is still there
...@@ -371,13 +371,34 @@ ...@@ -371,13 +371,34 @@
;; ... and we didn't lose splunk ;; ... and we didn't lose splunk
(is (get pruned (utils/url-key "http://splunk.com/")))) (is (get pruned (utils/url-key "http://splunk.com/"))))
(let [min-date 1446114615913 (let [min-date 1446114615913
pruned (data/time-clean-up (:url-times test-db) min-date 50)] pruned (data/clean-up-by-time (:url-times test-db) min-date 50)]
(is pruned) (is pruned)
(is (= 2 (count pruned))) (is (= 2 (count pruned)))
(is (= #{-327774960 -327358142} (is (= #{-327774960 -327358142}
(into #{} (keys pruned))))) (into #{} (keys pruned)))))
)) ))
(deftest test-clean-up-ignored
  ;; Exercises data/clean-up-ignored against the url-times of the shared test-db
  (let [url-times (:url-times test-db)]
    (is (= url-times
           (data/clean-up-ignored url-times #{}))
        "Passing an empty set should not change things")
    (is (= url-times
           (data/clean-up-ignored url-times #{"localhost" "somedomain.com"}))
        "Passing a set of not-matching domain does not change things")
    ;; Test removing a domain
    (let [result (data/clean-up-ignored url-times #{"localhost" "numergent.com"})]
      (is (= result (dissoc url-times -327774960 -526558523))
          "We should have removed the numergent-associated urls")
      (is (= 5 (count result))))
    ;; Test removing multiple domains
    (let [result (data/clean-up-ignored url-times #{"localhost" "getprismatic.com" "numergent.com"})]
      ;; The failure message names both ignored domains so a failure here is
      ;; distinguishable from the single-domain case above
      (is (= result (dissoc url-times -327774960 -526558523 1609181525))
          "We should have removed the numergent- and getprismatic-associated urls")
      (is (= 4 (count result))))))
(deftest test-accumulate-site-times (deftest test-accumulate-site-times
(testing "Accumulate site times creates a total but doesn't add favicons" (testing "Accumulate site times creates a total but doesn't add favicons"
(is (= (into {} (map #(vector (key %) (assoc (val %) :icon nil)) (is (= (into {} (map #(vector (key %) (assoc (val %) :icon nil))
...@@ -419,7 +440,7 @@ ...@@ -419,7 +440,7 @@
(deftest test-accumulate-after-clean-up (deftest test-accumulate-after-clean-up
(testing "We get a value accumulation per site time after clean up" (testing "We get a value accumulation per site time after clean up"
(let [min-date 1446114615912 (let [min-date 1446114615912
pruned (data/time-clean-up (:url-times test-db) min-date 30) pruned (data/clean-up-by-time (:url-times test-db) min-date 30)
site-times (data/accumulate-site-times pruned)] site-times (data/accumulate-site-times pruned)]
(is pruned) (is pruned)
(is (= {971841386 {:icon nil (is (= {971841386 {:icon nil
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment