Commit 842a9cf6 authored by Danny Freeman's avatar Danny Freeman

renamed hy-code directory to src

parent dba8b0c2
\ No newline at end of file
* tweet_getter
To use this script, simply invoke it with the twitter name and output file name as arguments.
#+BEGIN_SRC bash
workon env-with-hy
hy tweet_getter.hy twitter-handle output-file-name
* transition_matrix
Uses tweet_tokenizer.hy and output files from tweet_getter.hy.
(import [transition_matrix [create-from-file]])
(def transition-matrix (create-from-file "path/to/text/file"))
* TODO Control
Create list of authors with 2+ books.
Take one of their books, tokenize it, hold onto those along with author's name.
Take excerpts from the other books, and guess the authors' names.
Then simply do the same with tweets.
Try it with caps preserved or not.
Try it with punctuation removed or not.
Try it with numbers->words.
Try it with words.
Choose the most effective method, or a combination of both.
This diff is collapsed.
(require [extensions [*]])
(import [extensions [*]])
(comment "TEST"
(import [math [log pow]])
(range -10 0)
(map (fn [x] (pow 2 x)))
(map log)
(+ [0])
(map (fn [x] {:probability x}))
(adjust-result-probabilities True)))
(defn adjust-result-probabilities [normalize? test-results]
(defn adjust []
(defn get-largest []
(->> test-results
(map (fn [x] (:probability x)))
(filter (fn [x] (!= x 0.0)))
((fn [f] (if (empty? f)
(reduce max f))))))
(setv largest (get-largest))
(defn result-mapper [result]
(setv p (:probability result))
(if (zero? p)
(merge result {:probability (/ largest p)})))
(if (zero? largest)
(->> test-results
(map result-mapper)
(if (not normalize?)
(defn calc-correct-guess-percentage [experiment-results]
(->> experiment-results
(map (fn [r] (:correct? r)))
(filter (fn [v] v))
((fn [l] (/ l (len experiment-results))))))
(import re)
(import pickle)
(import os)
(require [hy.contrib.loop [loop]])
(import [collections [Counter]])
;;; Macros ;;;
(defmacro map-l [func coll]
  "Apply func over coll and realize the lazy map object as a list."
  `(list (map ~func ~coll)))
(defmacro rest-l [sequence]
  "Drop the first element of sequence and realize the remainder as a list."
  `(list (rest ~sequence)))
(defmacro comment [&rest stuff]
(defmacro λ [args body]
  ;; Lambda shorthand: (λ [x] (+ x 1)) expands to (fn [x] (+ x 1)).
  ;; Accepts exactly one body form.
  `(fn ~args ~body))
;;; Functions ;;;
(defn list-equals [x y]
  "True when x and y hold the same elements with the same multiplicities,
  ignoring order. Elements must be hashable."
  (setv counted-x (Counter x))
  (setv counted-y (Counter y))
  (= counted-x counted-y))
(defn None? [obj]
  "True when obj is the None singleton (identity check, not equality)."
  (is obj None))
(defn one? [n]
  "True when n equals 1."
  (= 1 n))
(defn merge [&rest maps]
  "Merge dictionaries left to right; on duplicate keys the right-most
  map's value wins.
  NOTE(review): relies on `merge-with`, which is not defined in this view —
  presumably brought in by a require; confirm it is in scope."
  (->> maps
       (reduce (fn [d1 d2] (merge-with (fn [x y] y) d1 d2)))))
(defn file-exists? [filename]
  "True when filename names an existing regular file (not a directory)."
  (.isfile os.path filename))
(defn regex-match? [regex-string s]
"(regex-match? regex string)"
(-> (.compile re regex-string)
(.match s)
(defn slurp [filename]
  "Reads in a pickled object from filename and returns it.
  Uses `with` so the file handle is closed even when unpickling fails;
  the original left the handle open."
  (with [f (open filename "rb")]
    (.load pickle f)))
(defn spit [obj filename]
  "Writes obj to filename as a pickle.
  Uses `with` so the file handle is flushed and closed deterministically;
  the original left the handle open."
  (with [f (open filename "wb")]
    (.dump pickle obj f)))
(defn print-> [value]
  "Print value to stdout, then return it unchanged — a pass-through tap
  for use inside -> / ->> threading chains."
  (print value)
  value)
(import [extensions [*]])
(require [extensions [*]])
(import [math [log]])
(require [hy.contrib.loop [loop]])
;; Functions ;;
(defn -get-single-probability [markov-chain first-word following-word]
(setv probs (.get markov-chain first-word None))
(if (None? probs)
(if (None? following-word)
(.get probs following-word 0))))
(defn get-average-probability [markov-chain tokenized-text]
  "Gets the average probability that a tweet was generated by a certain matrix."
  ;; Adds the transition probability of the (first, second) token pair to acc.
  (defn update-acc [acc coll]
    (+ acc (-get-single-probability markov-chain (first coll) (second coll))))
  ;; Walk the token list one step at a time, summing transition probabilities,
  ;; then divide the sum by the number of steps taken.
  (loop [[probs 0] [remaining tokenized-text] [total 0]]
        (if (empty? remaining)
            ;; NOTE(review): divides by zero when tokenized-text is empty —
            ;; confirm callers never pass [].
            (/ probs total)
            (recur (update-acc probs remaining) (list (rest remaining)) (+ total 1)))))
(defn get-probability [markov-chain tokenized-text]
  "Gets the probability that a tweet was generated by a certain matrix."
  ;; Multiplies the single-step transition probabilities across the token list.
  (loop [[acc 1] [remaining tokenized-text]]
        (if (= 1 (len remaining))
            ;; NOTE(review): as written this returns (log acc) only when acc is
            ;; 0 or 1 — log 0 would raise — and returns None otherwise. A branch
            ;; was likely lost in this collapsed diff view (the conditions look
            ;; inverted); confirm against the full file before relying on it.
            (if (or (zero? acc)
                    (= acc 1))
                (log acc)) ;; This represents the magnitude of a probability, which if not 1 or 0, will most likely be close to 0.
            (recur (* acc (-get-single-probability markov-chain (first remaining) (second remaining)))
                   (list (rest remaining))))))
(defn get-percent-text-is-represented [markov-chain tokenized-text]
(defn transition-exists [t1 t2]
(or (= t2 None)
(and (in t1 markov-chain) (in t2 (get markov-chain t1)))))
(loop [[tokens tokenized-text] [matched-transitions 0]]
(if (empty? tokens)
(/ matched-transitions (len tokenized-text))
(recur (list (rest tokens))
(if (transition-exists (first tokens) (second tokens))
(inc matched-transitions)
(defn update-markov-chain [markov-chain token next-token]
(if (None? next-token)
(do (unless (in token markov-chain)
(assoc markov-chain token {}))
(setv current-token (get markov-chain token))
(if (in next-token current-token)
(assoc current-token next-token (inc (get current-token next-token)))
(assoc current-token next-token 1))
(defn normalize-markov-chain [markov-chain]
(for [word markov-chain]
(setv word-dict (get markov-chain word))
(setv total (reduce + (.values word-dict)))
(for [k word-dict]
(assoc word-dict k (/ (get word-dict k) total))))
(defn create-markov-chain-single-text [txt markov-chain &optional (normalize True)]
(for [i (range 0 (len txt))]
(if (= i (dec (len txt)))
(update-markov-chain markov-chain (get txt i) (get txt (inc i)))))
(if normalize
(normalize-markov-chain markov-chain)
(defn create-markov-chain [tokenized-text &optional (normalize True)]
"Creates a transition matrix. Normalizes the matrix by default. (all possible transitions from a word sum up to 1)"
(setv markov-chain {})
(for [tweet tokenized-text]
(setv markov-chain (create-markov-chain-single-text tweet markov-chain False)))
(if normalize
(normalize-markov-chain markov-chain)
;;(loop [[tweets tokenized-text] [transitions {}]]
;; (if (None? (first tweets))
;; (if normalize
;; (normalize-markov-chain transitions)
;; transitions)
;; (recur (rest tweets)
;; (create-markov-chain-single-text (first tweets) transitions False))))
(import [math [*]])
(import [sys [*]])
(defn test []
(setv small-vals (->> (range -50 -30)
(map (fn [i] (pow 10.456 i)))
(print "These are the really small values")
(print small-vals)
(print "they are ordered: " (= (sorted (.copy small-vals)) small-vals))
(setv largest (->> small-vals
(map (fn [i] (log i)))
(setv normalized-vals (->> small-vals
(map (fn [i] (/ largest (log i))))
(print "These are the values after their magnitude is accounted for")
(print normalized-vals)
(print "They are still in the correct order: "(= (sorted (.copy normalized-vals)) normalized-vals)))
(require extensions)
(require [hy.contrib.loop [loop]])
(require [hy.extra.anaphoric [*]])
(import re) ;; regular expressions
(import [extensions [*]])
;; Tokenizers ;;
(defn tokenize-by-char [text &optional [transformer (fn [s] s)]]
"Takes a full string of text and tokenizes it into a list of characters after applying the transformer to it."
(->> text
(defn tokenize-by-word [text &optional [transformer (fn [s] s)]]
"Takes a full string of text and tokenizes it into a list of words after applying the transformer to it."
(import [nltk.tokenize [word-tokenize]])
(->> text
(defn tokenize-tweet-by-word [text &optional [transformer (fn [s] s)]]
  ;; Tokenizes a tweet into a word list with NLTK's TweetTokenizer
  ;; (handles kept, character lengthening reduced).
  ;; NOTE(review): `transformer` is never applied in this view — a line was
  ;; probably collapsed out of the diff; confirm against the full file.
  (import [nltk.tokenize [TweetTokenizer]])
  (defn create-tokenizer []
    (TweetTokenizer :strip-handles False
                    :reduce-len True))
  (->> text
       (.tokenize (create-tokenizer))))
(defn tokenize-tweet-by-char [text &optional [transformer (fn [s] s)]]
(->> (tokenize-tweet-by-word text transformer)
(.join " ")
;; Text transformers ;;
;; Creates a transformer that removes punctuation, converts to lower case
;; (build-text-transfomer strip-punctuation-transfomer (fn [t] (.lower t)))
(defn compose-text-transformers [transform-functions]
  "Takes a list of text-transforming functions and returns one function that
  feeds the text through each of them in order (left to right), for use with
  the tokenizing functions."
  (fn [text]
    (setv result text)
    (for [transform transform-functions]
      (setv result (transform result)))
    result))
(defn strip-punctuation-transformer [text]
  "Delete punctuation characters from text via a str.translate table.
  This should probably be used with the whitespace normalizer."
  (setv deletion-table
        (.maketrans str "" "" "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~«»¨©¦§"))
  (.translate text deletion-table))
(defn strip-stopwords-and-whitespace-transformer [text]
  "Drop English stopwords from text and collapse whitespace runs to single
  spaces. Builds a set of stopwords so each membership test is O(1) instead
  of scanning NLTK's stopword list per word."
  (import [nltk.corpus [stopwords]])
  (setv stops (set (.words stopwords "english")))
  (->> (.split text)
       (filter (fn [word] (not (in word stops))))
       (.join " ")))
(defn lower-case-transformer [text]
  "Lower-case the entire text."
  (.lower text))
(defn normalize-url-transformer [text]
  "Remove URLs from text so links do not bias the transition matrix."
  ;; Fix: the original pattern was "https?:\/\/\/[a-zA-Z0-9\-\.]{8}" — it
  ;; required THREE slashes after the scheme and excluded "/" from the body,
  ;; so it never matched real links such as https://t.co/abc123. The pattern
  ;; below requires the normal two slashes and allows "/" in the URL body.
  (.sub re "https?:\/\/[a-zA-Z0-9\-\.\/]+" "" text))
(defn normalize-handle-transformer [text]
  ;; Replace @mentions with the literal token "@TwitterHandle" so user names
  ;; do not bias the transition matrix. The look-behind requires the "@" to be
  ;; at the start of the string or preceded by a non-handle character.
  (.sub re "(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)" "@TwitterHandle" text))
(defn normalize-whitespace-transformer [text]
  "Collapse every run of whitespace in text to a single space."
  ;; Fix: the original threaded the raw string straight into (.join " "),
  ;; which interleaves a space between every CHARACTER. Splitting on
  ;; whitespace first (str.split with no args) restores the intended
  ;; collapse-whitespace behavior.
  (->> (.split text)
       (.join " ")))
;; OLD ;;
(defn -replace-in-tweet [tweet-word-list substitutions]
"This must replace tweets based on what is in the substitutions"
(loop [[subs substitutions] [tweet tweet-word-list]]
(defn replacement-mapper [word]
(if (regex-match? (first ((first subs))) word)
(second (first subs))
(if (empty? subs)
(list tweet)
(recur (rest-l subs) (map replacement-mapper tweet)))))
(defn tokenize-tweet [tokenizer-fn tweet &optional [substitutions []]]
  "Tokenize tweet with tokenizer-fn, then apply the regex substitution
  pairs to the resulting token list."
  (-replace-in-tweet (tokenizer-fn tweet) substitutions))
(defn tokenize-tweet-coll [tokenizer-fn tweet-coll &optional [substitutions []]]
  "Tokenize every tweet in tweet-coll with tokenizer-fn, applying the regex
  substitution pairs to each, and return the results as a list."
  (list (map (fn [tweet] (tokenize-tweet tokenizer-fn tweet substitutions))
             tweet-coll)))
(defn create-NLTK-tweet-tokenizer [tweet]
  "Tokenize one tweet with NLTK's TweetTokenizer: case preserved, @handles
  kept, repeated-character lengthening reduced."
  (import [nltk.tokenize [TweetTokenizer]])
  (setv tokenizer (TweetTokenizer :preserve-case True
                                  :strip-handles False
                                  :reduce-len True))
  (.tokenize tokenizer tweet))
(defn create-NLTK-char-tokenizer [tweet]
  ;; Lower-cases the tweet and tokenizes it by character.
  ;; NOTE(review): `CharTokenizer` is not defined or imported anywhere in this
  ;; view — calling this as-is would raise NameError; confirm its origin.
  (.tokenize (CharTokenizer) (.lower tweet)))
(defn tokenize-coll-as-NLTK-tweet [tweet-coll &optional [substitutions []]]
  "Tokenize every tweet in tweet-coll with the NLTK tweet tokenizer,
  applying the given regex substitution pairs to each."
  (tokenize-tweet-coll create-NLTK-tweet-tokenizer tweet-coll substitutions))
(defn get-url-substitution []
  "Return a [regex replacement] pair that deletes URLs from tweet text."
  ;; Fix: the original regex required three slashes after the scheme and
  ;; excluded "/" from the character class, so it never matched real
  ;; shortened links like https://t.co/abc123.
  ["https?:\/\/[a-zA-Z0-9\-\.\/]+" ""])
(defn get-handle-substitution []
  ;; [regex replacement] pair that rewrites @mentions to the fixed token
  ;; "@TwitterHandle" so user names do not bias the transition matrix.
  ["(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)" "@TwitterHandle"])
(defn get-all-substitutions []
  "All [regex replacement] substitution pairs: URL removal, then @handle
  normalization."
  [(get-url-substitution) (get-handle-substitution)])
;; (defn tokenize-file [filename &optional [normalize-urls True] [normalize-handles True]]
;; "By default, will normalize URL's and Usernames in a tweet."
;; (setv replacements [(if normalize-urls (get-url-substitution))
;; (if normalize-handles (get-handle-substitution))])
;; (tokenize-coll-as-NLTK-tweet (-read-file filename) replacements))
;; (defn tokenize-tweet-default [tweet]
;; (tokenize-tweet create-NLTK-tweet-tokenizer tweet (get-all-substitutions)))
;; (defn tokenize-tweet-char [tweet]
;; (tokenize-tweet create-NLTK-char-tokenizer tweet (get-all-substitutions)))
(import [api_key [*]])
(import tweepy)
(import [extensions [*]])
(import [user-lists [*]])
(import pickle)
(import os.path)
(require [extensions [*]])
(require [hy.contrib.loop [*]])
;; Global Variables ;;
(def tweet-dir "tweets/journalists/")
(def handles journalist-twitter-accounts)
;; Functions ;;
(defn -create-pickled-tweets-name [handle]
  "Absolute path of the pickle file that stores the tweets for handle,
  built as tweet-dir + handle + \".pickle\"."
  (->> (+ tweet-dir handle ".pickle")
       (.abspath (. os path))))
(defn -get-api []
"Gets an api object with this application's twitter keys"
(setv authenticator (.OAuthHandler tweepy (:ConsumerKey secret-keys)
(:ConsumerSecret secret-keys)))
(setv True)
(.set_access_token authenticator (:AccessToken secret-keys)
(:AccessTokenSecret secret-keys))
(.API tweepy
:auth_handler authenticator
:wait_on_rate_limit True
:wait_on_rate_limit_notify True))
(defn -extract-tweets-from-status [tweets]
"Given a list of tweepy.status objects, extracts tweet texts"
(->> tweets
(map (fn [t] (.encode t.text "utf-8")))
(defn -get-cursor [handle]
"Gets a cursor that iterates of all the tweets posted by screen-name"
(->> (.Cursor tweepy (. (-get-api) user_timeline)
:screen_name handle
:include_rts False)
(defn -get-tweets [handle]
  "Uses a cursor with a limit handler to scrape tweets"
  ;; Pulls statuses from the cursor one at a time until it raises
  ;; StopIteration, then hands the accumulated statuses to
  ;; -extract-tweets-from-status. `cons` prepends, so the accumulator holds
  ;; statuses in the reverse of the order the cursor yielded them.
  (loop [[tweets []] [cursor (-get-cursor handle)]]
        (try (recur (cons (next cursor) tweets) cursor)
             (except [e StopIteration]
               (-extract-tweets-from-status tweets)))))
(defn -dump-tweets-to-file [tweets filename]
"Serializes a tweet collection in a file."
(-> (->> tweets
(map (fn [t] (.decode t "utf-8")))
(spit filename)))
(defn handle-exists? [handle]
(do (.get-user (-get-api) handle)
(except (tweepy.TweepError)
(defn scrape-and-pickle [handle &optional [redownload? False]]
"Scrapes and pickles data for a twitter handle."
(setv filename (-create-pickled-tweets-name handle))
(if (and (handle-exists? handle)
(or redownload?
(not (file-exists? filename))))
(print (+ "Downloading tweets for " handle))
(setv tweets (-get-tweets handle))
(-dump-tweets-to-file tweets filename))))
;; Mass Tweet Downloading Functions ;;
(defn scrape-and-pickle-handles [handle-list &optional [redownload? False]]
"Scrapes and pickles data for a list of twitter handles"
(->> (map (fn [h] (scrape-and-pickle h redownload?)) handle-list)
(defmain [&rest args]
  ;; CLI entry point: scrapes and pickles tweets for every handle in
  ;; `handles`. The second CLI argument, when it case-insensitively equals
  ;; "true", forces a re-download of handles that already have pickles.
  (print "Downloading a lot of data, please wait . . .")
  (scrape-and-pickle-handles handles (-> (second args) (str) (.lower) (= "true"))))
(import [extensions [*]])
(import [text-tokenization [*]])
(import [markov-chains [*]])
(import [experiment-helpers [*]])
(import [tweet-getter [tweet-dir handles]])
(import random)
(require [extensions [comment]])
(require [hy.extra.anaphoric [*]])
;; Global Variables ;;
(def -twitter-text-transformers
(compose-text-transformers [lower-case-transformer normalize-whitespace-transformer])
(compose-text-transformers [strip-stopwords-and-whitespace-transformer])
(compose-text-transformers [strip-stopwords-and-whitespace-transformer lower-case-transformer])
(compose-text-transformers [normalize-whitespace-transformer normalize-handle-transformer normalize-url-transformer])
(compose-text-transformers [normalize-whitespace-transformer normalize-handle-transformer normalize-url-transformer lower-case-transformer])
(compose-text-transformers [normalize-whitespace-transformer normalize-handle-transformer normalize-url-transformer strip-stopwords-and-whitespace-transformer])
(compose-text-transformers [normalize-whitespace-transformer normalize-handle-transformer normalize-url-transformer strip-stopwords-and-whitespace-transformer lower-case-transformer])])
;; Functions ;;
(defn -get-transformer [group-number]
  ;; Get the text-transformer for group-number.
  ;; group-number is 1-based; `dec` converts it to a 0-based vector index.
  (get -twitter-text-transformers (dec group-number)))
(defn -read-all-pickled-tweets []
"Returns a map of handles and tweet collection"
(comment {"handle1" ["t1" "t2" "..."]
"handle2" ["t1" "t2" "..."]
"..." ["..."]})
(->> ;; real-twitter-accounts
;; tweet-dir from tweet-getter module
(map (fn [handle] [handle (+ tweet-dir handle ".pickle")]))
(map (fn [pair] {(first pair) (slurp (second pair))}))
(apply merge)))
;; This seems to be working, here's a test
(-tweets-to-markov-chain (get (-read-all-pickled-tweets) "dhh") (fn [s] s) tokenize-by-word))
(defn -tweets-to-markov-chain [tweets transformer tokenizer]
(->> (map transformer tweets)
(map tokenizer)
(defn -take-random-tweet [tweets]
(setv tweet (.choice random tweets))
(if (empty? tweet)
(-take-random-tweet tweets)
(setv r1 (-run-experiment (-read-all-pickled-tweets) 1 tokenize-by-char get-probability))
(setv handle (->> r1
(filter (fn [r] (= (:handle r) "realDonaldTrump")))
(->> handle (:tweets) (len))
(->> handle (:probabilities))
(->> handle (:best-guess))
(->> handle (:correct?))
(->> handle (:handle))
(->> handle (:random-tweet) (.join "")))
(defn -run-experiment [tweet-corpus group-num tokenizer probability-fn]
(setv transformer (-get-transformer group-num))
(defn handle-mapper [handle]
"Takes a twitter handle and creates a data structure containing the handle,
a random tweet, a collection of all the handle's tweets, and a Markov Chain
representing all the tweets."
(setv tweets (get tweet-corpus handle))
(setv random-tweet (->> (-take-random-tweet tweets)
(setv tweets (->> tweets
(remove (fn [t] (= t random-tweet)))
;; The returned data structure
{:handle handle
:random-tweet random-tweet
:tweets tweets
:markov-chain (-tweets-to-markov-chain tweets transformer tokenizer)})
;; Creates a complete list of the data structures created by handle-mapper
(setv twitter-markov-chain-map (->> tweet-corpus
(map handle-mapper)
(defn experiment-mapper [handle-map]
"Maps elements in twitter-markov-chain-map to a data structure similar to the one
created by handle mapper, but with the experiment results merged in. They are
:probabilities [{:handle handle1 :probability p1}
{:handle handle2 :probability p2}]
:best-guess {:handle n}
:correct? True/False"
(defn probability-mapper [e]
"Takes an entry from the twitter-markov-chain-map, and maps the probability
that each chain could generate the random tweet from handle-map"
{:handle (:handle e)
:probability (probability-fn (:markov-chain e) (:random-tweet handle-map))})
;; Calculate all the probabilities that a tweet could be produced from each Markov Chain
;; in the corpus
(setv probabilities (->> (map probability-mapper twitter-markov-chain-map)
(adjust-result-probabilities (= probability-fn get-probability))))
(defn best-guesser [p1 p2]
(if (>= (:probability p1) (:probability p2))
;; Find the handle with the greatest probability of generating the random tweet.
(setv best-guess (->> probabilities
(reduce best-guesser)))
;; Merge the results of the experiments with the handle map
(merge handle-map {:probabilities probabilities
:best-guess best-guess
:correct? (and (= (:handle best-guess) (:handle handle-map))
(not (zero? (:probability best-guess))))}))
;; Kick off the experiment
(->> twitter-markov-chain-map
(map experiment-mapper)
(comment (print-results tokenize-by-char get-probability))
(defn print-results [tokenizer probability-fn]
(setv tweet-corpus (-read-all-pickled-tweets))
(setv number-of-tests 10)
(setv n [1])