Commit beac5c24 authored by Danny Freeman

uploading project, migrated from github but removed git history

#+TITLE: DHON Project
#+AUTHOR: Danny Freeman
This is my departmental honors thesis, which I completed to earn my Bachelor's degree in computer science.
See [[https://gitlab.com/dannyfreeman/DHON-Project/blob/master/thesis/thesis.org][thesis.org]] to learn about the project.
* tweet_getter
To use this script, simply invoke it with the Twitter handle and the output file name as arguments.
#+BEGIN_SRC bash
workon env-with-hy
hy tweet_getter.hy twitter-handle output-file-name
#+END_SRC
* transition_matrix
Uses tweet_tokenizer.hy and output files from tweet_getter.hy.
#+BEGIN_SRC hy
(import [transition_matrix [create-from-file]])
(def transition-matrix (create-from-file "path/to/text/file"))
#+END_SRC
* TODO Control
- Create a list of authors with 2+ books.
- Take one of each author's books, tokenize it, and hold onto the tokens along with the author's name.
- Take excerpts from the other books and guess the authors' names.
- Then do the same with tweets.
- Try it with capitalization preserved or not.
- Try it with punctuation removed or not.
- Try it with numbers converted to words (numbers->words).
- Try it with words.
- Choose the most effective method, or a combination of them.
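A rough sketch of what one control run could look like, using the chain-building and probability helpers from transition_matrix.hy and the word tokenizer from tweet_tokenizer.hy. =guess-author= and =tokenize-book= below are hypothetical helpers written for illustration, not functions that exist in the project yet.
#+BEGIN_SRC hy
(import [transition_matrix [create-markov-chain get-average-probability]])
(import [tweet_tokenizer [tokenize-by-word]])

(defn guess-author [author-chains excerpt]
  ;; Pick the author whose reference chain gives the excerpt the highest
  ;; average transition probability.
  (setv tokens (tokenize-by-word excerpt))
  (reduce (fn [best author]
            (if (> (get-average-probability (get author-chains author) tokens)
                   (get-average-probability (get author-chains best) tokens))
              author
              best))
          (list (.keys author-chains))))

;; author-chains would map an author's name to a chain built from one of
;; their books, e.g. {"Some Author" (create-markov-chain (tokenize-book "path/to/book"))},
;; where tokenize-book is assumed to return a list of token lists.
#+END_SRC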
(require [extensions [*]])
(import [extensions [*]])
(comment "TEST"
(import [math [log pow]])
(->>
(range -10 0)
(map (fn [x] (pow 2 x)))
(print->)
(map log)
(list)
(+ [0])
(print->)
(map (fn [x] {:probability x}))
(list)
(adjust-result-probabilities True)))
(defn adjust-result-probabilities [normalize? test-results]
  "Rescales each result's :probability to (/ largest p), where largest is the
largest non-zero probability among the results. Zero probabilities, and the
case where every probability is zero, are left untouched. Returns the results
unchanged when normalize? is False."
(defn adjust []
(defn get-largest []
(->> test-results
(map (fn [x] (:probability x)))
(filter (fn [x] (!= x 0.0)))
(list)
((fn [f] (if (empty? f)
0
(reduce max f))))))
(setv largest (get-largest))
(defn result-mapper [result]
(setv p (:probability result))
(if (zero? p)
result
(merge result {:probability (/ largest p)})))
(if (zero? largest)
test-results
(->> test-results
(map result-mapper)
(list))))
(if (not normalize?)
test-results
(adjust)))
(defn calc-correct-guess-percentage [experiment-results]
  "Returns the fraction of experiment results whose :correct? flag is truthy."
(->> experiment-results
(map (fn [r] (:correct? r)))
(filter (fn [v] v))
(list)
(len)
((fn [l] (/ l (len experiment-results))))))
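;; A minimal sketch of the expected input shape (illustrative values, not
;; taken from a real run): each result is a map with a :correct? flag, so two
;; correct guesses out of four gives 0.5.
(comment
  (calc-correct-guess-percentage
    [{:correct? True} {:correct? False} {:correct? True} {:correct? False}]))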
(import re)
(import pickle)
(import os)
(require [hy.contrib.loop [loop]])
(import [collections [Counter]])
;;;;;;;;;;;;;;
;;; Macros ;;;
;;;;;;;;;;;;;;
(defmacro map-l [f coll]
"Wraps a call to map in a call to list"
`(list (map ~f ~coll)))
(defmacro rest-l [coll]
"Wraps a call to rest in a call to list"
`(list (rest ~coll)))
(defmacro comment [&rest stuff]
None)
(defmacro λ [args body]
`(fn ~args ~body))
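;; A quick sketch of how these macros are used (wrapped in `comment` so it
;; never runs):
(comment
  (map-l inc [1 2 3])       ;; same as (list (map inc [1 2 3])) => [2 3 4]
  (rest-l [1 2 3])          ;; same as (list (rest [1 2 3]))    => [2 3]
  ((λ [x] (* x x)) 4))      ;; λ is shorthand for fn            => 16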
;;;;;;;;;;;;;;;;;
;;; Functions ;;;
;;;;;;;;;;;;;;;;;
(defn list-equals [x y]
(= (Counter x) (Counter y)))
(defn None? [obj]
(is obj None))
(defn one? [n]
(= n 1))
(defn merge [&rest maps]
(->> maps
(reduce (fn [d1 d2] (merge-with (fn [x y] y) d1 d2)))))
(defn file-exists? [filename]
(.isfile (. os path) filename))
(defn regex-match? [regex-string s]
"(regex-match? regex string)"
(-> (.compile re regex-string)
(.match s)
(None?)
(not)))
(defn slurp [filename]
"Reads in a pickled object"
(.load pickle (open filename "rb")))
(defn spit [obj filename]
"Writes an object to a pickle file"
(.dump pickle obj (open filename "wb")))
(defn print-> [obj]
"Prints an object and then returns it for use in threading macros."
(print obj) obj)
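;; A small round-trip sketch of the pickle helpers (the path below is just an
;; example location):
(comment
  (spit {"a" 1 "b" 2} "/tmp/example.pickle")
  (print-> (slurp "/tmp/example.pickle")))  ;; prints the dict and returns it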
(import [extensions [*]])
(require [extensions [*]])
(import [math [log]])
(require [hy.contrib.loop [loop]])
;;;;;;;;;;;;;;;
;; Functions ;;
;;;;;;;;;;;;;;;
(defn -get-single-probability [markov-chain first-word following-word]
  "Looks up the transition probability from first-word to following-word.
Returns 0 when first-word is not in the chain; otherwise returns 1 when
following-word is None (i.e. first-word is the last token)."
(setv probs (.get markov-chain first-word None))
(if (None? probs)
0
(if (None? following-word)
1
(.get probs following-word 0))))
(defn get-average-probability [markov-chain tokenized-text]
"Gets the average probability that a tweet was generated by a certain matrix."
(defn update-acc [acc coll]
(+ acc (-get-single-probability markov-chain (first coll) (second coll))))
(loop [[probs 0] [remaining tokenized-text] [total 0]]
(if (empty? remaining)
(/ probs total)
(recur (update-acc probs remaining) (list (rest remaining)) (+ total 1)))))
(defn get-probability [markov-chain tokenized-text]
"Gets the probability that a tweet was generated by a certain matrix."
;; http://math.stackexchange.com/questions/409828/finding-the-probability-from-a-markov-chain-with-transition-matrix
;; https://www.math.ucdavis.edu/~daddel/linear_algebra_appl/Applications/MarkovChain/MarkovChain_9_18/node1.html
(loop [[acc 1] [remaining tokenized-text]]
(if (= 1 (len remaining))
(if (or (zero? acc)
(= acc 1))
acc
(log acc)) ;; This represents the magnitude of a probability, which if not 1 or 0, will most likely be close to 0.
(recur (* acc (-get-single-probability markov-chain (first remaining) (second remaining)))
(list (rest remaining))))))
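;; A tiny worked example with a hand-built, normalized chain: "the" is
;; followed by "cat" 75% of the time and "dog" 25% of the time, so
;; get-probability returns (log 0.75), and get-average-probability returns
;; 0.375 because the trailing "cat" has no outgoing transitions and
;; contributes 0.
(comment
  (setv tiny-chain {"the" {"cat" 0.75 "dog" 0.25}})
  (get-probability tiny-chain ["the" "cat"])
  (get-average-probability tiny-chain ["the" "cat"]))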
(defn get-percent-text-is-represented [markov-chain tokenized-text]
  "Returns the fraction of token transitions in tokenized-text that are
present in markov-chain."
(defn transition-exists [t1 t2]
(or (= t2 None)
(and (in t1 markov-chain) (in t2 (get markov-chain t1)))))
(loop [[tokens tokenized-text] [matched-transitions 0]]
(if (empty? tokens)
(/ matched-transitions (len tokenized-text))
(recur (list (rest tokens))
(if (transition-exists (first tokens) (second tokens))
(inc matched-transitions)
matched-transitions)))))
(defn update-markov-chain [markov-chain token next-token]
  "Increments the count for the token -> next-token transition, creating
entries as needed, and returns the chain."
(if (None? next-token)
markov-chain
(do (unless (in token markov-chain)
(assoc markov-chain token {}))
(setv current-token (get markov-chain token))
(if (in next-token current-token)
(assoc current-token next-token (inc (get current-token next-token)))
(assoc current-token next-token 1))
markov-chain)))
(defn normalize-markov-chain [markov-chain]
  "Converts each word's transition counts into probabilities, so that all
transitions out of a word sum to 1."
(for [word markov-chain]
(setv word-dict (get markov-chain word))
(setv total (reduce + (.values word-dict)))
(for [k word-dict]
(assoc word-dict k (/ (get word-dict k) total))))
markov-chain)
(defn create-markov-chain-single-text [txt markov-chain &optional (normalize True)]
  "Adds every adjacent-token transition in txt to markov-chain, optionally
normalizing it before returning."
(for [i (range 0 (len txt))]
(if (= i (dec (len txt)))
None
(update-markov-chain markov-chain (get txt i) (get txt (inc i)))))
(if normalize
(normalize-markov-chain markov-chain)
markov-chain))
(defn create-markov-chain [tokenized-text &optional (normalize True)]
"Creates a transition matrix. Normalizes the matrix by default. (all possible transitions from a word sum up to 1)"
(setv markov-chain {})
(for [tweet tokenized-text]
(setv markov-chain (create-markov-chain-single-text tweet markov-chain False)))
(if normalize
(normalize-markov-chain markov-chain)
markov-chain)
;;(loop [[tweets tokenized-text] [transitions {}]]
;; (if (None? (first tweets))
;; (if normalize
;; (normalize-markov-chain transitions)
;; transitions)
;; (recur (rest tweets)
;; (create-markov-chain-single-text (first tweets) transitions False))))
)
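;; A minimal sketch of building and querying a chain from already-tokenized
;; text (two short token lists standing in for tweets):
(comment
  (setv chain (create-markov-chain [["i" "like" "cats"]
                                    ["i" "like" "dogs"]]))
  (get chain "i")      ;; => {"like" 1.0}
  (get chain "like"))  ;; => {"cats" 0.5  "dogs" 0.5}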
(import [math [*]])
(import [sys [*]])
(defn test []
(setv small-vals (->> (range -50 -30)
(map (fn [i] (pow 10.456 i)))
(list)
(sorted)))
(print "These are the really small values")
(print small-vals)
(print "they are ordered: " (= (sorted (.copy small-vals)) small-vals))
(setv largest (->> small-vals
(map (fn [i] (log i)))
(last)))
(setv normalized-vals (->> small-vals
(map (fn [i] (/ largest (log i))))
(list)))
(print "These are the values after their magnitude is accounted for")
(print normalized-vals)
(print "They are still in the correct order: "(= (sorted (.copy normalized-vals)) normalized-vals)))
(test)
(require extensions)
(require [hy.contrib.loop [loop]])
(require [hy.extra.anaphoric [*]])
(import re) ;; regular expressions
(import [extensions [*]])
;;;;;;;;;;;;;;;;
;; Tokenizers ;;
;;;;;;;;;;;;;;;;
(defn tokenize-by-char [text &optional [transformer (fn [s] s)]]
"Takes a full string of text and tokenizes it into a list of characters after applying the transformer to it."
(->> text
(transformer)
(list)))
(defn tokenize-by-word [text &optional [transformer (fn [s] s)]]
"Takes a full string of text and tokenizes it into a list of words after applying the transformer to it."
(import [nltk.tokenize [word-tokenize]])
(->> text
(transformer)
(word-tokenize)))
(defn tokenize-tweet-by-word [text &optional [transformer (fn [s] s)]]
(import [nltk.tokenize [TweetTokenizer]])
(defn create-tokenizer []
(TweetTokenizer :strip-handles False
:reduce-len True))
(->> text
(transformer)
(.tokenize (create-tokenizer))))
(defn tokenize-tweet-by-char [text &optional [transformer (fn [s] s)]]
(->> (tokenize-tweet-by-word text transformer)
(.join " ")
(list)))
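;; A quick sketch of the character and tweet tokenizers (the tweet tokenizer
;; requires NLTK to be installed):
(comment
  (tokenize-by-char "cat")  ;; => ["c" "a" "t"]
  (tokenize-tweet-by-word "@someone check this out https://t.co/abc12345"))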
;;;;;;;;;;;;;;;;;;;;;;;
;; Text transformers ;;
;;;;;;;;;;;;;;;;;;;;;;;
;; Example: build a transformer that removes punctuation and converts to lower case:
;; (compose-text-transformers [strip-punctuation-transformer lower-case-transformer])
(defn compose-text-transformers [transform-functions]
"Takes a list of functions, and returns a single function meant to be used in the tokenizing functions"
(fn [text]
(reduce (fn [transformed-text func] (func transformed-text)) transform-functions text)))
(defn strip-punctuation-transformer [text]
"This should probably be used with the whitespace normalizer"
(->> (.maketrans str "" "" "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~«»¨©¦§")
(.translate text)))
(defn strip-stopwords-and-whitespace-transformer [text]
(import [nltk.corpus [stopwords]])
(setv stops (.words stopwords "english"))
(->> (.split text)
(filter (fn [word] (not (in word stops))))
(.join " ")))
(defn lower-case-transformer [text]
(.lower text))
(defn normalize-url-transformer [text]
(.sub re "https?:\/\/t.co\/[a-zA-Z0-9\-\.]{8}" "https://t.co/" text))
(defn normalize-handle-transformer [text]
(.sub re "(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)" "@TwitterHandle" text))
(defn normalize-whitespace-transformer [text]
(->> text
(.split)
(.join " ")))
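;; A minimal composition sketch: lower-case, strip punctuation, then collapse
;; whitespace, applied before tokenizing.
(comment
  (setv clean-text (compose-text-transformers [lower-case-transformer
                                               strip-punctuation-transformer
                                               normalize-whitespace-transformer]))
  (clean-text "Hello,   WORLD!")                  ;; => "hello world"
  (tokenize-by-word "Hello,   WORLD!" clean-text))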
;;;;;;;;;
;; OLD ;;
;;;;;;;;;
(defn -replace-in-tweet [tweet-word-list substitutions]
"This must replace tweets based on what is in the substitutions"
(loop [[subs substitutions] [tweet tweet-word-list]]
(defn replacement-mapper [word]
(if (regex-match? (first (first subs)) word)
(second (first subs))
word))
(if (empty? subs)
(list tweet)
(recur (rest-l subs) (map replacement-mapper tweet)))))
(defn tokenize-tweet [tokenizer-fn tweet &optional [substitutions []]]
(-> (tokenizer-fn tweet)
(-replace-in-tweet substitutions)))
(defn tokenize-tweet-coll [tokenizer-fn tweet-coll &optional [substitutions []]]
(defn mapper [t]
(tokenize-tweet tokenizer-fn t substitutions))
(list (map mapper tweet-coll)))
(defn create-NLTK-tweet-tokenizer [tweet]
(import [nltk.tokenize [TweetTokenizer]])
(defn create-tokenizer []
(TweetTokenizer :preserve-case True
:strip-handles False
:reduce-len True))
(.tokenize (create-tokenizer) tweet))
(defn create-NLTK-char-tokenizer [tweet]
  ;; NOTE: CharTokenizer is not defined or imported anywhere in this file;
  ;; this is legacy code kept for reference only.
  (.tokenize (CharTokenizer) (.lower tweet)))
(defn tokenize-coll-as-NLTK-tweet [tweet-coll &optional [substitutions []]]
(tokenize-tweet-coll create-NLTK-tweet-tokenizer tweet-coll substitutions))
(defn get-url-substitution []
["https?:\/\/t.co\/[a-zA-Z0-9\-\.]{8}" "https://t.co/"])
(defn get-handle-substitution []
["(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)" "@TwitterHandle"])
(defn get-all-substitutions []
[(get-url-substitution) (get-handle-substitution)])
;; (defn tokenize-file [filename &optional [normalize-urls True] [normalize-handles True]]
;; "By default, will normalize URL's and Usernames in a tweet."
;; (setv replacements [(if normalize-urls (get-url-substitution))
;; (if normalize-handles (get-handle-substitution))])
;; (tokenize-coll-as-NLTK-tweet (-read-file filename) replacements))
;; (defn tokenize-tweet-default [tweet]
;; (tokenize-tweet create-NLTK-tweet-tokenizer tweet (get-all-substitutions)))
;; (defn tokenize-tweet-char [tweet]
;; (tokenize-tweet create-NLTK-char-tokenizer tweet (get-all-substitutions)))
(import [api_key [*]])
(import tweepy)
(import [extensions [*]])
(import [user-lists [*]])
(import pickle)
(import os.path)
(require [extensions [*]])
(require [hy.contrib.loop [*]])
;;;;;;;;;;;;;;;;;;;;;;
;; Global Variables ;;
;;;;;;;;;;;;;;;;;;;;;;
(def tweet-dir "tweets/journalists/")
(def handles journalist-twitter-accounts)
;;;;;;;;;;;;;;;
;; Functions ;;
;;;;;;;;;;;;;;;
(defn -create-pickled-tweets-name [handle]
(defn -make-path [directory filename extension]
(->> (+ directory filename extension)
(.abspath (. os path))))
(-make-path tweet-dir handle ".pickle"))
(defn -get-api []
"Gets an api object with this application's twitter keys"
(setv authenticator (.OAuthHandler tweepy (:ConsumerKey secret-keys)
(:ConsumerSecret secret-keys)))
(setv authenticator.secure True)
(.set_access_token authenticator (:AccessToken secret-keys)
(:AccessTokenSecret secret-keys))
(.API tweepy
:auth_handler authenticator
:wait_on_rate_limit True
:wait_on_rate_limit_notify True))
(defn -extract-tweets-from-status [tweets]
"Given a list of tweepy.status objects, extracts tweet texts"
(->> tweets
(map (fn [t] (.encode t.text "utf-8")))
(list)))
(defn -get-cursor [handle]
"Gets a cursor that iterates of all the tweets posted by screen-name"
(->> (.Cursor tweepy (. (-get-api) user_timeline)
:screen_name handle
:include_rts False)
(.items)))
(defn -get-tweets [handle]
"Uses a cursor with a limit handler to scrape tweets"
(loop [[tweets []] [cursor (-get-cursor handle)]]
(try (recur (cons (next cursor) tweets) cursor)
(except [e StopIteration]
(-extract-tweets-from-status tweets)))))
(defn -dump-tweets-to-file [tweets filename]
"Serializes a tweet collection in a file."
(-> (->> tweets
(map (fn [t] (.decode t "utf-8")))
(list))
(spit filename)))
(defn handle-exists? [handle]
(try
(do (.get-user (-get-api) handle)
True)
(except [tweepy.TweepError]
False)))
(defn scrape-and-pickle [handle &optional [redownload? False]]
"Scrapes and pickles data for a twitter handle."
(setv filename (-create-pickled-tweets-name handle))
(if (and (handle-exists? handle)
(or redownload?
(not (file-exists? filename))))
(do
(print (+ "Downloading tweets for " handle))
(setv tweets (-get-tweets handle))
(-dump-tweets-to-file tweets filename))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Mass Tweet Downloading Functions ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(defn scrape-and-pickle-handles [handle-list &optional [redownload? False]]
"Scrapes and pickles data for a list of twitter handles"
(->> (map (fn [h] (scrape-and-pickle h redownload?)) handle-list)
(list)))
(defmain [&rest args]
(print "Downloading a lot of data, please wait . . .")
(scrape-and-pickle-handles handles (-> (second args) (str) (.lower) (= "true"))))
The pickle files in here are simply lists of plain tweets; nothing in them has been tokenized or turned into a Markov chain yet.
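For reference, a pickled tweet list can be read back with the =slurp= helper from extensions.hy (the handle in the path below is just a placeholder):
#+BEGIN_SRC hy
(import [extensions [slurp]])
(setv tweets (slurp "tweets/journalists/example_handle.pickle"))
(print (first tweets))  ;; each entry is a plain tweet string
#+END_SRC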