Commit 7553ff45 authored by Kathryn Elliott

Sort out n-grams and tf-idf.

parent 3a58241e
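
In rough terms, this commit wires optional bi-/tri-gram detection (gensim Phrases/Phraser) and optional TF-IDF re-weighting of the bag-of-words corpus into the LDA pipeline. A minimal sketch of how those pieces typically fit together follows; the document list `docs`, the flags `use_trigrams` and `use_tfidf`, and the phrase-model settings are illustrative stand-ins, not the repository's actual attributes or N_GRAM_OPTIONS.

    # Sketch only: illustrative names, not the TopicModel class from this repo.
    from gensim.corpora import Dictionary
    from gensim.models.tfidfmodel import TfidfModel
    from gensim.models.ldamodel import LdaModel
    from gensim.models.phrases import Phrases, Phraser

    docs = [["store", "staff", "were", "very", "helpful"],
            ["online", "order", "arrived", "late", "again"]]

    use_trigrams = True
    use_tfidf = True

    if use_trigrams:
        # Two passes of Phrases: the first joins bi-grams, the second joins
        # bi-gram pairs into tri-grams; Phraser freezes the model for fast lookup.
        bigram = Phraser(Phrases(docs, min_count=1, threshold=1))
        trigram = Phraser(Phrases(bigram[docs], min_count=1, threshold=1))
        docs = [trigram[bigram[doc]] for doc in docs]

    # Bag-of-words representation: token id -> count per document.
    dictionary = Dictionary(docs)
    bow = [dictionary.doc2bow(doc) for doc in docs]

    if use_tfidf:
        # Re-weight the raw counts before handing the corpus to LDA.
        tfidf = TfidfModel(bow, id2word=dictionary)
        corpus = tfidf[bow]
    else:
        corpus = bow

    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)
    print(lda.print_topics())

Feeding TF-IDF weights into LdaModel is a design choice rather than a requirement (LDA is usually trained on raw counts), which is presumably why the commit keeps both the tri-gram and TF-IDF steps behind flags.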
@@ -11,11 +11,12 @@ from spacy.tokens import Doc
import gensim
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.phrases import Phrases, Phraser
from gensim.models.coherencemodel import CoherenceModel
from nltk.corpus import stopwords # stopwords library from nltk
import pyLDAvis
@@ -84,7 +85,6 @@ class TopicModel:
self._logging = logging.getLogger(str(self.__class__))
self._logging.info("Setting up stopwords.")
self._logging.info("Adding extra stopwords: {0}".format(extra_stopwords))
self._stopwords = stopwords.words('english')
self._stopwords.extend(['woolworths', 'coles'])
@@ -93,19 +93,34 @@ class TopicModel:
# This is an Array of documents (which themselves are Arrays).
self._corpus = []
self._corpus_size = -1
def pre_process_corpus(self):
""" Pre-process each document in the corpus
"""
self._logging.info("Tokenising and removing stopwords.")
self._logging.info("Tokenising corpus.")
tokenised_corpus = [self._tokenise(doc) for doc in self._json.values]
self._logging.info("Loaded {0} documents.".format(len(tokenised_corpus)))
self._logging.info("Removing stopwords, lemmatising & building the corpus")
self._logging.info("Keeping lemmatised POS tags: {0}".format(str(self._pos_tags)))
tokenised_corpus = [self._tokenise(doc) for doc in self._documents_dataframe.values]
for doc_tokens in tokenised_corpus:
self._corpus.append(self._lemmatise(self._remove_stopwords(doc_tokens)))
self._build_ngram_models(tokenised_corpus)
if self._trigrams:
trigrams = self._build_ngrams(self._corpus)
for tokens in tokenised_corpus:
self._corpus.append(self._lemmatise(self._remove_stopwords(tokens)))
self._logging.info("Appending tri-grams to the dictionary as specified.")
for n in range(len(self._corpus)):
for token in trigrams[self._corpus[n]]:
self._corpus[n].append(token)
else:
self._logging.info("Skipping tri-gram processing.")
self._corpus_size = len(tokenised_corpus)
def corpus(self):
@@ -140,7 +155,7 @@ class TopicModel:
return -- [String]
"""
doc_ = self._nlp(" ".join(doc))
return [token.lemma_ for token in doc_ if token.pos_ in ALLOWED_POS_TAGS]
return [token.lemma_ for token in doc_ if token.pos_ in self._pos_tags]
def _trigrams(self, doc):
@@ -155,51 +170,72 @@ class TopicModel:
return trigram
def _build_ngram_models(self, tokenised_corpus):
# Generates tri-grams from the tokenised corpus
def _build_ngrams(self, tokenised_corpus):
self._logging.info("Building bi-grams")
bigram_phrases = Phrases(tokenised_corpus, **N_GRAM_OPTIONS)
bigram_model = Phraser(bigram_phrases)
self._trigrams = Phraser(Phrases(bigram_phrases[tokenised_corpus], **N_GRAM_OPTIONS))
self._logging.info("Building tri-grams")
return Phraser(Phrases(bigram_phrases[tokenised_corpus], **N_GRAM_OPTIONS))
def run_lda(self):
self.pre_process_corpus()
self._build_dictionary()
self._build_bow()
self._build_model()
self._build_tfidf()
self._build_models()
def _build_bow(self): # bow (bag of words) dictionary of number of words and how many times those words appear
logging.info("Building the BOW representation of the corpus")
# BOW (bag of words) dictionary of number of words and how many times those words appear
def _build_bow(self):
self._logging.info("Building the BOW representation of the corpus")
self._bow = [self._dictionary.doc2bow(doc) for doc in self._corpus]
def _build_dictionary(self):
def _build_tfidf(self):
if self._tfidf:
self._logging.info("Building TF-IDF model as specified.")
self._tfidf_corpus = TfidfModel(self._bow, id2word=self._dictionary)
self._weighted_corpus = self._tfidf_corpus[self._bow]
else:
self._logging.info("Skipping TF-IDF processing.")
self._weighted_corpus = self._bow
def _build_dictionary(self, tfidf=True, compactify=True):
self._logging.info("Building Dictionary.")
self._dictionary = Dictionary(self._corpus)
self._dictionary.filter_extremes(no_below=4, no_above=0.4)
self._dictionary.compactify()
self._logging.info("Dictionary Stats - num of documents: {0}".format(self._dictionary.num_docs))
self._logging.info("Dictionary Stats - num of tokens: {0}".format(self._dictionary.num_pos))
if compactify:
# Ignore words that appear in less than 50 documents or more than 20% documents
# self._dictionary.filter_extremes(no_below=50, no_above=0.2)
self._dictionary.compactify()
def _topics_generator(self):
self._logging.debug("Topic range: {0} to {1}".format(self._min_topic_count, self._max_topic_count))
for n in range (self._min_topic_count, self._max_topic_count):
for n in range (self._min_topic_count, self._max_topic_count, self._topic_step):
yield (n, self._max_topic_count)
def _suppress_logging(self, suppress=True):
self._logging.setLevel(logging.WARN if suppress else logging.INFO)
def _reenable_logging(self):
self._logging.setLevel(logging.INFO)
def _build_model(self, quiet=True):
def _build_models(self, quiet=True):
models = {}
for topic_number, max_topic in self._topics_generator():
logging.info("Building topic {0} of {1}.".format(topic_number, max_topic))
self._suppress_logging(True)
models[topic_number] = LdaModel(corpus=self._bow, id2word=self._dictionary, num_topics=topic_number, **LDA_OPTIONS)
self._reenable_logging()
self._logging.info("Building topic {0} of {1}.".format(topic_number, max_topic))
models[topic_number] = LdaModel(corpus=self._weighted_corpus, id2word=self._dictionary, num_topics=topic_number, **LDA_OPTIONS)
self._models = models