Commit 77de71d1 authored by Kathryn Elliott

Add more methods to the TopicModel class

* Importing correct gensim dictionaries and models
* Removing LDA Model constants
* Changing logging from debug to info
* Adding n-gram variables
* Adding instance attributes for min topic count & max topic count
* Adding some comments
parent 6d09386c
@@ -9,9 +9,12 @@ import spacy
from spacy.tokens import Doc
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.phrases import Phrases, Phraser
from gensim.models.coherencemodel import CoherenceModel
from nltk.corpus import stopwords # stopwords library from nltk
@@ -22,19 +25,13 @@ import warnings
warnings.filterwarnings("ignore") #, category=DeprecationWarning)
LOG_LEVEL = logging.DEBUG
LOG_LEVEL = logging.INFO
LOG_FORMAT = '%(asctime)s --- %(levelname)5s: [%(filename)s:%(lineno)d] %(message)s'
N_GRAM_OPTIONS = dict(min_count = 5, threshold = 2, delimiter = b' ')
LDA_MODEL_RANDOM_STATE=100
LDA_MODEL_UPDATE_EVERY=1
LDA_MODEL_CHUNKSIZE=100
LDA_MODEL_PASSES=10
LDA_MODEL_ALPHA='auto'
LDA_MODEL_PER_WORD_TOPICS=True
ALLOWED_POS_TAGS = ['NOUN', 'ADJ', 'VERB', 'ADV']
N_GRAM_OPTIONS = dict(min_count=5, threshold=2, delimiter=b' ', progress_per=100000)
LDA_OPTIONS = dict(random_state=100, update_every=0, chunksize=100,
passes=10, alpha='auto', per_word_topics=True)
class TopicModel:
@@ -44,6 +41,10 @@ class TopicModel:
# max_topic_count -> the maximum number of topics to search for while optimising for the number of topics
def __init__(self, documents_dataframe, min_topic_count=1, max_topic_count=20, extra_stopwords=[]):
self._documents_dataframe = documents_dataframe
self._min_topic_count = min_topic_count
self._max_topic_count = max_topic_count
logging.basicConfig(format=LOG_FORMAT, level=LOG_LEVEL)
self._logging = logging.getLogger(str(self.__class__))
@@ -54,8 +55,6 @@ class TopicModel:
self._stopwords = stopwords.words('english')
self._stopwords.extend(extra_stopwords)
self._documents_dataframe = documents_dataframe
self._nlp = spacy.load('en', disable=['parser', 'ner'])
# This is an Array of documents (which themselves are Arrays).
@@ -63,7 +62,8 @@ class TopicModel:
def pre_process_corpus(self):
""" Pre-process each document in the corpus """
""" Pre-process each document in the corpus
"""
self._logging.info("Tokenising and removing stopwords.")
tokenised_corpus = [self._tokenise(doc) for doc in self._documents_dataframe.values]
@@ -75,9 +75,9 @@ class TopicModel:
def corpus(self):
''' Return the corpus.
""" Return the corpus.
return -- [[String]]
'''
"""
return self._corpus
@@ -87,8 +87,6 @@ class TopicModel:
doc -- String
return -- [String]
"""
pprint (str(doc))
print("--------------")
return simple_preprocess(str(doc), deacc=True)
@@ -120,7 +118,6 @@ class TopicModel:
return -- [String]
"""
trigram = self._trigrams[self._bigrams[doc]]
pprint(trigram)
return trigram
@@ -131,3 +128,76 @@ class TopicModel:
self._trigrams = Phraser(Phrases(bigram_phrases[tokenised_corpus], **N_GRAM_OPTIONS))
def run_lda(self):
self._build_dictionary()
self._build_bow()
self._build_model()
def _build_bow(self):
logging.info("Building the BOW representation of the corpus")
self._bow = [self._dictionary.doc2bow(doc) for doc in self._corpus]
def _build_dictionary(self):
self._dictionary = Dictionary(self._corpus)
self._dictionary.filter_extremes(no_below=4, no_above=0.4)
self._dictionary.compactify()
def _topics_generator(self):
self._logging.debug("Searching for between {0} and {1} topics.".format(self._min_topic_count, self._max_topic_count))
for n in range(self._min_topic_count, self._max_topic_count + 1):
yield (n, self._max_topic_count)
def _suppress_logging(self, suppress=True):
self._logging.setLevel(logging.WARN if suppress else LOG_LEVEL)
def _reenable_logging(self):
self._logging.setLevel(LOG_LEVEL)
def _build_model(self, quiet=True):
models = {}
for topic_number, max_topic in self._topics_generator():
logging.info("Building topic {0} of {1}.".format(topic_number, max_topic))
self._suppress_logging(quiet)
models[topic_number] = LdaModel(corpus=self._bow, id2word=self._dictionary, num_topics=topic_number, **LDA_OPTIONS)
self._reenable_logging()
self._models = models
def find_maximally_coherent_model(self):
max_model = None
max_n = 0
max_coherence_model = None
# Documentation for these parameters: https://radimrehurek.com/gensim/models/coherencemodel.html
for n in self._models:
coherence_model = CoherenceModel(model=self._models[n], texts=self._corpus, dictionary=self._dictionary, coherence='c_v')
if max_coherence_model is None or coherence_model.get_coherence() > max_coherence_model.get_coherence():
max_coherence_model = coherence_model
max_n = n
max_model = self._models[n]
return (max_n, max_model)
def print_topics(self, topic_id, topn=10, compact=True):
""" Print the top terms for each topic.
"""
if compact:
for topic in self._models[topic_id].show_topics(num_topics=topic_id, num_words=topn):
print(topic)
else:
for n in range(topic_id):
logging.info("############## {0} ({1}) ##############".format(topic_id, n))
logging.info("{:20} {}".format(u'term', u'frequency') + u'\n')
for term, frequency in self._models[topic_id].show_topic(n, topn):
logging.info("{:20} {:.3f}".format(term, round(frequency, 3)))
logging.info("")