Commit ad4493dd authored by Kathryn Elliott

Start of complete rewrite of topic modelling code

The code has become a complete mess and I'm becoming more and more
confused!

I've started by looking at the functions that are being used and putting
them into a class.
parent 86ccb0cf
from pprint import pprint, pformat
import json
import logging
import warnings

import pandas
import spacy
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases, Phraser
from nltk.corpus import stopwords  # stopwords list from nltk

warnings.filterwarnings("ignore")  # silence everything, including deprecation warnings
LOG_LEVEL = logging.DEBUG
LOG_FORMAT = '%(asctime)s --- %(levelname)5s: [%(filename)s:%(lineno)d] %(message)s'

# Options passed to gensim's Phrases when building the n-gram models.
N_GRAM_OPTIONS = dict(min_count=5, threshold=2, delimiter=b' ')

# Hyper-parameters for the LDA model.
LDA_MODEL_RANDOM_STATE = 100
LDA_MODEL_UPDATE_EVERY = 1
LDA_MODEL_CHUNKSIZE = 100
LDA_MODEL_PASSES = 10
LDA_MODEL_ALPHA = 'auto'
LDA_MODEL_PER_WORD_TOPICS = True

# Only tokens with these part-of-speech tags survive lemmatisation.
ALLOWED_POS_TAGS = ['NOUN', 'ADJ', 'VERB', 'ADV']
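
# The LDA_MODEL_* constants above are not used anywhere in this rewrite yet;
# a sketch of how they would feed gensim's LdaModel (id2word, bow_corpus and
# k are assumed to exist at that point):
#
#   lda = gensim.models.ldamodel.LdaModel(
#       corpus=bow_corpus, id2word=id2word, num_topics=k,
#       random_state=LDA_MODEL_RANDOM_STATE, update_every=LDA_MODEL_UPDATE_EVERY,
#       chunksize=LDA_MODEL_CHUNKSIZE, passes=LDA_MODEL_PASSES,
#       alpha=LDA_MODEL_ALPHA, per_word_topics=LDA_MODEL_PER_WORD_TOPICS)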
class TopicModel:

    def __init__(self, documents_dataframe, min_topic_count=1, max_topic_count=20, extra_stopwords=None):
        """Initialise the topic model class.

        documents_dataframe -- a pandas dataframe of the corpus
        min_topic_count -- the smallest number of topics to search for while optimising for the number of topics
        max_topic_count -- the maximum number of topics to search for while optimising for the number of topics
        extra_stopwords -- extra stopwords to remove, beyond nltk's English stopword list
        """
        logging.basicConfig(format=LOG_FORMAT, level=LOG_LEVEL)
        self._logging = logging.getLogger(str(self.__class__))

        extra_stopwords = extra_stopwords or []
        self._logging.info("Setting up stopwords.")
        self._logging.info("Adding extra stopwords: {0}".format(extra_stopwords))
        self._stopwords = stopwords.words('english')
        self._stopwords.extend(extra_stopwords)

        self._documents_dataframe = documents_dataframe
        self._min_topic_count = min_topic_count
        self._max_topic_count = max_topic_count

        # 'en' is the legacy shortcut; newer spaCy releases need the full
        # model name, e.g. spacy.load('en_core_web_sm').
        self._nlp = spacy.load('en', disable=['parser', 'ner'])

        # The pre-processed corpus: a list of documents, each a list of token strings.
        self._corpus = []
    def pre_process_corpus(self):
        """Pre-process each document in the corpus: tokenise, build the
        n-gram models, remove stopwords, then lemmatise.
        """
        self._logging.info("Tokenising and removing stopwords.")
        tokenised_corpus = [self._tokenise(doc) for doc in self._documents_dataframe.values]
        self._build_ngram_models(tokenised_corpus)

        for tokens in tokenised_corpus:
            self._corpus.append(self._lemmatise(self._remove_stopwords(tokens)))
    def corpus(self):
        """Return the pre-processed corpus.

        return -- [[String]]
        """
        return self._corpus
    def _tokenise(self, doc):
        """Tokenise a single document, lower-casing and stripping accents.

        doc -- String
        return -- [String]
        """
        self._logging.debug("Tokenising: {0}".format(doc))
        return simple_preprocess(str(doc), deacc=True)
    def _remove_stopwords(self, doc):
        """Remove stopwords from a single document.

        doc -- [String]
        return -- [String]
        """
        return [word for word in doc if word not in self._stopwords]
    def _lemmatise(self, doc):
        """Lemmatise all words in a single document, keeping only tokens
        whose part-of-speech tag is in ALLOWED_POS_TAGS.

        doc -- [String]
        return -- [String]
        """
        doc_ = self._nlp(" ".join(doc))
        return [token.lemma_ for token in doc_ if token.pos_ in ALLOWED_POS_TAGS]
    def _trigrams(self, doc):
        """Apply the bi-gram and then tri-gram phrase models to a single
        document. Tokens that form a detected collocation are merged into a
        single token, joined by the delimiter from N_GRAM_OPTIONS; all other
        tokens pass through unchanged.

        doc -- [String]
        return -- [String]
        """
        trigram = self._trigram_model[self._bigram_model[doc]]
        self._logging.debug(pformat(trigram))
        return trigram

    def _build_ngram_models(self, tokenised_corpus):
        """Build the bi-gram and tri-gram phrase models from the tokenised
        corpus. The tri-gram model is trained on the bi-gram-transformed
        corpus, so it can detect phrases of up to three words.
        """
        bigram_phrases = Phrases(tokenised_corpus, **N_GRAM_OPTIONS)
        self._bigram_model = Phraser(bigram_phrases)
        self._trigram_model = Phraser(Phrases(bigram_phrases[tokenised_corpus], **N_GRAM_OPTIONS))
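
# Illustrative usage sketch, not part of the original code: the file name
# "documents.json" and the column "abstract" are hypothetical stand-ins for
# whatever corpus the dataframe holds, as is the extra stopword list.
if __name__ == '__main__':
    documents = pandas.read_json("documents.json")["abstract"]
    model = TopicModel(documents, extra_stopwords=["copyright", "preprint"])
    model.pre_process_corpus()
    pprint(model.corpus()[:2])  # first two pre-processed documents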