Commit ad4493dd authored by Kathryn Elliott's avatar Kathryn Elliott

Start of complete rewrite of topic modelling code

The code has become a complete mess and I'm becoming more and more
frustrated trying to maintain it.
I've started by looking at the functions that are being used and put
them in a class.
parent 86ccb0cf
from pprint import pprint
from sys import exit
import json
import pandas
import spacy
from spacy.tokens import Doc
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases, Phraser
from nltk.corpus import stopwords # stopwords library from nltk
from pprint import pformat
import logging
import warnings
# Silence noisy third-party warnings (gensim/spacy emit many on import).
warnings.filterwarnings("ignore") #, category=DeprecationWarning)

# Log line format: timestamp, padded level, source file:line, message.
LOG_FORMAT = '%(asctime)s --- %(levelname)5s: [%(filename)s:%(lineno)d] %(message)s'

# Options passed to gensim Phrases when building the bi-/tri-gram models:
# min_count -> ignore word pairs seen fewer than this many times,
# threshold -> phrase-score cutoff (lower == more phrases),
# delimiter -> byte string used to join the words of a detected phrase.
N_GRAM_OPTIONS = dict(min_count = 5, threshold = 2, delimiter = b' ')
class TopicModel:
    """Topic model over a corpus held in a pandas dataframe.

    Wraps the pre-processing pipeline (tokenise -> remove stopwords ->
    n-gram merging -> lemmatise) used before fitting a gensim topic model.
    """

    # POS tags kept during lemmatisation.
    # NOTE(review): the original referenced a module-level ALLOWED_POS_TAGS
    # constant that is not visible in this file — confirm the intended set.
    _ALLOWED_POS_TAGS = ('NOUN', 'ADJ', 'VERB', 'ADV')

    def __init__(self, documents_dataframe, min_topic_count=1, max_topic_count=20, extra_stopwords=None):
        """Initialise the topic model.

        documents_dataframe -- pandas dataframe containing the corpus, one document per row
        min_topic_count -- smallest number of topics to try while optimising the topic count
        max_topic_count -- largest number of topics to try while optimising the topic count
        extra_stopwords -- optional iterable of extra stopwords to remove
                           (default None; avoids a shared mutable default)
        """
        # NOTE(review): the original used an undefined LOG_LEVEL constant;
        # defaulting to INFO until that constant is restored.
        logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
        self._logging = logging.getLogger(str(self.__class__))

        self._logging.info("Setting up stopwords.")
        self._stopwords = stopwords.words('english')
        if extra_stopwords:
            self._logging.info("Adding extra stopwords: {0}".format(extra_stopwords))
            # Bug fix: extra_stopwords was accepted (and logged) but never
            # actually merged into the stopword list.
            self._stopwords += list(extra_stopwords)

        self._documents_dataframe = documents_dataframe
        # Bug fix: the topic-count bounds were accepted but never stored.
        self._min_topic_count = min_topic_count
        self._max_topic_count = max_topic_count
        self._nlp = spacy.load('en', disable=['parser', 'ner'])

        # The pre-processed corpus: an Array of documents, each itself an
        # Array of token strings. Populated by pre_process_corpus().
        self._corpus = []
        # n-gram Phraser models, built by _build_ngram_models(). Renamed from
        # self._trigrams, which clobbered the _trigrams() method.
        self._bigram_model = None
        self._trigram_model = None

    def pre_process_corpus(self):
        """Pre-process each document in the corpus into self._corpus."""
        self._logging.info("Tokenising and removing stopwords.")
        tokenised_corpus = [self._tokenise(doc) for doc in self._documents_dataframe.values]
        self._build_ngram_models(tokenised_corpus)
        # NOTE(review): the original loop body was lost in the paste; this
        # ordering (stopwords -> n-grams -> lemmatise) is reconstructed from
        # the helper methods below — confirm against the original intent.
        for tokens in tokenised_corpus:
            tokens = self._remove_stopwords(tokens)
            tokens = self._trigrams(tokens)
            tokens = self._lemmatise(tokens)
            self._corpus.append(tokens)

    def corpus(self):
        """Return the pre-processed corpus.

        return -- [[String]]
        """
        return self._corpus

    # Tokenise a single document.
    def _tokenise(self, doc):
        """Tokenise a single document.

        doc -- String
        return -- [String]
        """
        # Removed leftover debug pprint(str(doc)).
        return simple_preprocess(str(doc), deacc=True)

    # Remove stopwords from a single document.
    def _remove_stopwords(self, doc):
        """Remove stopwords from a single document.

        doc -- [String]
        return -- [String]
        """
        return [word for word in doc if word not in self._stopwords]

    def _lemmatise(self, doc):
        """Lemmatise all words in a single document, keeping only allowed POS tags.

        doc -- [String]
        return -- [String]
        """
        spacy_doc = self._nlp(" ".join(doc))
        return [token.lemma_ for token in spacy_doc if token.pos_ in self._ALLOWED_POS_TAGS]

    def _trigrams(self, doc):
        """Generate tri-grams for a document. Detected n-grams are merged
        into single tokens joined by the N_GRAM_OPTIONS delimiter.

        doc -- [String]
        return -- [String]
        """
        # Bug fix: the original indexed self._trigrams[self._bigrams[doc]],
        # but self._bigrams was never assigned, and assigning the Phraser to
        # self._trigrams shadowed this very method.
        return self._trigram_model[self._bigram_model[doc]]

    def _build_ngram_models(self, tokenised_corpus):
        """Build and store the bigram and trigram Phraser models.

        tokenised_corpus -- [[String]]
        """
        bigram_phrases = Phrases(tokenised_corpus, **N_GRAM_OPTIONS)
        # Bug fix: the bigram model was bound to a throwaway local.
        self._bigram_model = Phraser(bigram_phrases)
        self._trigram_model = Phraser(Phrases(bigram_phrases[tokenised_corpus], **N_GRAM_OPTIONS))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment