# topic_model.py
import logging
import warnings

import spacy

from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.phrases import Phrases, Phraser
from gensim.models.coherencemodel import CoherenceModel

from nltk.corpus import stopwords  # stopword lists from nltk

warnings.filterwarnings("ignore")

LOG_LEVEL = logging.INFO
LOG_FORMAT = '%(asctime)s --- %(levelname)5s: [%(filename)s:%(lineno)d] %(message)s'

ALLOWED_POS_TAGS = ['NOUN', 'ADJ', 'VERB', 'ADV']
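
# Phrase-detection options: keep n-grams seen at least 5 times, with a low
# score threshold so phrase formation is permissive. gensim 3.x expects a
# bytes delimiter; detected phrases are joined with a plain space here.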
N_GRAM_OPTIONS = dict(min_count=5, threshold=2, delimiter=b' ')
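
# LDA options: update_every=0 selects batch (rather than online) learning;
# alpha='auto' lets the model learn an asymmetric document-topic prior.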
LDA_OPTIONS = dict(random_state=100, update_every=0, chunksize=100,
                   passes=10, alpha='auto', per_word_topics=True)

class TopicModel:

    # Initialise the topic model class.
    # documents_dataframe -> a pandas dataframe of the corpus
    # min_topic_count     -> the smallest number of topics to search for while optimising for the number of topics
    # max_topic_count     -> the maximum number of topics to search for while optimising for the number of topics
    def __init__(self, documents_dataframe, min_topic_count=1, max_topic_count=20, extra_stopwords=None):

        self._documents_dataframe = documents_dataframe
        self._min_topic_count = min_topic_count
        self._max_topic_count = max_topic_count

        logging.basicConfig(format=LOG_FORMAT, level=LOG_LEVEL)

        self._logging = logging.getLogger(str(self.__class__))

        self._logging.info("Setting up stopwords.")
        self._logging.info("Adding extra stopwords: {0}".format(extra_stopwords))

        self._stopwords = stopwords.words('english')
        self._stopwords.extend(extra_stopwords or [])

        # The 'en' shortcut was removed in spaCy 3; load the model package directly.
        self._nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

        # The corpus: a list of documents, each a list of token strings.
        self._corpus = []


    def pre_process_corpus(self):
        """ Pre-process each document in the corpus: tokenise, remove
            stopwords, merge detected collocations, then lemmatise.
        """
        self._logging.info("Tokenising and removing stopwords.")

        tokenised_corpus = [self._tokenise(doc) for doc in self._documents_dataframe.values]

        self._build_ngram_models(tokenised_corpus)

        for tokens in tokenised_corpus:
            self._corpus.append(self._lemmatise(self._trigrams(self._remove_stopwords(tokens))))

    def corpus(self):
        """ Return the pre-processed corpus.
            return -- [[String]]
        """
        return self._corpus


    # Tokenise a single document
    def _tokenise(self, doc):
        """ Tokenise a single document.
            doc -- String
            return -- [String]
        """
        return simple_preprocess(str(doc), deacc=True)
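    # For illustration: simple_preprocess("The CATS sat!") yields
    # ["the", "cats", "sat"]; stopword removal happens in a later step.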


    # Remove stopwords from a single document
    # doc -> the document as a list of tokens
    def _remove_stopwords(self, doc):
        """ Remove stopwords from a single document.
            doc -- [String]
            return -- [String]
        """
        return [word for word in doc if word not in self._stopwords]


    def _lemmatise(self, doc):
        """ Lemmatise all words in a single document.
            doc -- [String]
            return -- [String]
        """
        doc_ = self._nlp(" ".join(doc))
        return [token.lemma_ for token in doc_ if token.pos_ in ALLOWED_POS_TAGS]
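    # For illustration (output varies with the loaded spaCy model): a document
    # like ["bats", "were", "hanging"] might come back as ["bat", "be", "hang"],
    # with determiners and other excluded POS tags dropped.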


    def _trigrams(self, doc):
        """ Apply the bigram and trigram models to a single document.
            Detected collocations are merged into single tokens, joined by
            the delimiter configured in N_GRAM_OPTIONS; with an underscore
            delimiter, "new york city" would become "new_york_city".
            doc -- [String]
            return -- [String]
        """
        return self._trigram_model[self._bigram_model[doc]]


    def _build_ngram_models(self, tokenised_corpus):
        bigram_phrases = Phrases(tokenised_corpus, **N_GRAM_OPTIONS)
        self._bigram_model = Phraser(bigram_phrases)

        trigram_phrases = Phrases(bigram_phrases[tokenised_corpus], **N_GRAM_OPTIONS)
        self._trigram_model = Phraser(trigram_phrases)
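
    # For illustration: once trained, applying the models merges detected
    # collocations, e.g. ["new", "york", "in", "winter"] might become
    # ["new york", "in", "winter"] (joined with the b' ' delimiter above),
    # assuming "new york" co-occurs often enough to clear the threshold.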


    def run_lda(self):
        self._build_dictionary()
        self._build_bow()
        self._build_model()


    def _build_bow(self):
        logging.info("Building the BOW representation of the corpus")
        self._bow = [self._dictionary.doc2bow(doc) for doc in self._corpus]


    def _build_dictionary(self):
        self._dictionary = Dictionary(self._corpus)
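        # Keep only tokens appearing in at least 4 documents and in no more
        # than 40% of them; compactify() reassigns ids to close the gaps.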
        self._dictionary.filter_extremes(no_below=4, no_above=0.4)
        self._dictionary.compactify()


    def _topics_generator(self):
        self._logging.debug("Topic search range: {0}..{1}".format(self._min_topic_count, self._max_topic_count))
        for n in range(self._min_topic_count, self._max_topic_count + 1):
            yield (n, self._max_topic_count)


    def _suppress_logging(self):
        self._logging.setLevel(logging.WARN)

    def _reenable_logging(self):
        self._logging.setLevel(LOG_LEVEL)


    def _build_model(self):
        models = {}

        for topic_number, max_topic in self._topics_generator():
            self._logging.info("Building the {0}-topic model (searching up to {1} topics).".format(topic_number, max_topic))
            self._suppress_logging()
            models[topic_number] = LdaModel(corpus=self._bow, id2word=self._dictionary,
                                            num_topics=topic_number, **LDA_OPTIONS)
            self._reenable_logging()

        self._models = models

    def find_maximally_coherent_model(self):
        max_model = None
        max_n = 0
        max_coherence = None

        # Documentation for these parameters: https://radimrehurek.com/gensim/models/coherencemodel.html
        for n in self._models:
            coherence_model = CoherenceModel(model=self._models[n], texts=self._corpus,
                                             dictionary=self._dictionary, coherence='c_v')
            coherence = coherence_model.get_coherence()

            if max_coherence is None or coherence > max_coherence:
                max_coherence = coherence
                max_n = n
                max_model = self._models[n]

        return (max_n, max_model)


    def print_topics(self, topic_id, topn=10, compact=True):
        """ Print the top terms for each topic of the model keyed by
            topic_id (i.e. the model trained with that many topics).
        """
        if compact:
            for topic in self._models[topic_id].show_topics(num_topics=topic_id, num_words=topn):
                print(topic)
        else:
            for n in range(topic_id):
                self._logging.info("############## topic {0} of {1} ##############".format(n, topic_id))
                self._logging.info("{:20} {}".format(u'term', u'weight') + u'\n')

                for term, weight in self._models[topic_id].show_topic(n, topn=topn):
                    self._logging.info("{:20} {:.3f}".format(term, round(weight, 3)))

                self._logging.info("")