Commit d67968e6 authored by Kathryn Elliott

Added code to take words I don't want removed back out of the stopword list

parent f0a6306c
......@@ -8,8 +8,8 @@ from pprint import pprint
from topic_model import TopicModel
extra_stopwords = ['would', 'be', 're', 'edu', 'use', 'get', 'not', 'do', 'could', 'go']
extra_stopwords = ['would', 'be', 're', 'edu', 'use', 'get', 'say', 'do', 'could']
extra_non_stopwords = ['to', 'go', 'not']
def load_data(dataset_file):
    """Load the dataset stored as JSON in *dataset_file*.

    Args:
        dataset_file: Path (or any other input accepted by
            ``pandas.read_json``) identifying the JSON dataset.

    Returns:
        The object produced by ``pandas.read_json`` for that input; with the
        default options used here this is a ``pandas.DataFrame``.
    """
    return pandas.read_json(dataset_file)
......@@ -19,7 +19,7 @@ if (len(sys.argv) < 2):
file = sys.argv[1]
topic_model = TopicModel(load_data(file), min_topic_count=8, max_topic_count=20, extra_stopwords=extra_stopwords)
topic_model = TopicModel(load_data(file), min_topic_count=8, max_topic_count=20, extra_stopwords=extra_stopwords, extra_non_stopwords=extra_non_stopwords)
topic_model.pre_process_corpus()
topic_model.run_lda()
......
......@@ -39,7 +39,7 @@ class TopicModel:
# documents_dataframe -> a pandas dataframe of the corpus
# min_topic_count -> the smallest number of topics to search for while optimising for the number of topics
# max_topic_count -> the maximum number of topics to search for while optimising for the number of topics
def __init__(self, documents_dataframe, min_topic_count=1, max_topic_count=20, extra_stopwords=[]):
def __init__(self, documents_dataframe, min_topic_count=1, max_topic_count=20, extra_stopwords=[], extra_non_stopwords=[]):
self._documents_dataframe = documents_dataframe
self._min_topic_count = min_topic_count
......@@ -54,6 +54,9 @@ class TopicModel:
self._stopwords = stopwords.words('english')
self._stopwords.extend(extra_stopwords)
for word in extra_non_stopwords:
if word in self._stopwords:
self._stopwords.remove(word)
self._nlp = spacy.load('en', disable=['parser', 'ner'])
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment