Commit 86ccb0cf authored by Kathryn Elliott

New script to oversee topic modelling

* Imports the topic model
* Removes stopwords
* Loads the dataset
* Initiates the pre-processing
* Runs the LDA topic modelling
parent 638a750e
#!/usr/bin/env python3
import sys
import json
import pandas
from pprint import pprint
from topic_model import TopicModel
# Additional stopwords handed to TopicModel through its `extra_stopwords`
# keyword argument when the model is constructed below.
extra_stopwords = ['would', 'be', 're', 'edu', 'use', 'get', 'not', 'do', 'could', 'go']
def load_data(dataset_file):
    """Read a JSON dataset with pandas.

    Args:
        dataset_file: Anything ``pandas.read_json`` accepts as its first
            argument, e.g. a filesystem path or a file-like object.

    Returns:
        The object produced by ``pandas.read_json`` with default options
        (a ``DataFrame``).
    """
    return pandas.read_json(dataset_file)
# Script entry point: the dataset path is required as the first argument.
if len(sys.argv) < 2:
    # Passing a string to sys.exit writes it to stderr and exits with
    # status 1, so a missing argument is reported as a failure rather
    # than a successful run.
    sys.exit("usage: " + sys.argv[0] + " <datafile>")

file = sys.argv[1]

# Build the topic model from the loaded dataset, then run pre-processing
# followed by LDA.
tm = TopicModel(
    load_data(file),
    min_topic_count=8,
    max_topic_count=20,
    extra_stopwords=extra_stopwords,
)
tm.pre_process_corpus()
tm.run_lda()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment