Commit a5ef3072 authored by Kathryn Elliott

Move the function that reads the input data to the TopicModel class

This was originally done so that the filename can be saved to the
metafile.
parent e22a6a50
......@@ -29,15 +29,8 @@ parser.set_defaults(trigrams=False)
parser.set_defaults(log_to_console=False)
args = vars(parser.parse_args())
filename = args.pop('filename')
def load_data(dataset_file):
    """ Load the dataset from a JSON source.

    dataset_file -- passed unchanged to pandas.read_json
    return -- the object produced by pandas.read_json
    """
    dataset = pandas.read_json(dataset_file)
    return dataset
topic_model = TopicModel(load_data(filename), **args)
topic_model = TopicModel(args.pop('filename'), **args)
topic_model.run_lda()
print("")
......
......@@ -48,20 +48,20 @@ LDA_OPTIONS = dict(random_state=100, update_every=0, chunksize=2000,
class TopicModel:
# Initialise the topic model class.
# data -> the corpus in json format
# min_topic_count -> the smallest number of topics to search for while optimising for the number of topics
# max_topic_count -> the maximum number of topics to search for while optimising for the number of topics
# topic_step -> the step in topic range, defaults to 1
#
# filename -> The name of the JSON containing the corpus data
# min_topic_count -> The smallest number of topics to search for while optimising for the number of topics
# max_topic_count -> The maximum number of topics to search for while optimising for the number of topics
# topic_step -> The step in topic range, defaults to 1
# pos_tags -> Parts of Speech to use for processing. Anything not in this list will be ignored
# tfidf -> Optionally process the corpus using tfidf
# trigrams -> Optionally generate tri-grams
# log_level -> The level at which to log
def __init__(self, data, min_topic_count=1, max_topic_count=20, topic_step=1, tfidf=True, pos_tags=DEFAULT_POS_TAGS,
def __init__(self, filename, min_topic_count=1, max_topic_count=20, topic_step=1, tfidf=True, pos_tags=DEFAULT_POS_TAGS,
log_to_console=True, log_level=logging.DEBUG, trigrams=False):
self._json = data
self._json = self.load_data(filename)
self._min_topic_count = min_topic_count
self._max_topic_count = max_topic_count
self._topic_step = topic_step
......@@ -74,7 +74,7 @@ class TopicModel:
self._persistance = TopicPersistance(base_dir = "/tmp/LDA/{0}".format(unique_run_id))
self._persistance.save_params(dict(min_topic_count=min_topic_count, max_topic_count=max_topic_count,
self._persistance.save_params(dict(input_filename=filename, min_topic_count=min_topic_count, max_topic_count=max_topic_count,
topic_step=topic_step, tfidf=tfidf, trigrams=trigrams, pos_tags=pos_tags,
timestamp=current_time.isoformat(), git=self.repo_info()))
......@@ -129,6 +129,10 @@ class TopicModel:
self._corpus_size = len(tokenised_corpus)
def load_data(self, filename):
    """ Read the corpus data from a JSON source.

    filename -- passed unchanged to pandas.read_json
    return -- the object produced by pandas.read_json
    """
    corpus_data = pandas.read_json(filename)
    return corpus_data
def corpus(self):
""" Return the corpus.
return -- [[String]]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment