Commit 3a58241e authored by Kathryn Elliott's avatar Kathryn Elliott

Add new parameters and reorganise the argument parsing.

parent 037503ec
......@@ -11,35 +11,39 @@ from pprint import pprint
from topic_model import TopicModel
# Configure logging first so messages emitted during argument handling are captured.
logging.basicConfig(level=logging.DEBUG, format='%(relativeCreated)6d %(threadName)s %(message)s')

# Command-line interface: one data file plus the topic-model tuning options.
# (The old `filenames`/`--min`/`--max` parser and its early parse_args() call were
# merge residue: parsing before all options are registered rejects the new flags,
# and registering conflicting option strings raises argparse.ArgumentError.)
parser = argparse.ArgumentParser(description='Process some files.')
parser.add_argument('filename', metavar='filename', help='Data file to process', type=str)
parser.add_argument('--min-topic-count', metavar='N', default=5, help='Minimum topic count', type=int)
parser.add_argument('--max-topic-count', metavar='N', default=20, help='Maximum topic count', type=int)
parser.add_argument('--topic-step', metavar='N', default=1, help='The topic step. For example, a step of 2 would give 1, 3, 5', type=int)
parser.add_argument('--pos-tags', metavar='[POS tags]', default=None, help='The Part Of Speech tags to extract from the text', nargs='*')
parser.add_argument('--log-level', metavar='STRING', default="info", help='The log level (debug, info, etc)')
parser.add_argument('--log-to-console', dest='log_to_console', action='store_true', help='Write log messages to the console, rather than a log file')
parser.add_argument('--tfidf', dest='tfidf', action='store_true', help='Process the term document matrix using tf-idf')
parser.add_argument('--trigrams', dest='trigrams', action='store_true', help='Generate tri-grams')
# NOTE: store_true actions already default to False, so no set_defaults() is needed.

# Convert the namespace to a dict so the options can be splatted into TopicModel(**args);
# `filename` is popped out because it is consumed by load_data(), not the model.
args = vars(parser.parse_args())
filename = args.pop('filename')
def load_data(dataset_file):
    """Load the corpus into a pandas DataFrame.

    dataset_file -- a path or file-like object containing the corpus as JSON
                    (anything accepted by pandas.read_json).
    """
    return pandas.read_json(dataset_file)
# Build the model from the parsed CLI options and run the full pipeline.
# (The old sys.argv usage-check comments and the stale duplicate
# TopicModel(load_data(file), ...) call were merge residue: `file`,
# `extra_stopwords` and `extra_non_stopwords` are undefined here, and the
# old keyword arguments no longer match the TopicModel signature.)
topic_model = TopicModel(load_data(filename), **args)

topic_model.pre_process_corpus()
topic_model.run_lda()

# Blank lines to visually separate the LDA output from the summary below.
print("")
print("")
print("")

(n, model) = topic_model.find_maximally_coherent_model()
print("########################################")
......
......@@ -43,14 +43,41 @@ LDA_OPTIONS = dict(random_state=100, update_every=0, chunksize=2000,
class TopicModel:
# Initialise the topic model.
#   data            -> the corpus in json format
#   min_topic_count -> the smallest number of topics to search for while optimising for the number of topics
#   max_topic_count -> the maximum number of topics to search for while optimising for the number of topics
#   topic_step      -> the step in topic range, defaults to 1
#   pos_tags        -> Parts of Speech to be used for processing. Anything not in this list will be ignored
#   tfidf           -> Optionally process the corpus using tf-idf
#   trigrams        -> Optionally generate tri-grams
#   log_to_console  -> Write log messages to the console rather than a per-run log file
#   log_level       -> The level at which to log
def __init__(self, data, min_topic_count=1, max_topic_count=20, topic_step=1, tfidf=True, pos_tags=DEFAULT_POS_TAGS,
             log_to_console=True, log_level=logging.DEBUG, trigrams=False):
    self._json = data
    self._min_topic_count = min_topic_count
    self._max_topic_count = max_topic_count
    self._topic_step = topic_step
    self._pos_tags = pos_tags
    self._tfidf = tfidf
    self._trigrams = trigrams

    # Timestamp-based id so each run persists its artefacts to its own directory.
    current_time = datetime.now()
    unique_run_id = str(current_time.strftime("%s.%f"))

    self._persistance = TopicPersistance(base_dir="/tmp/LDA/{0}".format(unique_run_id))
    # Record the run parameters (plus git revision) alongside the outputs so runs are reproducible.
    self._persistance.save_params(dict(min_topic_count=min_topic_count, max_topic_count=max_topic_count,
                                       topic_step=topic_step, tfidf=tfidf, trigrams=trigrams, pos_tags=pos_tags,
                                       timestamp=current_time.isoformat(), git=self.repo_info()))

    # Configure logging exactly once: console, or a log file inside the run directory.
    # (The old-version tail that re-assigned from the removed `documents_dataframe`
    # parameter and unconditionally re-ran basicConfig was merge residue: the former
    # is a NameError and the latter defeated the console/file branch above.)
    # NOTE(review): the `log_level` parameter is currently unused — logging is
    # configured with the module-level LOG_LEVEL constant; confirm which should win.
    if log_to_console:
        logging.basicConfig(format=LOG_FORMAT, level=LOG_LEVEL)
    else:
        log_file = "/tmp/LDA/{0}/topic-model.log".format(unique_run_id)
        print("Saving log file to: " + log_file)
        logging.basicConfig(format=LOG_FORMAT, level=LOG_LEVEL, filename=log_file)
......@@ -253,12 +280,14 @@ class TopicModel:
def export_topic_visualisation_data(self, topic_id):
    # Build the pyLDAvis visualisation for the model trained with `topic_id`
    # topics and save it through the persistance layer.
    # (The old-version line that assembled an unused html_filename was merge
    # residue: the persistance layer now owns the output path.)
    model = self._models[topic_id]
    vis = pyLDAvis.gensim.prepare(model, self._bow, self._dictionary)
    self._persistance.save_visualisation(vis, topic_id)
def repo_info(self):
    """Return a short description of the current git revision.

    Shells out to ``git describe --always`` in the working directory and
    returns its output as a stripped UTF-8 string.
    """
    described = subprocess.check_output(["git", "describe", "--always"])
    return described.decode("utf-8").strip()
# Simple class to hide the details of saving data to the filesystem.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment