Commit cf20538f authored by Kathryn Elliott

Replace pandas with json

I've done this as the input format is fairly simple and I want the raw
json input so I can calculate the sha256 hash of it.
parent 91e6aba2
......@@ -62,24 +62,11 @@ class TopicModel:
def __init__(self, filename, min_topic_count=1, max_topic_count=20, topic_step=1, tfidf=True, pos_tags=DEFAULT_POS_TAGS,
log_to_console=True, log_level=logging.DEBUG, trigrams=False):
self._json = self.load_data(filename)
self._min_topic_count = min_topic_count
self._max_topic_count = max_topic_count
self._topic_step = topic_step
self._pos_tags = pos_tags
self._tfidf = tfidf
self._trigrams = trigrams
current_time = datetime.now()
unique_run_id = str(current_time.strftime("%s.%f"))
self._persistance = TopicPersistance(base_dir = "/tmp/LDA/{0}".format(unique_run_id))
self._persistance.save_params(dict(input_filename=filename, input_file_sha256=hash_input_data(self._json)
min_topic_count=min_topic_count, max_topic_count=max_topic_count,
topic_step=topic_step, tfidf=tfidf, trigrams=trigrams, pos_tags=pos_tags,
timestamp=current_time.isoformat(), git=self.repo_info()))
if log_to_console:
logging.basicConfig(format=LOG_FORMAT, level=LOG_LEVEL)
else:
......@@ -88,8 +75,23 @@ class TopicModel:
logging.basicConfig(format=LOG_FORMAT, level=LOG_LEVEL, filename=log_file)
(self._json, file_hash) = self._load_hasd_and_decode(filename)
logging.basicConfig(format=LOG_FORMAT, level=LOG_LEVEL)
self._min_topic_count = min_topic_count
self._max_topic_count = max_topic_count
self._topic_step = topic_step
self._pos_tags = pos_tags
self._tfidf = tfidf
self._trigrams = trigrams
self._persistance.save_params(json.dumps(dict(input_filename=filename, input_file_sha256=file_hash,
min_topic_count=min_topic_count, max_topic_count=max_topic_count,
topic_step=topic_step, tfidf=tfidf, trigrams=trigrams, pos_tags=pos_tags,
timestamp=current_time.isoformat(), git=self.repo_info())))
self._logging = logging.getLogger(str(self.__class__))
self._logging.info("Setting up stopwords.")
......@@ -108,7 +110,7 @@ class TopicModel:
""" Pre-process each document in the corpus
"""
self._logging.info("Tokenising corpus.")
tokenised_corpus = [self._tokenise(doc) for doc in self._json.values]
tokenised_corpus = [self._tokenise(doc) for doc in self._json]
self._logging.info("Loaded {0} documents.".format(len(tokenised_corpus)))
......@@ -131,8 +133,11 @@ class TopicModel:
self._corpus_size = len(tokenised_corpus)
# Read the file at `filename` with pandas.read_json and return the parsed
# result unchanged.
def load_data(self, filename):
return pandas.read_json(filename)
# Read the whole of `filename` as text and return a 2-tuple:
#   (values of the "content" entry of the decoded JSON,
#    result of self.hash_input_data applied to the raw file text).
# NOTE(review): "hasd" in the name looks like a typo for "hash"; the caller
# uses this spelling too, so renaming needs both sites changed together.
def _load_hasd_and_decode(self, filename):
with open(filename) as fd:
data = fd.read()
# The hash is taken over the raw text as read, not the decoded object.
# NOTE(review): the commit message says this is meant to be a sha256 of the
# input; hash_input_data is not visible here -- confirm it accepts str.
return (json.loads(data)["content"].values(), self.hash_input_data(data))
def corpus(self):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment