Commit 5709bb1b authored by Kathryn Elliott

Add basic functionality to export the document to document-id map.

parent 73511c16
@@ -2,6 +2,7 @@
import argparse
import logging
import sys
from topic_model import TopicModel
@@ -13,8 +14,10 @@ parser.add_argument('--max-topic-count', metavar='N', default=20,
parser.add_argument('--topic-step', metavar='N', default=1, help='The topic step. For example, a step of 2 would give 1, 3, 5', type=int)
parser.add_argument('--pos-tags', metavar='[POS tags]', default=None, help='The Part Of Speech tags to extract from the text', nargs='*')
parser.add_argument('--log-level', metavar='STRING', default="info", help='The log level (debug, info, etc)', )
parser.add_argument('--export-doc-map', metavar='STRING', default=None, help='Export the document to document-id mapping to the given file', )
parser.add_argument('--log-to-console', dest='log_to_console', action='store_true', help='Write log messages to the console, rather than a log file')
parser.add_argument('--tfidf', dest='tfidf', action='store_true', help='Process the term document matrix using tf-idf')
parser.add_argument('--trigrams', dest='trigrams', action='store_true', help='Generate tri-grams')
parser.add_argument('--bigrams', dest='bigrams', action='store_true', help='Generate bi-grams')
@@ -26,11 +29,20 @@ parser.set_defaults(log_to_console=False)
args = vars(parser.parse_args())
doc_map_filename = args.pop('export_doc_map')
topic_model = TopicModel(args.pop('filename'), **args)
if doc_map_filename:
    topic_model.export_document_id_map(doc_map_filename)
    sys.exit()
topic_model.run_lda()
topic_model.calculate_coherence_models()
topic_model.export_document_id_map()
topic_model.export_all_topics_per_documents()
topic_model.export_all_topic_visualisation_data()
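
The driver pops export_doc_map (and filename) out of the parsed-args dict before constructing the model, so the remaining keys can be forwarded as keyword arguments without TopicModel receiving options it does not know about. Below is a minimal, self-contained sketch of that pop-then-unpack pattern; Consumer is a hypothetical stand-in for the real TopicModel.

# Sketch of the pop-then-unpack pattern used by the driver above.
# Consumer is a hypothetical stand-in for TopicModel.
import argparse

class Consumer:
    def __init__(self, filename, tfidf=False, bigrams=False):
        self.filename, self.tfidf, self.bigrams = filename, tfidf, bigrams

parser = argparse.ArgumentParser()
parser.add_argument('filename')
parser.add_argument('--export-doc-map', default=None)
parser.add_argument('--tfidf', action='store_true')
parser.add_argument('--bigrams', action='store_true')

args = vars(parser.parse_args(['input.json', '--export-doc-map', 'map.tsv']))
doc_map_filename = args.pop('export_doc_map')       # consumed by the driver only
consumer = Consumer(args.pop('filename'), **args)   # leftover keys match __init__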
@@ -144,8 +144,16 @@ class TopicModel:
    def _load_hasd_and_decode(self, filename):
        with open(filename) as fd:
            data = fd.read()

        content = json.loads(data)["content"]

        n = 0
        self._document_docid_map = []
        for key in content.keys():
            self._document_docid_map.append((n, key))
            n += 1

        return (content.values(), self.hash_input_data(data))
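
For illustration, the mapping built above can be reproduced in isolation with enumerate(); the JSON shape ({"content": {name: text, ...}}) is an assumption inferred from the loader.

# Standalone sketch of the document/document-id mapping (assumed JSON shape).
import json

data = '{"content": {"report-a.txt": "first text", "report-b.txt": "second text"}}'
content = json.loads(data)["content"]

# enumerate() yields the same (running id, document name) pairs as the
# manual counter loop above.
document_docid_map = list(enumerate(content.keys()))
print(document_docid_map)   # [(0, 'report-a.txt'), (1, 'report-b.txt')]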
    def corpus(self):
@@ -318,6 +326,20 @@ class TopicModel:
            self.export_topics_per_documents(topic_id)

    def export_document_id_map(self, path=None):
        fields = ["document_id", "document_name"]
        if path is None:
            path = self._persistance.document_map_csv_file_path()
        self._logging.info("Exporting document to document_id map")

        with open(path, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fields, delimiter="\t")
            writer.writeheader()
            for (k, v) in self._document_docid_map:
                writer.writerow({"document_id": k, "document_name": v})
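
Despite the .csv suffix of the target path, the writer above uses a tab delimiter, so any reader has to match it. A short sketch of loading the exported map back in, with an assumed file name:

# Sketch: read the exported map back; the delimiter must match the writer.
import csv

with open("document-document-id.csv", newline='') as fd:
    reader = csv.DictReader(fd, delimiter="\t")
    id_to_name = {int(row["document_id"]): row["document_name"] for row in reader}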
    def print_topics_per_documents(self, topic_id):
        model = self._models[topic_id]
@@ -430,9 +452,6 @@ class TopicPersistance:
    def csv_file_path(self, model_id):
        return self._mk_part(model_id, "topic-documents.csv")

    def open_topic_documents_csv(self, model_id):
        with open(self._mk_part(model_id, "topic-documents"), 'w', newline='') as csvfile:
            try:
                yield csvfile
            finally:
                csvfile.close()

    def document_map_csv_file_path(self):
        return str(self._base_dir.joinpath("document-document-id.csv"))
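
document_map_csv_file_path simply joins the map file name onto the persistence base directory; a tiny sketch with an assumed base path:

# Sketch of the joinpath construction, with an assumed base directory.
from pathlib import Path

base_dir = Path("output/run-01")
print(str(base_dir.joinpath("document-document-id.csv")))   # output/run-01/document-document-id.csv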