Commit 73511c16 authored by Kathryn Elliott

Separate bigrams & trigrams

This means bi-grams & tri-grams can be specified separately on the
command line.
parent e591c82a
......@@ -17,9 +17,11 @@ parser.add_argument('--log-level', metavar='STRING', default="info",
parser.add_argument('--log-to-console', dest='log_to_console', action='store_true', help='Write log messages to the console, rather than a log file')
parser.add_argument('--tfidf', dest='tfidf', action='store_true', help='Process the term document matrix using tf-idf')
parser.add_argument('--trigrams', dest='trigrams', action='store_true', help='Generate tri-grams')
parser.add_argument('--bigrams', dest='bigrams', action='store_true', help='Generate bi-grams')
parser.set_defaults(tfidf=False)
parser.set_defaults(trigrams=False)
parser.set_defaults(bigrams=False)
parser.set_defaults(log_to_console=False)
args = vars(parser.parse_args())
......
......@@ -57,9 +57,10 @@ class TopicModel:
# pos_tags -> Parts of Speech to used for processing. Anything not in this list will be ignored
# tfidf -> Optionally process the corpus using tfidf
# trigrams -> Optionally generate tri-grams
# bigrams -> Optionally generate bi-grams
# log_level -> The level at which to log
def __init__(self, filename, min_topic_count=1, max_topic_count=20, topic_step=1, tfidf=True, pos_tags=DEFAULT_POS_TAGS,
log_to_console=True, log_level=logging.DEBUG, trigrams=False):
log_to_console=True, log_level=logging.DEBUG, bigrams=False, trigrams=False):
current_time = datetime.now()
unique_run_id = str(current_time.strftime("%s.%f"))
......@@ -81,13 +82,14 @@ class TopicModel:
self._topic_step = topic_step
self._pos_tags = pos_tags
self._tfidf = tfidf
self._bigrams = bigrams
self._trigrams = trigrams
self._persistance.save_params(json.dumps(dict(input_filename=filename, input_file_sha256=file_hash,
min_topic_count=min_topic_count, max_topic_count=max_topic_count,
topic_step=topic_step, tfidf=tfidf, trigrams=trigrams, pos_tags=pos_tags,
timestamp=current_time.isoformat(), git=self.repo_info())))
topic_step=topic_step, tfidf=tfidf, trigrams=trigrams, bigrams=bigrams,
pos_tags=pos_tags, timestamp=current_time.isoformat(), git=self.repo_info())))
self._logging = logging.getLogger(str(self.__class__))
......@@ -117,13 +119,22 @@ class TopicModel:
for doc_tokens in tokenised_corpus:
self._corpus.append(self._lemmatise(self._remove_stopwords(doc_tokens)))
if self._trigrams:
trigrams = self._build_ngrams(self._corpus)
if self._bigrams or self._trigrams:
(bigrams, trigrams) = self._build_ngrams(self._corpus)
if self._bigrams:
self._logging.info("Appending bi-grams to the dictionary as specified.")
for n in range(len(self._corpus)):
for token in bigrams[self._corpus[n]]:
self._corpus[n].append(token)
if self._trigrams:
self._logging.info("Appending tri-grams to the dictionary as specified.")
for n in range(len(self._corpus)):
for token in trigrams[self._corpus[n]]:
self._corpus[n].append(token)
self._logging.info("Appending tri-grams to the dictionary as specified.")
for n in range(len(self._corpus)):
for token in trigrams[self._corpus[n]]:
self._corpus[n].append(token)
else:
self._logging.info("Skipping tri-gram processing.")
......@@ -190,7 +201,9 @@ class TopicModel:
bigram_phrases = Phrases(tokenised_corpus, **N_GRAM_OPTIONS)
self._logging.info("Building tri-grams")
return Phraser(Phrases(bigram_phrases[tokenised_corpus], **N_GRAM_OPTIONS))
trigram_phrases = Phraser(Phrases(bigram_phrases[tokenised_corpus], **N_GRAM_OPTIONS))
return (bigram_phrases, trigram_phrases)
def run_lda(self):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment