Commit 844565b2 authored by Kathryn Elliott's avatar Kathryn Elliott

Refactoring code:

* Commented out caching, as I think this is causing problems
* Changed data source, max & min topic counts
parent c6d1c43d
@@ -46,8 +46,8 @@ import warnings
# time. Using 20-group newsgroup dataset which contains 11,000 posts. Dataset
# available from https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
# Information about the origin of this dataset is here: http://kdd.ics.uci.edu/databases/20newsgrou$
MIN_TOPIC_COUNT = 6
MAX_TOPIC_COUNT = 10
MIN_TOPIC_COUNT = 5
MAX_TOPIC_COUNT = 20
# MIN_BIGRAM_COUNT is the minimum number of times a token pair occurs before
# it is counted as a bigram. Gensim's default is 5.
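For context on MIN_BIGRAM_COUNT and PHRASES_THRESHOLD: a minimal sketch of how values like these are typically passed to gensim's phrase detection. The token lists here are illustrative, not taken from this file.

```python
from gensim.models.phrases import Phrases, Phraser

# Illustrative token lists; the real script builds these from the newsgroup posts.
tokenised_docs = [["machine", "learning", "is", "fun"],
                  ["machine", "learning", "models", "need", "data"]]

# min_count: a token pair must co-occur at least this many times to become a bigram.
# threshold: higher values keep fewer, stronger phrases.
bigram_model = Phrases(tokenised_docs, min_count=5, threshold=100)
bigram_phraser = Phraser(bigram_model)    # frozen, lookup-only version of the model

print(bigram_phraser[tokenised_docs[0]])  # tokens, with any detected bigrams joined by "_"
```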
@@ -64,7 +64,7 @@ PHRASES_THRESHOLD = 100
N_GRAM_DEBUG_LOGGING = True
DEBUG_OUTPUT_PATH = "debugOutput/"
# TODO: Change code to read the data file from the command line? ARGV?
DATASET_FILE = 'newsgroups_cleaned_100.json'
DATASET_FILE = 'newsgroups-cleaned.json'
ALLOWED_POS_TAGS = ['NOUN', 'ADJ', 'VERB', 'ADV']
STOPWORDS = stopwords.words('english')
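The TODO above asks about reading the data file from the command line. One possible shape for that, keeping the new filename as the default; this is only a suggestion, not part of the commit.

```python
import argparse

parser = argparse.ArgumentParser(description="LDA topic modelling of a newsgroup dump")
parser.add_argument("dataset_file", nargs="?", default="newsgroups-cleaned.json",
                    help="JSON dataset to model")
args = parser.parse_args()

DATASET_FILE = args.dataset_file
```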
@@ -77,7 +77,7 @@ LDA_MODEL_PASSES=10
LDA_MODEL_ALPHA='auto'
LDA_MODEL_PER_WORD_TOPICS=True
CACHE_DIRECTORY = "tmp/models"
# CACHE_DIRECTORY = "tmp/models"
if DEBUG_OUTPUT_PATH:
@@ -85,11 +85,11 @@ if DEBUG_OUTPUT_PATH:
os.makedirs(DEBUG_OUTPUT_PATH+"/bigram", exist_ok=True)
os.makedirs(DEBUG_OUTPUT_PATH+"/trigram", exist_ok=True)
if os.path.exists(CACHE_DIRECTORY) == False:
os.makedirs(CACHE_DIRECTORY)
CACHED=False
else:
CACHED=True
# if os.path.exists(CACHE_DIRECTORY) == False:
# os.makedirs(CACHE_DIRECTORY)
# CACHED=False
# else:
# CACHED=True
# Enable logging, set overall level and format of messages.
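Since the commit comments the caching block out rather than deleting it, one alternative is to gate the same logic behind a single constant and rely on gensim's LdaModel.save/load. This is purely a sketch; the ENABLE_CACHE flag and the helper names are assumptions, not part of the script.

```python
import os
import gensim

ENABLE_CACHE = False            # assumed flag, not in the script; flip once caching is trusted
CACHE_DIRECTORY = "tmp/models"

if ENABLE_CACHE:
    os.makedirs(CACHE_DIRECTORY, exist_ok=True)

def save_model(model, path):
    """Persist a trained LdaModel to disk."""
    model.save(path)

def load_model(path):
    """Reload a previously saved LdaModel."""
    return gensim.models.ldamodel.LdaModel.load(path)
```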
@@ -106,35 +106,28 @@ warnings.filterwarnings("ignore", category=DeprecationWarning)
# !!!!!!! Defining functions !!!!!!!
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# !!!! Import the dataset, convert to a list, clean and produce data corpus !!!!
def make_corpus(dataset_file):
def sentence_to_words(sentences):
for sentence in sentences:
yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) # deacc=Tr$
pandas_dataframe = pandas.read_json(dataset_file) # convert the json formatted dataset to a pandas dataframe
# Logging for the import of dataset and checking data.
logging.info("Import Dataset: {0}".format(DATASET_FILE))
logging.debug("Newsgroups: {0}s".format(", ".join(pandas_dataframe.target_names.unique())))
logging.debug("First five datafile items:\n{0}".format(pandas_dataframe.head(5)))
def tokenise_documents(documents):
for document in documents:
yield(gensim.utils.simple_preprocess(str(document), deacc=True))
data = pandas_dataframe.content.values.tolist() # convert pandas dataframe to a list
logging.debug("Removing emails.")
data1 = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
# !!!! Import the dataset, convert to a list, clean and produce data corpus !!!!
def make_corpus(dataset_file):
pandas_dataframe = pandas.read_json(dataset_file) # convert the json formatted dataset to a pandas dataframe
logging.debug("Removing new lines.")
data2 = [re.sub('\s+', ' ', sent) for sent in data1]
# Logging for the import of dataset and checking data.
logging.info("Import Dataset: {0}".format(DATASET_FILE))
logging.debug("Newsgroups: {0}s".format(", ".join(pandas_dataframe.target_names.unique())))
logging.debug("First five datafile items:\n{0}".format(pandas_dataframe.head(5)))
logging.debug("Removing single quotes.") # Remove single quotes
data3 = [re.sub("\'", "", sent) for sent in data2]
data = pandas_dataframe.content.values.tolist() # convert pandas dataframe to a list
data_corpus_of_words = list(sentence_to_words(data3))
data_corpus_of_words = list(tokenise_documents(data))
logging.debug("First document (words): {0}".format(format(data[:1][0])))
logging.debug("First document (tokenised): {0}".format(format(data_corpus_of_words[:1][0])))
logging.debug("First document (words): {0}".format(format(data[:1][0])))
logging.debug("First document (tokenised): {0}".format(format(data_corpus_of_words[:1][0])))
return data_corpus_of_words
return data_corpus_of_words
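The refactor above replaces the nested sentence_to_words generator with a top-level tokenise_documents and drops the regex cleaning passes, since the dataset is now a pre-cleaned dump. A compact sketch of the new shape of make_corpus, assuming (as the diff suggests) that the JSON file has a `content` column; the logging calls are omitted for brevity.

```python
import gensim
import pandas

def tokenise_documents(documents):
    # simple_preprocess lowercases and tokenises; deacc=True also strips accents/punctuation.
    for document in documents:
        yield gensim.utils.simple_preprocess(str(document), deacc=True)

def make_corpus(dataset_file):
    dataframe = pandas.read_json(dataset_file)      # JSON dataset -> pandas dataframe
    documents = dataframe.content.values.tolist()   # 'content' column -> list of raw posts
    return list(tokenise_documents(documents))      # one token list per post
```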
# !!!! Remove stopwords !!!!
@@ -209,11 +202,11 @@ def generate_models(corpus, id2word, min_topic_count, max_topic_count):
id2word=id2word,
num_topics=n,
random_state=LDA_MODEL_RANDOM_STATE,
LDA_MODEL_UPDATE_EVERY=1,
LDA_MODEL_CHUNKSIZE=100,
LDA_MODEL_PASSES=10,
LDA_MODEL_ALPHA='auto',
LDA_MODEL_PER_WORD_TOPICS=True)
update_every=LDA_MODEL_UPDATE_EVERY,
chunksize=LDA_MODEL_CHUNKSIZE,
passes=LDA_MODEL_PASSES,
alpha=LDA_MODEL_ALPHA,
per_word_topics=LDA_MODEL_PER_WORD_TOPICS)
model_list.append(model)
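The fix in this hunk passes the tuning constants to the correct LdaModel keyword arguments instead of using the constant names themselves as parameters. As a sanity check, a sketch of what such a sweep over topic counts typically looks like; the random seed and the inclusive upper bound are assumptions, since neither is visible in the diff.

```python
import gensim

def generate_models(corpus, id2word, min_topic_count, max_topic_count):
    """Train one LdaModel per candidate topic count and return them in order."""
    model_list = []
    for n in range(min_topic_count, max_topic_count + 1):   # inclusive upper bound assumed
        model = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            id2word=id2word,
            num_topics=n,
            random_state=100,       # stand-in for LDA_MODEL_RANDOM_STATE (value not in the diff)
            update_every=1,         # LDA_MODEL_UPDATE_EVERY
            chunksize=100,          # LDA_MODEL_CHUNKSIZE
            passes=10,              # LDA_MODEL_PASSES
            alpha='auto',           # LDA_MODEL_ALPHA
            per_word_topics=True)   # LDA_MODEL_PER_WORD_TOPICS
        model_list.append(model)
    return model_list
```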
@@ -258,6 +251,8 @@ def find_maximally_coherent_model(models, lemmatised_data, dictionary, min_topic
max_n = n + min_topic_number
max_model = models[n]
return((max_n, max_model))
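find_maximally_coherent_model maps the index of the best model back to a topic count via `n + min_topic_number`. A sketch of the usual coherence-based selection with gensim's CoherenceModel; the 'c_v' measure is an assumption, the original may use a different one.

```python
from gensim.models.coherencemodel import CoherenceModel

def find_maximally_coherent_model(models, lemmatised_data, dictionary, min_topic_number):
    """Return (topic_count, model) for the model with the highest coherence score."""
    scores = []
    for model in models:
        coherence = CoherenceModel(model=model, texts=lemmatised_data,
                                   dictionary=dictionary, coherence='c_v')
        scores.append(coherence.get_coherence())
    best_index = scores.index(max(scores))
    # models[0] was trained with min_topic_number topics, models[1] with one more, and so on.
    return (best_index + min_topic_number, models[best_index])
```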
@@ -275,7 +270,7 @@ def log_topic_keywords(models, min_topic_count, max_topic_count):
""" Log keywords for the all models. """
logging.debug("All topics:")
for n in range(len(models)):
log_topic_n_keywords(models[n], min_topic_count)
log_topic_n_keywords(models[n], min_topic_count + n)
@@ -327,7 +322,10 @@ def format_topics_sentences(ldamodel, corpus, texts):
def output_topics(ldamodel, corpus, fd=sys.stdout, number_of_topics=10):
top_topics = {}
for topic_number in range(number_of_topics):
print("####################################################")
print("number of topics: " + str(number_of_topics))
for topic_number in range(ldamodel.num_topics):
word_probability = ldamodel.show_topic(topic_number)
top_topics[topic_number] = [word for word, probability in word_probability]
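The change in output_topics iterates over `ldamodel.num_topics` rather than a caller-supplied count, which avoids asking for topic ids the model does not have. For reference, a small sketch of what show_topic returns and how the keyword dictionary is built; the helper name here is illustrative.

```python
def top_topic_keywords(ldamodel):
    """Map each topic id to its top keywords, discarding the probabilities."""
    top_topics = {}
    for topic_number in range(ldamodel.num_topics):
        # show_topic returns a list of (word, probability) pairs for one topic.
        word_probability = ldamodel.show_topic(topic_number)
        top_topics[topic_number] = [word for word, _probability in word_probability]
    return top_topics
```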
@@ -347,16 +345,15 @@ document_words_no_stopwords = remove_stopwords(corpus_pre_stopwords)
#logging.debug(len(document_words_no_stopwords))
logging.debug("stopwords: " + str(STOPWORDS))
if (CACHED):
logging.info("Loading models from disc")
models = load_models(CACHE_DIRECTORY, MAX_TOPIC_COUNT - MIN_TOPIC_COUNT - 1)
# if (CACHED):
# logging.info("Loading models from disc")
# models = load_models(CACHE_DIRECTORY, MAX_TOPIC_COUNT - MIN_TOPIC_COUNT - 1)
logging.debug("First document (Document Without Stopwords): {0}".format(corpus_no_stopwords[:1][0]))
# logging.debug("First document (Document Without Stopwords): {0}".format(document_words_no_stopwords[:1][0]))
data_words_n_gram = make_bigrams_and_trigrams(document_words_no_stopwords)
#logging.debug("First document (Bigrams): {0}".format(data_words_bigrams[:1][0]))
data_lemmatized = lemmatization(data_words_bigram, allowed_postags=ALLOWED_POS_TAGS) # Lemmatise the text
data_lemmatized = lemmatization(data_words_n_gram, allowed_postags=ALLOWED_POS_TAGS) # Lemmatise the text
#logging.debug("First document (lemmatised): {0}".format(pformat(data_lemmatized[:1][0])))
@@ -374,7 +371,8 @@ for n in range(len(corpus_Term_Document_Frequency[:2])):
logging.debug("Term frequency for document: {0}: {1}".format(n, term_and_freq))
topic_models = generate_models(corpus_Term_Document_Frequency, id2word, MIN_TOPIC_COUNT, MAX_TOPIC_COUNT)
save_models(topic_models, CACHE_DIRECTORY, MIN_TOPIC_COUNT)
# save_models(topic_models, CACHE_DIRECTORY, MIN_TOPIC_COUNT)
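For completeness, the term-document frequency corpus and id2word mapping referenced here are normally built from the lemmatised texts with gensim's Dictionary and doc2bow. A minimal sketch, with an illustrative stand-in for the real lemmatised corpus:

```python
from gensim import corpora

# Illustrative stand-in for the lemmatised corpus produced above.
data_lemmatized = [["cat", "sit", "mat"], ["dog", "bark", "loud", "night"]]

id2word = corpora.Dictionary(data_lemmatized)
corpus_Term_Document_Frequency = [id2word.doc2bow(text) for text in data_lemmatized]
# Each document is now a list of (token_id, count) pairs.
```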
@@ -386,16 +384,16 @@ save_models(topic_models, CACHE_DIRECTORY, MIN_TOPIC_COUNT)
logging.debug("{0}d {1}".format(topic_number, maximally_coherent_model))
log_topic_keywords(models, MIN_TOPIC_COUNT, MAX_TOPIC_COUNT)
log_topic_keywords(topic_models, MIN_TOPIC_COUNT, MAX_TOPIC_COUNT)
logging.info("Maximally coherent topic: {0}d".format(topic_number))
log_topic_n_keywords(maximally_coherent_model, topic_number)
logging.info("Maximally coherent topic: {0}".format(topic_number))
#log_topic_keywords(maximally_coherent_model, topic_number)
t = MAX_TOPIC_COUNT - MIN_TOPIC_COUNT - 1
output_topics(maximally_coherent_model, corpus_Term_Document_Frequency, number_of_topics=t)
pandas_dataframe_topic_sents_keywords = format_topics_sentences(maximally_coherent_model, corpus_Term_Document_Frequency, corpus_no_stopwords)
pandas_dataframe_topic_sents_keywords = format_topics_sentences(maximally_coherent_model, corpus_Term_Document_Frequency, document_words_no_stopwords)
logging.info("Topic sentence keywords:\n{0}".format(pandas_dataframe_topic_sents_keywords))
@@ -439,9 +437,9 @@ topic_contribution = round(topic_counts/topic_counts.sum(), 4) # Percentage of d
topic_num_keywords = pandas_dataframe_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']] # Topic number and keywords
pandas_dataframe_dominant_topics = pandas.concat([topic_num_keywords, topic_counts, topic_counts, topic_contribution], axis=1) # Concatenate columns
pandas_dataframe_dominant_topics = pandas.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1) # Concatenate columns
pandas_dataframe_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']
pandas_dataframe_dominant_topics.columns = ['Topic_Number_Keywords', 'Dominant_Topic', 'Num_Documents', 'Perc_Documents']
pandas_dataframe_dominant_topics
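The final hunk fixes the concatenation (topic_counts was included twice) and the column labels. A sketch of the intended summary table, using an illustrative stand-in dataframe and assuming Dominant_Topic holds one topic id per document:

```python
import pandas

# Illustrative stand-in for pandas_dataframe_topic_sents_keywords.
df = pandas.DataFrame({
    "Dominant_Topic": [0, 0, 1, 2, 2, 2],
    "Topic_Keywords": ["a, b", "a, b", "c, d", "e, f", "e, f", "e, f"],
})

topic_counts = df["Dominant_Topic"].value_counts()                 # documents per topic
topic_contribution = round(topic_counts / topic_counts.sum(), 4)   # share of documents per topic

summary = pandas.concat([topic_counts, topic_contribution], axis=1)
summary.columns = ["Num_Documents", "Perc_Documents"]
print(summary)
```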