Commit 8dfcb73c authored by Gabriel Birnbaum's avatar Gabriel Birnbaum

final push of the day

parent 3dd4ae1c
User-ID,Article-ID,Article-Rating
123,12345,10
124,12393,5
125,4393,3
126,1238,8
127,44838,10
128,32783,9
129,23882,1
130,38293,2
128,12345,8
128,12393,9
129,4393,3
129,1238,5
130,12393,8
130,12393,7
Article-ID,Article-Title,Article-Author,Year-Of-Publication,Source,Number-Of-Words,Difficulty
12345,Title 12345,Author 12345,2018,MEDIUM,50,20
12393,Title 12393,Author 1283,2000,MEDIUM,200,30
4393,Title 4393,Author 12339,2009,MEDIUM,300,15
1238,Title 1238,Author 4392,2011,MEDIUM,500,50
44838,Title 44838,Author 3928,2011,MEDIUM,1000,5
32783,Title 32783,Author 19283,2017,MEDIUM,333,19
23882,Title 23882,Author 87372,2015,MEDIUM,504,23
38293,Title 38293,Author 3782,2005,MEDIUM,843,33
User-ID
123
124
125
126
127
128
129
130
"""
A K-Nearest-Neighbors implementation from https://towardsdatascience.com/how-did-we-build-book-recommender-systems-in-an-hour-part-2-k-nearest-neighbors-and-matrix-c04b3c2ef55c
"""
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from pathlib import Path
import pandas as pd
import numpy as np
import sys
import os
# Resolve this module's own directory (cur_path, where the CSV fixtures live)
# and its parent (src_path), then make the parent importable.
PATH = Path(__file__).resolve()
cur_path = str(PATH.parent)
src_path = str(PATH.parent.parent)
sys.path.append(src_path)
class kNN(object):
    """
    K-Nearest Neighbor implementation for the Data.KI.Bots.Hackathon in Mannheim, Germany, 2018.

    Item-based recommender: each article becomes a row of a user/rating pivot
    table, and similar articles are found by cosine distance over those rows.
    """

    def __init__(self, threshold, userID, neighbors, articleID=None):
        # Minimum number of ratings an article needs to count as "popular".
        self.pop_threshold = threshold
        # ID of the user we are recommending for (informational; not used by main()).
        self.userID = userID
        # Number of neighbors requested from kneighbors() (includes the query row).
        self.neighbors = neighbors
        # Optional article to anchor the query on; when None, main() falls back
        # to the demo article 4393 used by the original script.
        self.articleID = articleID
        # Filled by main(): list of (articleID, cosine distance) tuples.
        self.recommendations = []

    def pd_csv_reader(self, filename):
        """Read a CSV located next to this module into a DataFrame.

        NOTE(review): `error_bad_lines` was removed in pandas >= 2.0 in favor
        of `on_bad_lines`; it is kept because requirements pin pandas 0.22.
        """
        return pd.read_csv(os.path.join(cur_path, filename), sep=',', error_bad_lines=False, encoding="latin-1")

    def process_data(self):
        """Load the three CSVs, join them, and keep only popular articles.

        Returns a DataFrame with columns userID / articleID / articleTitle /
        articleRating / totalRatingCount, restricted to articles whose total
        rating count is at least ``self.pop_threshold``.
        """
        # Read csv files.
        article = self.pd_csv_reader('Articles.csv')
        user = self.pd_csv_reader('Users.csv')
        rating = self.pd_csv_reader('Article-Ratings.csv')
        # Normalize column headers to the names used below.
        article.columns = ['articleID', 'articleTitle', 'articleAuthor', 'yearOfPublication', 'source', 'numWords',
                           'difficulty']
        user.columns = ['userID']
        rating.columns = ['userID', 'articleID', 'articleRating']
        # Join ratings with article metadata, then drop everything but the title.
        combine_article_rating = pd.merge(rating, article, on='articleID')
        to_drop = ['yearOfPublication', 'source', 'articleAuthor', 'numWords', 'difficulty']
        combine_article_rating = combine_article_rating.drop(to_drop, axis=1)
        # Drop rows with a missing articleID.
        combine_article_rating = combine_article_rating.dropna(axis=0, subset=['articleID'])
        # Per-article rating count, used as the popularity signal.
        article_ratingCount = (combine_article_rating
                               .groupby(by=['articleID'])['articleRating']
                               .count()
                               .reset_index()
                               .rename(columns={'articleRating': 'totalRatingCount'})[['articleID', 'totalRatingCount']])
        rating_with_totalRatingCount = combine_article_rating.merge(article_ratingCount, left_on='articleID',
                                                                    right_on='articleID', how='left')
        # Display outputs as floats.
        pd.set_option('display.float_format', lambda x: '%.3f' % x)
        # Popularity cutoff (hyperparameter).
        threshold = int(self.pop_threshold)
        rating_popular_article = rating_with_totalRatingCount.query('totalRatingCount >= @threshold')
        combined = rating_popular_article.merge(user, left_on='userID', right_on='userID', how='left')
        return combined

    def get_user_rating_pivot(self, lst):
        """Build the article x user rating matrix.

        ``lst`` is the list of columns used to drop duplicate ratings before
        pivoting (pivot requires unique (articleID, userID) pairs).

        Returns (sparse csr matrix, dense pivot DataFrame).
        """
        data = self.process_data()
        # Remove duplicate ratings so pivot() does not fail.
        user_rating = data.drop_duplicates(lst)
        # Missing (article, user) cells become 0 = "not rated".
        user_rating_pivot = user_rating.pivot(index='articleID', columns='userID', values='articleRating').fillna(0)
        # Compress to a sparse matrix for the kNN fit.
        user_rating_pivot_matrix = csr_matrix(user_rating_pivot.values)
        return user_rating_pivot_matrix, user_rating_pivot

    def get_recommendations(self, distances, indices, user_rating_pivot, query_index):
        """Turn kneighbors() output into (articleID, distance) tuples.

        The first neighbor is the query article itself (distance 0) and is
        skipped; the header is only printed when there is at least one result.
        """
        flat_distances = distances.flatten()
        flat_indices = indices.flatten()
        if len(flat_distances) == 0:
            return []
        print("Recommendations for {}:\n".format(user_rating_pivot.index[query_index]))
        return [(user_rating_pivot.index[flat_indices[i]], flat_distances[i])
                for i in range(1, len(flat_distances))]

    def get_pivot_row_elem_idx(self, elem, pivot):
        """Return the positional row index of ``elem`` in ``pivot``'s index.

        Returns None when the element is absent, matching the original linear
        scan. Assumes no duplicate labels in the pivot index.
        """
        try:
            # O(1) hash lookup instead of the original O(n) scan.
            return pivot.index.get_loc(elem)
        except KeyError:
            return None

    def main(self):
        """Fit the kNN model and store recommendations for the query article."""
        if self.articleID is not None:
            user_rating_pivot_matrix, user_rating_pivot = self.get_user_rating_pivot(['userID', 'articleID'])
        else:
            user_rating_pivot_matrix, user_rating_pivot = self.get_user_rating_pivot(['userID', 'articleTitle'])
        model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
        model_knn.fit(user_rating_pivot_matrix)
        # Query the caller-supplied article when one was given; otherwise fall
        # back to the demo article the original script hard-coded.
        query_article = self.articleID if self.articleID is not None else 4393
        query_index = self.get_pivot_row_elem_idx(np.int64(query_article), user_rating_pivot)
        distances, indices = model_knn.kneighbors(
            user_rating_pivot.iloc[query_index, :].values.reshape(1, -1),
            n_neighbors=self.neighbors)
        self.recommendations = self.get_recommendations(distances, indices, user_rating_pivot, query_index)
        print(self.recommendations)
if __name__ == '__main__':
    # Run the recommender as a script with the demo hackathon settings.
    recommender = kNN(threshold=1, userID=12345, neighbors=5)
    recommender.main()
numpy==1.14.2
pandas==0.22.0
python-dateutil==2.7.2
pytz==2018.4
scikit-learn==0.19.1
scipy==1.0.1
six==1.11.0
# NOTE: the 'sklearn==0.0' PyPI shim has been removed — it is deprecated and
# only existed to pull in scikit-learn, which is already pinned above.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment