Commit 355dc3f5 authored by Lucas Moura

Imported Upstream version 0.5.4

parent b5d38b82
......@@ -4,24 +4,13 @@ AppRecommender - Application recommender for GNU/Linux systems
Install dependencies
---------------------
# apt-get install python python-xapian python-apt python-cluster python-webpy python-simplejson python-numpy apt-xapian-index python-xdg debtags python-pip python-sklearn python-nltk python-matplotlib -y
# apt-get install python python-xapian python-apt python-cluster python-simplejson python-numpy apt-xapian-index python-xdg debtags python-pip python-sklearn python-matplotlib -y
sudo update-apt-xapian-index
pip install setuptools
Run AppRecommender web UI
--------------------------
$ cd ./src/web
$ ./server.py
Open a browser and access http://localhost:8080
More info at https://github.com/tassia/AppRecommender/wiki
Run AppRecommender in Terminal
------------------------------
......
......@@ -185,7 +185,7 @@ class Config(Singleton):
Configure application logger and log level.
"""
self.logger = logging.getLogger('') # root logger is used by default
self.logger.setLevel(logging.DEBUG)
self.logger.setLevel(logging.INFO)
if self.debug == 1:
log_level = logging.DEBUG
......
# Config file for AppRecommender
[general]
# logging level
debug = 0
verbose = 0
# output file
output = /dev/null
[data_sources]
# path to apt-xapian-index
axi = /var/lib/apt-xapian-index/index
# DDE url
dde_url = http://dde.debian.net/dde/q/udd/packages/all/%s?t=json
# index mode: old, reindex, cluster or recluster
index_mode = old
# path to popcon index
popcon_index = ~/.app-recommender/popcon_index
# path to popcon submissions dir
popcon_dir = ~/.app-recommender/popcon_dir
# path to popcon clusters dir
clusters_dir = ~/.app-recommender/clusters_dir
# number of medoids for clustering
k_medoids = 100
[recommender]
# recommendation strategy
strategy = cb
# search weighting scheme ('trad' or 'bm25')
weight = bm25
# user profile size
profile_size = 50
#!/usr/bin/env python
"""
AppRecommender - A GNU/Linux application recommender
"""
__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
__license__ = """
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import sys
sys.path.insert(0, '../')
import logging
import datetime
from apprecommender.config import Config
from apprecommender.recommender import Recommender
from apprecommender.user import LocalSystem
from apprecommender.error import Error
if __name__ == '__main__':
    # Request a recommendation for the local system, print it, and log how
    # long the computation took.
    try:
        cfg = Config()
        rec = Recommender(cfg)
        user = LocalSystem()
        begin_time = datetime.datetime.now()
        logging.debug("Recommendation computation started at %s", begin_time)
        print(rec.get_recommendation(user))
        end_time = datetime.datetime.now()
        logging.debug("Recommendation computation completed at %s", end_time)
        delta = end_time - begin_time
        # total_seconds() covers the whole interval; the 'seconds' attribute
        # alone leaves out the days component of the timedelta.
        logging.info("Time elapsed: %d seconds.", delta.total_seconds())
    except Error:
        logging.critical("Aborting process. Use '--debug' for more details.")
#!/usr/bin/env python
"""
Clustering - A python script to perform clustering of popcon data.
"""
__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
__license__ = """
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import sys
sys.path.insert(0, '../')
import logging
import datetime
from apprecommender.config import Config
from apprecommender.data import PopconXapianIndex
from apprecommender.error import Error
if __name__ == '__main__':
    # Time the construction of PopconXapianIndex and log the elapsed time
    # together with the number of medoids and the cluster dispersion.
    try:
        cfg = Config()
        begin_time = datetime.datetime.now()
        logging.info("Clustering computation started at %s", begin_time)
        pxi = PopconXapianIndex(cfg)
        end_time = datetime.datetime.now()
        logging.info("Clustering computation completed at %s", end_time)
        delta = end_time - begin_time
        # total_seconds() covers the whole interval; the 'seconds' attribute
        # alone leaves out the days component of the timedelta.
        logging.info("Time elapsed: %d seconds.", delta.total_seconds())
        logging.info("Medoids: %d\tDispersion:%f",
                     cfg.k_medoids, pxi.cluster_dispersion)
    except Error:
        logging.critical("Aborting process. Use '--debug' for more details.")
#!/usr/bin/env python
"""
CrossValidation - python module for classes and methods related to
recommenders evaluation.
"""
__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
__license__ = """
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import sys
sys.path.insert(0, '../')
import logging
import datetime
from apprecommender.config import Config
from apprecommender.evaluation import (Precision, Recall, F1, Accuracy,
SimpleAccuracy, CrossValidation)
from apprecommender.recommender import Recommender
from apprecommender.user import LocalSystem
from apprecommender.error import Error
if __name__ == '__main__':
    # Run a cross-validation of the configured recommender against the local
    # system, print the result, and log how long the run took.
    try:
        cfg = Config()
        rec = Recommender(cfg)
        # Two spaces after the colon reproduce the output of the former
        # two-item print statement.
        print("\nRecommender strategy:  %s" % (rec.strategy.description,))
        user = LocalSystem()
        begin_time = datetime.datetime.now()
        logging.debug("Cross-validation started at %s", begin_time)
        metrics = [Precision(), Recall(), F1(), Accuracy(), SimpleAccuracy()]
        # NOTE(review): the meaning of the numeric arguments (0.9, 10, 0.1)
        # is defined by CrossValidation, which is not visible here.
        validation = CrossValidation(0.9, 10, rec, metrics, 0.1)
        validation.run(user)
        print(validation)
        end_time = datetime.datetime.now()
        logging.debug("Cross-validation completed at %s", end_time)
        delta = end_time - begin_time
        # total_seconds() covers the whole interval; the 'seconds' attribute
        # alone leaves out the days component of the timedelta.
        logging.info("Time elapsed: %d seconds.", delta.total_seconds())
    except Error:
        logging.critical("Aborting process. Use '--debug' for more details.")
#!/usr/bin/env python
"""
DemoRecommender - demonstration of a GNU/Linux application recommender.
"""
__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
__license__ = """
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import os
import sys
import commands
import re
import xapian
from debian import debtags
from apprecommender.strategy import PkgMatchDecider
DB_PATH = "/var/lib/debtags/package-tags"
INDEX_PATH = os.path.expanduser("~/.app-recommender/debtags_index")
def load_debtags_db(path):
    """ Load the debtags database stored in the file at 'path'.

    A predicate rejecting tags that match 'special::*' or '*::TODO' is
    handed to the debtags reader. If the file cannot be read, an error is
    written to stderr and the program exits with status 1.
    """
    debtags_db = debtags.DB()
    tag_filter = re.compile(r"^special::.+$|^.+::TODO$")
    try:
        # The context manager closes the file even when reading fails.
        with open(path, "r") as tags_file:
            debtags_db.read(tags_file, lambda x: not tag_filter.match(x))
    except IOError:
        sys.stderr.write("IOError: could not open debtags file \'%s\'\n" %
                         path)
        sys.exit(1)
    return debtags_db
def get_system_pkgs():
    """ Return the list of packages selected as installed on this system.

    The output of 'dpkg --get-selections' is parsed line by line; each line
    is expected to hold a package name and a selection state separated by
    whitespace. Only packages whose state is 'install' or 'hold' are
    returned.
    """
    dpkg_output = commands.getoutput('/usr/bin/dpkg --get-selections')
    pkgs = []
    for line in dpkg_output.splitlines():
        fields = line.split()
        # Match the state column exactly: a blanket replace of 'install'
        # would also mangle package names that contain that word and leave
        # stray tokens behind for states such as 'deinstall'.
        if len(fields) == 2 and fields[1] in ('install', 'hold'):
            pkgs.append(fields[0])
    return pkgs
def get_most_relevant_tags(debtags_db, pkgs_list):
    """ Return the most relevant tags considering a list of packages.

    The tags of the packages in 'pkgs_list' are sorted in ascending order of
    the debtags relevance index, and the last (at most) 50 of them are
    returned as one space-separated string passed through normalize_tags().
    """
    relevant_db = debtags_db.choose_packages(pkgs_list)
    relevance_index = debtags.relevance_index_function(debtags_db, relevant_db)
    # A key function yields the same order as comparing relevance values
    # pairwise, but computes each tag's relevance only once.
    sorted_relevant_tags = sorted(relevant_db.iter_tags(),
                                  key=relevance_index)
    return normalize_tags(' '.join(sorted_relevant_tags[-50:]))
def normalize_tags(string):
    """ Return 'string' with every ':' turned into '_' and every '-' turned
    into an apostrophe, so that tags can be indexed and retrieved.
    """
    without_colons = string.replace(':', '_')
    return without_colons.replace('-', "'")
def create_debtags_index(debtags_db, index_path):
    """ Build a xapian index holding the debtags info of 'debtags_db' and
    store it at 'index_path'.

    One document is added per package: its data is the package name and its
    terms are the normalized tags of that package. The writable database is
    returned.
    """
    if not os.path.exists(index_path):
        os.makedirs(index_path)
    print("Creating new debtags xapian index at \'%s\'" % index_path)
    index = xapian.WritableDatabase(index_path,
                                    xapian.DB_CREATE_OR_OVERWRITE)
    for pkg_name, pkg_tags in debtags_db.iter_packages_tags():
        document = xapian.Document()
        document.set_data(pkg_name)
        for pkg_tag in pkg_tags:
            document.add_term(normalize_tags(pkg_tag))
        doc_id = index.add_document(document)
        # Two spaces: the literal's trailing space plus the separator the
        # two-item print statement used to insert.
        print("indexing  %s" % (doc_id,))
    return index
def load_debtags_index(debtags_db, reindex):
    """ Return the debtags xapian index.

    When 'reindex' is false the existing index at INDEX_PATH is opened; if
    that fails, or when 'reindex' is true, a new index is created from
    'debtags_db'.
    """
    if not reindex:
        print("Opening existing debtags xapian index at \'%s\'" % INDEX_PATH)
        try:
            return xapian.Database(INDEX_PATH)
        except xapian.DatabaseError:
            # Fall through to rebuilding the index below.
            print("Could not open debtags xapian index")
    return create_debtags_index(debtags_db, INDEX_PATH)
if __name__ == '__main__':
    # A single optional argument replaces the debtags database path and
    # forces the index to be rebuilt; more arguments are a usage error.
    reindex = 0
    argument_count = len(sys.argv)
    if argument_count == 2:
        DB_PATH = sys.argv[1]
        reindex = 1
        print("reindex true")
    elif argument_count > 2:
        sys.stderr.write("Usage: %s [PATH_TO_DEBTAGS_DATABASE]\n" %
                         sys.argv[0])
        sys.exit(1)

    debtags_db = load_debtags_db(DB_PATH)
    installed_pkgs = get_system_pkgs()
    best_tags = get_most_relevant_tags(debtags_db, installed_pkgs)
    debtags_index = load_debtags_index(debtags_db, reindex)

    # Query the index with the most relevant tags and print the first 20
    # matches accepted by PkgMatchDecider.
    qp = xapian.QueryParser()
    query = qp.parse_query(best_tags)
    enquire = xapian.Enquire(debtags_index)
    enquire.set_query(query)
    mset = enquire.get_mset(0, 20, None, PkgMatchDecider(installed_pkgs))
    for match in mset:
        print("%2d: %s" % (match.rank, match.document.get_data()))
#!/usr/bin/env python
"""
PopconIndexing - A python script to build the popcon xapian index.
"""
__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
__license__ = """
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import sys
sys.path.insert(0, '../')
import logging
import datetime
from config import Config
from data import PopconXapianIndex
from error import Error
if __name__ == '__main__':
    # Time the construction of PopconXapianIndex and log the elapsed time;
    # for the clustering index modes also log medoids and dispersion.
    try:
        cfg = Config()
        begin_time = datetime.datetime.now()
        logging.info("Popcon indexing started at %s", begin_time)
        pxi = PopconXapianIndex(cfg)
        end_time = datetime.datetime.now()
        logging.info("Popcon indexing completed at %s", end_time)
        delta = end_time - begin_time
        # total_seconds() covers the whole interval; the 'seconds' attribute
        # alone leaves out the days component of the timedelta.
        logging.info("Time elapsed: %d seconds.", delta.total_seconds())
        if cfg.index_mode in ("cluster", "recluster"):
            logging.info("Medoids: %d\tDispersion:%f",
                         cfg.k_medoids, pxi.cluster_dispersion)
    except Error:
        logging.critical("Aborting process. Use '--debug' for more details.")
......@@ -61,7 +61,7 @@ class PkgClassificationTests(unittest.TestCase):
pkgs = {'vim': 'EX'}
debtags_name = ['devel::editor', 'implemented-in::c',
'devel::interpreter', 'devel::lang:python']
terms_name = ['contain', 'syntax', 'python']
terms_name = ['vim', 'editor', 'python']
assert_pkgs_classification = {'vim': [1, 1, 0, 0, 1, 1, 0, 'EX']}
......
This diff is collapsed.
sudo apt-get install python-xapian python-cluster python-simplejson python-numpy apt-xapian-index debtags -y
sudo update-apt-xapian-index
#!/usr/bin/env python
import commands
from load_data import get_folder_path, get_all_folders_path
def get_cross_validations_path(folders_path):
    """ Return the paths of the cross-validation result files.

    folders_path -- iterable of folder paths; each file name is appended
                    directly to its folder path, so the paths are expected
                    to end with '/'.

    Every entry whose name starts with 'cross_validation_result' is
    returned, folders in the given order and names sorted within a folder.
    Folders that cannot be listed contribute no files.
    """
    import os  # imported locally so the block does not rely on file imports

    files = []
    for folder_path in folders_path:
        try:
            # Listing through the OS instead of a shell command keeps paths
            # with spaces or shell metacharacters working.
            entries = sorted(os.listdir(folder_path))
        except OSError:
            # An unreadable folder used to yield no matching names either.
            continue
        files += [folder_path + name for name in entries
                  if name.startswith('cross_validation_result')]
    return files
def get_metrics_values(files_path):
    """ Collect the metric values found in the given result files.

    Each file is scanned for lines of the form '<metric>:<value>' where
    <metric> is one of 'S_Accuracy', 'Precision', 'Recall', 'FPR' or
    'F(1.0)'. The value (the text between the first and second ':') is
    converted to float.

    Returns a dict mapping each metric name to the list of its values in
    reading order. Lines without a ':' or with an empty value are ignored.
    """
    metrics_values = {'S_Accuracy': [], 'Precision': [], 'Recall': [],
                      'FPR': [], 'F(1.0)': []}
    for file_path in files_path:
        # Text mode: the content is parsed as strings.
        with open(file_path, 'r') as text:
            for line in text:
                fields = line.strip().split(':')
                metric = fields[0].strip()
                # len(fields) > 1 guards a metric name that appears on a
                # line of its own, without any ':'.
                if (metric in metrics_values and len(fields) > 1 and
                        fields[1]):
                    metrics_values[metric].append(float(fields[1]))
    return metrics_values
def convert_to_csv(metrics_values):
    """ Turn a dict of metric name -> list of values into CSV rows.

    The first row holds the metric names joined by ';'. It is followed by
    one row per index of the first metric's value list, each holding the
    values of all metrics at that index joined by ';'.
    """
    metric_names = list(metrics_values.keys())
    rows = [';'.join(metric_names)]
    # The number of data rows is dictated by the first metric's list.
    row_count = len(metrics_values[metric_names[0]])
    for position in range(row_count):
        cells = [str(metrics_values[name][position])
                 for name in metric_names]
        rows.append(';'.join(cells))
    return rows
def main():
    """ Print, as ';'-separated rows, the metrics of every cross-validation
    result file found under the folder given on the command line.
    """
    root_folder = get_folder_path()
    result_files = get_cross_validations_path(
        get_all_folders_path(root_folder))
    for csv_row in convert_to_csv(get_metrics_values(result_files)):
        print(csv_row)


if __name__ == '__main__':
    main()
import commands
import os
import sys
def get_folder_path():
    """ Return the folder path given as first command line argument.

    '~' is expanded and a trailing '/' is appended when missing. If the
    argument is absent or the folder does not exist, a usage message is
    written to stderr and the program exits with status 1.
    """
    usage_message = "Usage: {} [folder_path]".format(sys.argv[0])
    if len(sys.argv) < 2:
        # Diagnostics go to stderr so they never mix with data on stdout.
        sys.stderr.write(usage_message + "\n")
        sys.exit(1)

    folder_path = os.path.expanduser(sys.argv[1])
    if not folder_path.endswith('/'):
        folder_path += '/'

    if not os.path.exists(folder_path):
        sys.stderr.write(usage_message + "\n")
        sys.stderr.write("Folder does not exist\n")
        sys.exit(1)
    return folder_path
def get_all_folders_path(folder_path):
    """ Return the paths of the log folders found inside 'folder_path'.

    folder_path -- path of the containing folder; entry names are appended
                   directly to it, so it is expected to end with '/'.

    Every entry whose name starts with 'app_recommender_log' is returned as
    '<folder_path><name>/', sorted by name. An unreadable folder yields an
    empty list.
    """
    try:
        # Listing through the OS instead of a shell command keeps paths
        # with spaces or shell metacharacters working.
        entries = sorted(os.listdir(folder_path))
    except OSError:
        # An unreadable folder used to yield no matching names either.
        return []
    return ["{}{}/".format(folder_path, entry) for entry in entries
            if entry.startswith('app_recommender_log')]
def get_csv_file_path():
    """ Return the CSV file path given as first command line argument.

    '~' is expanded. If the argument is absent or the file does not exist,
    a usage message is written to stderr and the program exits with
    status 1.
    """
    usage_message = "Usage: {} [csv_file_path]".format(sys.argv[0])
    if len(sys.argv) < 2:
        # Diagnostics go to stderr so they never mix with data on stdout.
        sys.stderr.write(usage_message + "\n")
        sys.exit(1)

    csv_file_path = os.path.expanduser(sys.argv[1])
    if not os.path.exists(csv_file_path):
        sys.stderr.write(usage_message + "\n")
        sys.stderr.write("CSV file does not exist\n")
        sys.exit(1)
    return csv_file_path
def get_lines_from_csv_file(csv_file_path):
    """ Read a ';'-separated file and return its rows.

    Each line is stripped of surrounding whitespace and split on ';'.
    Returns a list with one list of string fields per line.
    """
    # Text mode: the lines are handled as strings, not raw bytes.
    with open(csv_file_path, 'r') as text:
        return [line.strip().split(';') for line in text]
#!/usr/bin/env python
import commands
from load_data import get_folder_path, get_all_folders_path
def load_user_preferences(folder_path):
    """ Load the user's package ratings from a log folder.

    folder_path -- folder path ending with '/'; the file
                   '<folder_path>user_preferences.txt' is read.

    Each non-blank line is expected to look like '<package>:<rating>'.
    Returns a dict mapping package name to the rating as int. Blank lines
    are skipped.
    """
    preferences_file = "{}user_preferences.txt".format(folder_path)
    user_preferences = {}
    # Text mode: the lines are handled as strings, not raw bytes.
    with open(preferences_file, 'r') as text:
        for line in text:
            line = line.strip()
            if not line:
                # e.g. a trailing empty line at the end of the file
                continue
            fields = line.split(':')
            user_preferences[fields[0]] = int(fields[1])
    return user_preferences
def load_strategies(folder_path):
    """ Load the recommendations of every strategy logged in a folder.

    folder_path -- folder path ending with '/'.

    Every file named '<strategy>_recommendation.txt' is read. Returns a
    dict mapping the strategy name to the list of stripped lines of its
    file. An unreadable folder yields an empty dict.
    """
    import os  # imported locally so the block does not rely on file imports

    suffix = '_recommendation.txt'
    try:
        # Listing through the OS instead of a shell command keeps paths
        # with spaces or shell metacharacters working.
        entries = sorted(os.listdir(folder_path))
    except OSError:
        # An unreadable folder used to yield no matching names either.
        entries = []

    strategies = {}
    for file_name in entries:
        if not file_name.endswith(suffix):
            continue
        # Strip only the suffix: cutting at the first '_' would truncate
        # strategy names that themselves contain an underscore.
        strategy = file_name[:-len(suffix)]
        with open(folder_path + file_name, 'r') as text:
            strategies[strategy] = [line.strip() for line in text]
    return strategies
def load_pc_informations(folder_path):
    """ Load distributor id and codename from a folder's informations file.

    folder_path -- folder path ending with '/'.

    The first entry (by name) ending with 'informations.txt' is read. For
    every line containing ':', the text before the first ':' is lower-cased
    with spaces replaced by '_'; when that gives 'distributor_id' or
    'codename', the stripped text up to the next ':' is stored under it.

    Returns the resulting dict. Raises IndexError when the folder holds no
    such file.
    """
    import os  # imported locally so the block does not rely on file imports

    try:
        # Listing through the OS instead of a shell command keeps paths
        # with spaces or shell metacharacters working.
        entries = sorted(os.listdir(folder_path))
    except OSError:
        entries = []
    files = [name for name in entries if name.endswith('informations.txt')]
    pc_informations_file = '{}{}'.format(folder_path, files[0])

    valid_info = {'distributor_id', 'codename'}
    informations = {}
    # Text mode: the lines are handled as strings, not raw bytes.
    with open(pc_informations_file, 'r') as text:
        for line in text:
            if ':' not in line:
                continue
            info = line.split(':')
            key = info[0].lower().replace(' ', '_')
            if key in valid_info:
                informations[key] = info[1].strip()
    return informations
def get_strategies_score(strategies, user_preferences):
    """ Count, per strategy, how the user classified its packages.

    strategies       -- dict mapping strategy name to a list of packages
    user_preferences -- dict mapping package name to a rating from 1 to 4

    Ratings 1, 2, 3 and 4 stand for 'bad', 'redundant', 'useful' and
    'useful_surprise'. Returns a dict mapping each strategy name to a dict
    with the number of its packages in each of those classifications.
    """
    labels = {1: 'bad', 2: 'redundant', 3: 'useful', 4: 'useful_surprise'}
    scores = {}
    for name, pkgs in strategies.items():
        tally = dict.fromkeys(
            ('bad', 'redundant', 'useful', 'useful_surprise'), 0)
        for pkg in pkgs:
            rating = user_preferences[pkg]
            tally[labels[rating]] += 1
        scores[name] = tally
    return scores
def print_strategies_score(strategies_score):
# Print each strategy's name followed by its count for every classification
# ('bad', 'redundant', 'useful', 'useful_surprise'), reading the counts from
# the per-strategy dicts stored in 'strategies_score'.
# NOTE(review): the indentation of this block is missing in this copy, so it
# cannot be seen here whether the final print of '\n' runs once per strategy
# or once after the loop -- confirm against the original file before
# restructuring.
classifications = ['bad', 'redundant', 'useful', 'useful_surprise']
for strategy, score in strategies_score.iteritems():
print "\nStrategy: {}".format(strategy)
for classification in classifications:
print " {}: {}".format(classification, score[classification])
print '\n'
def get_all_strategies_score(all_folders_path):
    """ Return the strategies score of every folder, in the given order.

    For each folder its strategies and user preferences are loaded and
    scored with get_strategies_score().
    """
    return [get_strategies_score(load_strategies(folder),
                                 load_user_preferences(folder))
            for folder in all_folders_path]
def get_all_pc_informations(all_folders_path):
    """ Return the pc informations of every folder, in the given order. """
    return [load_pc_informations(folder) for folder in all_folders_path]
def convert_to_csv(all_strategies_score, all_pc_informations):
    """ Turn strategies scores and pc informations into CSV rows.

    all_strategies_score -- list of dicts, strategy name -> dict of
                            classification -> count
    all_pc_informations  -- list of dicts with the pc informations that
                            belong to the score at the same position

    The first row is the header: '<strategy>_<classification>' for the
    sorted strategies of the first score and the four classifications,
    followed by the sorted keys of the first pc informations. Each further
    row holds the counts of one score (strategies sorted by name) followed
    by its pc informations in header order; a missing information is left
    empty. Fields are joined by ';'.

    Returns the list of rows, or an empty list when there are no scores.
    """
    if not all_strategies_score:
        return []

    classifications = ['bad', 'redundant', 'useful', 'useful_surprise']
    pc_info_header = (sorted(all_pc_informations[0])
                      if all_pc_informations else [])

    header = ['{}_{}'.format(strategy, classification)
              for strategy in sorted(all_strategies_score[0])
              for classification in classifications]
    header.extend('{}'.format(info) for info in pc_info_header)
    rows = [';'.join(header)]

    for position, strategies_score in enumerate(all_strategies_score):
        cells = [strategies_score[strategy][classification]
                 for strategy in sorted(strategies_score)
                 for classification in classifications]
        if position < len(all_pc_informations):
            pc_informations = all_pc_informations[position]
            # Follow the header order so values always sit under their
            # own column name.
            cells.extend(pc_informations.get(info, '')
                         for info in pc_info_header)
        rows.append(';'.join(str(cell) for cell in cells))
    return rows
def main():
    """ Print, as ';'-separated rows, the strategies score and the pc
    informations of every log folder found under the folder given on the
    command line.
    """
    log_folders = get_all_folders_path(get_folder_path())
    scores = get_all_strategies_score(log_folders)
    pc_informations = get_all_pc_informations(log_folders)
    for csv_row in convert_to_csv(scores, pc_informations):
        print(csv_row)


if __name__ == '__main__':
    main()
#!/usr/bin/env python
import matplotlib.pyplot as plt
import numpy as np
from load_data import get_csv_file_path, get_lines_from_csv_file
def plot_cross_validation_averages(metrics_values):