Commit ee92b468 authored by MessikommerNico

Merge branch 'feature/KFoldPrediction' into 'master'

Added KFold validation experiments and test experiments / Added KFold prediction for visualisation

See merge request analytics-club/hack4good/fs19/team-2!14
parents 2046ab94 ca5c4e81
@@ -24,7 +24,7 @@ class RandomForest:
"""Initialises a new model. An already existing random forest model will be overwritten"""
self.model = sklearn.ensemble.RandomForestClassifier(n_estimators=50,
n_jobs=-1,
verbose=1,
verbose=0,
class_weight='balanced')
def saveModel(self, time_name):
@@ -66,7 +66,59 @@ def loadData(feature_target_list, nr_target_variables, categorial_list, need_thr
return one_hot_data, clean_data_input
def runExperiment(X_train, X_test, y_train, y_test, list_features, target_variable):
def runExperiment(X_train, X_test, y_train):
"""Fits and predicts a random forest model"""
forest_classifier = RandomForest()
forest_classifier.initialiseModel()
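# time the model fit and the prediction separately so both can be reported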
start_train_time = time.time()
forest_classifier.model.fit(X_train, y_train)
training_time = time.time() - start_train_time
print('Training Time: {0:.2f}'.format(training_time))
start_test_time = time.time()
y_test_predicted = forest_classifier.model.predict(X_test)
predict_time = time.time() - start_test_time
# forest_classifier.saveModel(time_name=time.strftime("%Y%m%d-%H%M%S").time)
return y_test_predicted, training_time, predict_time
def runKFTraining(X_train, y_train, list_features, target_variable):
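"""Runs 5-fold cross-validation on the training data and stores a summary report of the out-of-fold results"""
# shuffled folds with a fixed random_state keep the split reproducible across runs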
kf = sklearn.model_selection.KFold(n_splits=5, shuffle=True, random_state=1234)
y_predicted_dataset = np.empty([0])
y_ground_truth_dataset = np.empty([0])
training_time_dataset = np.empty([0])
predict_time_dataset = np.empty([0])
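# accumulate out-of-fold predictions, ground truth and timings across the folds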
for train_idx, val_idx in kf.split(X_train):
output = runExperiment(X_train[train_idx, :], X_test=X_train[val_idx, :], y_train=y_train[train_idx])
y_test_predicted, training_time, predict_time = output
y_predicted_dataset = np.concatenate((y_predicted_dataset, y_test_predicted))
y_ground_truth_dataset = np.concatenate((y_ground_truth_dataset, y_train[val_idx]))
# collect the per-fold timings as well, otherwise the report averages below are taken over empty arrays
training_time_dataset = np.append(training_time_dataset, training_time)
predict_time_dataset = np.append(predict_time_dataset, predict_time)
# ---- Create Writer for Storing a Summary of the experiment ----
report_writer = experiment_report.ReportWriter(path_dir=os.path.join('results', 'models', 'random_forest'))
report_writer.person_name = 'Nico'
report_writer.workstation = 'Lenovo-ThinkPad-T460p'
report_writer.model = 'Random Forest / KFold'
# ---- Save Results ----
report_writer.target_variable = target_variable
report_writer.feature_list = list_features
report_writer.nr_train_samples = X_train.shape[0]
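# across the folds every training sample is predicted exactly once, so the test count equals the train count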
report_writer.nr_test_samples = X_train.shape[0]
report_writer.time = time.strftime("%Y%m%d-%H%M%S")
report_writer.time_training = np.mean(training_time_dataset)
report_writer.time_predict = np.mean(predict_time_dataset)
report_writer.saveReport(y_predicted=y_predicted_dataset, y_ground_truth=y_ground_truth_dataset)
def runTestExperiment(X_train, X_test, y_train, y_test, list_features, target_variable):
"""Fits a random forest on the training split and evaluates it on the held-out test split"""
# ---- Create Writer for Storing a Summary of the experiment ----
report_writer = experiment_report.ReportWriter(path_dir=os.path.join('results', 'models', 'random_forest'))
@@ -100,18 +152,16 @@ def runExperiment(X_train, X_test, y_train, y_test, list_features, target_variab
report_writer.saveReport(y_predicted=y_test_predicted, y_ground_truth=y_test)
# forest_classifier.saveModel(time_name=report_writer.time)
return sklearn.metrics.accuracy_score(y_test, y_test_predicted)
def runMultipleExeriments(feature_list, target_prediction, categorial_list, need_threshold):
def runMultipleExeriments(feature_list, target_prediction, categorial_list, need_threshold, parameter_tuning=True):
"""Runs multiple experiments for the specified target_predictions and saves the result to a .csv file"""
# ---- Load and Split Data ----
data_input, _ = loadData(feature_list + target_prediction, len(target_prediction), categorial_list, need_threshold)
random_state = 42
random_state = 1234
nr_target_predictions = len(target_prediction)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(data_input[:, :-nr_target_predictions],
data_input[:, -nr_target_predictions:],
test_size=0.33,
test_size=0.2,
random_state=random_state,
shuffle=True)
@@ -122,24 +172,56 @@ def runMultipleExeriments(feature_list, target_prediction, categorial_list, need
# 'Shelter_pred', 'Livelihood_pred', 'Recovery_pred', 'Health_pred', 'Nutrition_pred', 'Education_pred',
# 'Protection_pred']
# input_dataframe = pd.DataFrame(np.random.randint(0, 2, size=(100, len(columns))), columns=columns)
# out_dir = 'results/final_evaluation'
# statistics.createStats(input_dataframe, out_dir=out_dir, model='random_forest')
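# parameter_tuning=True runs the K-fold validation per target variable; otherwise a single train/test evaluation is stored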
if parameter_tuning:
for target_idx in range(nr_target_predictions):
runKFTraining(X_train, y_train[:, target_idx], feature_list, target_prediction[target_idx])
else:
for target_idx in range(nr_target_predictions):
runTestExperiment(X_train, X_test, y_train[:, target_idx], y_test[:, target_idx],
feature_list, target_prediction[target_idx])
def runPredictionKFold(feature_list, target_prediction, categorial_list, need_threshold):
"""Generates out-of-fold predictions for every sector and joins them per household"""
data_input, clean_data = loadData(feature_list + target_prediction, len(target_prediction), categorial_list,
need_threshold)
parent_index = clean_data['_parent_index'].values
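# keep the household identifier next to the features so fold predictions can be mapped back to the raw rows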
ext_data_input = np.concatenate((parent_index[:, np.newaxis], data_input), axis=1)
nr_target = len(target_prediction)
output_prediction_df = pd.DataFrame(data=parent_index, columns=['_parent_index'])
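# start from the identifier column and merge one prediction column per sector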
for target_idx, sector_name in enumerate(target_prediction):
print('Predicting Sector %s' % sector_name)
sector_df = fitKFold(ext_data_input[:, :-nr_target], ext_data_input[:, target_idx - nr_target], sector_name)
output_prediction_df = output_prediction_df.merge(sector_df, how='outer', on='_parent_index')
# after the loop, hand back the assembled per-household predictions
return output_prediction_df
def fitKFold(x_data, y_gt, sector_name):
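"""Returns a dataframe with out-of-fold predictions for one sector; column 0 of x_data must hold the _parent_index"""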
kf = sklearn.model_selection.KFold(n_splits=5, shuffle=True, random_state=1234)
forest_classifier = RandomForest()
forest_classifier.initialiseModel()
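# the same classifier instance is refit from scratch on every fold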
pin_predictions = np.empty([0])
parent_index = np.empty([0])
nr_samples = x_data.shape[0]
df = pd.DataFrame(index=range(nr_samples), columns=['_parent_index', sector_name + '_pred'])
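# train on the remaining folds, predict the held-out fold and remember which households it contained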
for train_idx, val_idx in kf.split(x_data):
forest_classifier.model.fit(x_data[train_idx, 1:], y_gt[train_idx])
output_predicted = forest_classifier.model.predict(x_data[val_idx, 1:])
- score_list = []
- for target in range(nr_target_predictions):
- score = runExperiment(X_train, X_test, y_train[:, target], y_test[:, target],
- feature_list, target_prediction[target])
- score_list.append('{0:.4f}'.format(score))
pin_predictions = np.concatenate((pin_predictions, output_predicted), axis=0)
parent_index = np.concatenate((parent_index, x_data[val_idx, 0]))
df[sector_name + '_pred'] = pin_predictions
df['_parent_index'] = parent_index
# def runPredictionKFold(feature_list, target_prediction, categorial_list, need_threshold):
# feature_list = feature_list
# data_input, clean_data = loadData(feature_list + target_prediction, len(target_prediction), categorial_list,
# need_threshold)
# print(clean_data)
# print(data_input[:10, 0])
return df
def main():
@@ -174,9 +256,9 @@ def main():
'HeadOfHouseholdMaritalStatus', 'TribeOfHousehold']
need_threshold = 4
# runMultipleExeriments(feature_list, target_prediction, categorial_list, need_threshold)
runMultipleExeriments(feature_list, target_prediction, categorial_list, need_threshold, parameter_tuning=True)
runPredictionKFold(feature_list, target_prediction, categorial_list, need_threshold)
# runPredictionKFold(feature_list, target_prediction, categorial_list, need_threshold)
if __name__ == "__main__":