Commit 0b241896 authored by MessikommerNico's avatar MessikommerNico

Added evaluation script

parent 70182cf2
......@@ -6,14 +6,12 @@
# - Reading in the first geotif_dataset (clean geotif_dataset final)
# - Calculating the PiN based on Annex 6 (pg. 124) of Nigeria's reach report (see Excel PiN_calculation_details.xlsx for more details)
# In[1]:
import numpy as np
import pandas as pd
# In[2]:
def letter2num(letters, zbase=True):
......@@ -29,7 +27,6 @@ def letter2num(letters, zbase=True):
return res - 1
# In[3]:
def calculate_PiN(data=None, source_dir="../../data/raw"):
# data = pd.ExcelFile('/Users/Vicky/Documents/Documents/Studium/IMPACT project/Data/reach_nga_msna_clean_dataset_final.xlsx')
......@@ -38,9 +35,6 @@ def calculate_PiN(data=None, source_dir="../../data/raw"):
sheets = data.sheet_names
# In[4]:
# Put each excel sheet in a separate dataframe
readme = data.parse(0)
......@@ -163,7 +157,7 @@ def calculate_PiN(data=None, source_dir="../../data/raw"):
0.5 * hh_data.iloc[:,letter2num('NJ')].values +
0.5 * hh_data.iloc[:,letter2num('NK')].values )
PiN_livelihood = (3. * (FCS<21.) +
PiN_food = (3. * (FCS<21.) +
2. * ((FCS>21.) & (FCS<35.)) )
# HH has a high use on reduced Coping Strategy Index
......@@ -174,95 +168,97 @@ def calculate_PiN(data=None, source_dir="../../data/raw"):
3. * hh_data.iloc[:,letter2num('NP')].values +
1. * hh_data.iloc[:,letter2num('NQ')].values )
PiN_livelihood = ( PiN_livelihood +
PiN_food = ( PiN_food +
3. * (rCSI>9.99) )
# HH reports using unsafe/unsustainable fuel for cooking
PiN_livelihood = ( PiN_livelihood +
PiN_food = ( PiN_food +
0.33 * ((hh_data.iloc[:,letter2num('OY')].values == 'Firewood') |
(hh_data.iloc[:,letter2num('OY')].values == 'Animal dung') |
(hh_data.iloc[:,letter2num('OY')].values == 'Agricultural waste / crop residue')) )
# HH reports using unsafe/unsustainable fuel for lighting
PiN_livelihood = ( PiN_livelihood +
PiN_food = ( PiN_food +
0.33 * ((hh_data.iloc[:,letter2num('PA')].values == 'Firewood') |
(hh_data.iloc[:,letter2num('PA')].values == 'None')) )
# HH reports using unsafe/unsustainable method for cooking
PiN_livelihood = ( PiN_livelihood +
PiN_food = ( PiN_food +
0.33 * (hh_data.iloc[:,letter2num('PS')].values == 'Three-stone fire') )
# HH reports unsafe/ unsustainable means of obtaining primary fuel source
PiN_livelihood = ( PiN_livelihood +
PiN_food = ( PiN_food +
0.33 * ((hh_data.iloc[:,letter2num('PC')].values == 'Collect directly from outside the community') |
(hh_data.iloc[:,letter2num('PC')].values == 'Trade goods or items for fuel') |
(hh_data.iloc[:,letter2num('PC')].values == 'From NGO aid / assistance')) )
# HH reports resorting to negative fuel coping strategies
PiN_livelihood = ( PiN_livelihood +
PiN_food = ( PiN_food +
2. * ((hh_data.iloc[:,letter2num('PX')].values == 'Yes') |
(hh_data.iloc[:,letter2num('QH')].values == 'Yes')) )
PiN_livelihood = ( PiN_livelihood +
PiN_food = ( PiN_food +
0.33 * ((hh_data.iloc[:,letter2num('PZ')].values == 'Yes') |
(hh_data.iloc[:,letter2num('QD')].values == 'Yes') |
(hh_data.iloc[:,letter2num('QF')].values == 'Yes') |
(hh_data.iloc[:,letter2num('QG')].values == 'Yes') ) )
# HH reports no access to markets
PiN_livelihood = 1. * (hh_data.iloc[:,letter2num('OF')].values == 2.) # '2' = 'No'
PiN_food = ( PiN_food +
1. * (hh_data.iloc[:,letter2num('OF')].values == 2.) ) # '2' = 'No'
# HH reports market-related barriers to accessing food items
PiN_livelihood = ( PiN_livelihood +
PiN_food = ( PiN_food +
1. * (hh_data.iloc[:,letter2num('OG')].values == 'No') )
# HH was reportedly not able to plant / harvest last dry season
PiN_livelihood = ( PiN_livelihood +
PiN_food = ( PiN_food +
0.5 * ((hh_data.iloc[:,letter2num('UB')].values == "Didn't plant or harvest") |
(hh_data.iloc[:,letter2num('UB')].values == 'Planted but did not harvest anything')) )
# HH reports not planning to cultivate this rainy season
PiN_livelihood = ( PiN_livelihood +
PiN_food = ( PiN_food +
0.5 * (hh_data.iloc[:,letter2num('UC')].values == 'No, will not plant or harvest this rainy season') )
# HH reports not accessing: amount of land needed / land at all
PiN_livelihood = ( PiN_livelihood +
PiN_food = ( PiN_food +
0.5 * (hh_data.iloc[:,letter2num('UG')].values == 'No, did not access any land') )
PiN_livelihood = ( PiN_livelihood +
PiN_food = ( PiN_food +
0.25 * (hh_data.iloc[:,letter2num('UG')].values == 'Yes, but did not access amount of land needed') )
# HH reports not accessing: amount of water needed / water at all
PiN_livelihood = ( PiN_livelihood +
PiN_food = ( PiN_food +
0.5 * (hh_data.iloc[:,letter2num('UJ')].values == 'No, did not access any water') )
PiN_livelihood = ( PiN_livelihood +
PiN_food = ( PiN_food +
0.25 * (hh_data.iloc[:,letter2num('UJ')].values == 'Yes, but did not access amount of water needed') )
PiN_livelihood[HH_no_interview] = np.nan
print('PiN_livelihood', PiN_livelihood)
print(np.nansum(PiN_livelihood))
PiN_food[HH_no_interview] = np.nan
print('PiN_food', PiN_food)
print(np.nansum(PiN_food))
# =================== EARLY RECOVERY AND LIVELIHOODS ============================
# HH income has decreased in the previous 3 months
PiN_recovery = 2. * (hh_data.iloc[:,letter2num('TW')].values == 'Decrease')
PiN_livelihood = 2. * (hh_data.iloc[:,letter2num('TW')].values == 'Decrease')
# HH reports being in debt
PiN_recovery = ( PiN_recovery +
PiN_livelihood = ( PiN_livelihood +
2. * (hh_data.iloc[:,letter2num('TX')].values == 1.) ) # '1' = 'Yes'
# HH reports using “crisis” or “emergency” coping strategies
PiN_recovery = ( PiN_recovery +
PiN_livelihood = ( PiN_livelihood +
3. * ( (hh_data.iloc[:,letter2num('RO'):(letter2num('RX')+1)].values == 'Yes').any(axis=1) ) )
# HH reports no access to physical cash
PiN_recovery = ( PiN_recovery +
PiN_livelihood = ( PiN_livelihood +
3. * (hh_data.iloc[:,letter2num('TY')].values == 'No access to cash') )
PiN_recovery[HH_no_interview] = np.nan
print('PiN_recovery', PiN_recovery)
print(np.nansum(PiN_recovery))
PiN_livelihood[HH_no_interview] = np.nan
print('PiN_livelihood', PiN_livelihood)
print(np.nansum(PiN_livelihood))
# ====================== NUTRITION & individual-related EDUCATION/HEALTH =========================
......@@ -271,10 +267,10 @@ def calculate_PiN(data=None, source_dir="../../data/raw"):
# Household has children that have never attended any formal school
# HH has child/ren without any immunization
PiN_nutrition = np.zeros(PiN_recovery.shape)
PiN_school_attending = np.zeros(PiN_recovery.shape)
PiN_school_never_attended = np.zeros(PiN_recovery.shape)
PiN_vaccination = np.zeros(PiN_recovery.shape)
PiN_nutrition = np.zeros(PiN_livelihood.shape)
PiN_school_attending = np.zeros(PiN_livelihood.shape)
PiN_school_never_attended = np.zeros(PiN_livelihood.shape)
PiN_vaccination = np.zeros(PiN_livelihood.shape)
# Fetch info about the individual members in the HH
for iCol in range(len(PiN_nutrition)) :
......@@ -408,15 +404,15 @@ def calculate_PiN(data=None, source_dir="../../data/raw"):
PiN_array = np.stack((hh_data.loc[:, '_parent_index'],
PiN_wash,
PiN_shelter_NFI,
PiN_food,
PiN_livelihood,
PiN_recovery,
PiN_health,
PiN_nutrition,
PiN_education,
PiN_protection), axis=1)
df_PiN = pd.DataFrame( PiN_array,
columns=['_parent_index','Wash', 'Shelter', 'Livelihood', 'Recovery', 'Health',
columns=['_parent_index','Wash', 'Shelter', 'Food', 'Livelihood', 'Health',
'Nutrition', 'Education', 'Protection'] )
# Calculate the total PiN, if some sector-specific PiN are nan the result will be nan
......
......@@ -197,7 +197,7 @@ class clean_table:
return
def geographical_PiN(self, human):
colnames_ward = ['Ward', 'Wash', 'Shelter', 'Livelihood', 'Recovery', 'Health', 'Nutrition', 'Education',
colnames_ward = ['Ward', 'Wash', 'Shelter', 'Food', 'Livelihood', 'Health', 'Nutrition', 'Education',
'Protection', 'Overall']
wards = self.geo_helper(human, colnames_ward)
......@@ -206,7 +206,7 @@ class clean_table:
colnames_ward[0] = "Ward"
wards.columns = colnames_ward
colnames_village = ['Village', 'Wash', 'Shelter', 'Livelihood', 'Recovery', 'Health', 'Nutrition', 'Education',
colnames_village = ['Village', 'Wash', 'Shelter', 'Food', 'Livelihood', 'Health', 'Nutrition', 'Education',
'Protection', 'Overall']
village = self.geo_helper(human, colnames_village)
......
......@@ -5,12 +5,14 @@ import os
import time
import pickle
import numpy as np
import pandas as pd
import sklearn.ensemble
import sklearn.preprocessing
import sklearn.model_selection
import src.cleanup.cleanup as clp
import src.utils.experiment_report as experiment_report
import src.utils.evaluation_statistics as statistics
class RandomForest:
......@@ -45,10 +47,10 @@ def loadData(feature_target_list, nr_target_variables, categorial_list, need_thr
in_directory=os.path.join('data', 'raw'),
out_directory=os.path.join('data', 'processed'),
load_tif_files=True)
data_input = clean.table
data_input = data_input.drop_duplicates(subset='HouseholdID', keep='first').dropna(subset=['RespondentSex'])
data_input = data_input.dropna(subset=['literacy'])
data_input = data_input[feature_target_list]
clean_data_input = clean.table
clean_data_input = clean_data_input.drop_duplicates(subset='HouseholdID', keep='first').dropna(subset=['RespondentSex'])
clean_data_input = clean_data_input.dropna(subset=['literacy'])
data_input = clean_data_input[feature_target_list]
for key in categorial_list:
data_input[key] = data_input[key].astype("category").cat.codes
......@@ -61,7 +63,7 @@ def loadData(feature_target_list, nr_target_variables, categorial_list, need_thr
one_hot_data[one_hot_data[:, -target] < need_threshold, -target] = 0
one_hot_data[one_hot_data[:, -target] >= need_threshold, -target] = 1
return one_hot_data
return one_hot_data, clean_data_input
def runExperiment(X_train, X_test, y_train, y_test, list_features, target_variable):
......@@ -104,7 +106,7 @@ def runExperiment(X_train, X_test, y_train, y_test, list_features, target_variab
def runMultipleExeriments(feature_list, target_prediction, categorial_list, need_threshold):
"""Runs multiple experiments for the specified target_predictions and saves the result to a .csv file"""
# ---- Load and Split Data ----
data_input = loadData(feature_list + target_prediction, len(target_prediction), categorial_list, need_threshold)
data_input, _ = loadData(feature_list + target_prediction, len(target_prediction), categorial_list, need_threshold)
random_state = 42
nr_target_predictions = len(target_prediction)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(data_input[:, :-nr_target_predictions],
......@@ -113,6 +115,18 @@ def runMultipleExeriments(feature_list, target_prediction, categorial_list, need
random_state=random_state,
shuffle=True)
# Create dummy dataframe
# columns = ['parent_index', 'Wash', 'Shelter', 'Livelihood', 'Recovery', 'Health', 'Nutrition', 'Education',
# 'Protection', 'Wash_in_need', 'Shelter_in_need', 'Livelihood_in_need', 'Recovery_in_need',
# 'Health_in_need', 'Nutrition_in_need', 'Education_in_need', 'Protection_in_need', 'Wash_pred',
# 'Shelter_pred', 'Livelihood_pred', 'Recovery_pred', 'Health_pred', 'Nutrition_pred', 'Education_pred',
# 'Protection_pred']
# input_dataframe = pd.DataFrame(np.random.randint(0, 2, size=(100, len(columns))), columns=columns)
# out_dir = 'results/final_evaluation'
# statistics.createStats(input_dataframe, out_dir=out_dir, model='random_forest')
score_list = []
for target in range(nr_target_predictions):
score = runExperiment(X_train, X_test, y_train[:, target], y_test[:, target],
......@@ -120,6 +134,14 @@ def runMultipleExeriments(feature_list, target_prediction, categorial_list, need
score_list.append('{0:.4f}'.format(score))
# def runPredictionKFold(feature_list, target_prediction, categorial_list, need_threshold):
# feature_list = feature_list
# data_input, clean_data = loadData(feature_list + target_prediction, len(target_prediction), categorial_list,
# need_threshold)
# print(clean_data)
# print(data_input[:10, 0])
def main():
# Current Features:
# ['PopulationGroup', 'Status', 'RespondentSex', 'HeadOfHouseholdSex',
......@@ -128,15 +150,15 @@ def main():
# 'HouseholdID', 'priority1', 'priority2', 'priority3', 'clusterName',
# 'ShortID', 'Ward', 'Village', 'memberSex', 'memberAgeYears',
# 'memberAgeMonths', 'Camp', 'longitude', 'latitude', 'HH',
# '_parent_index', 'Wash', 'Shelter', 'Livelihood', 'Recovery', 'Health',
# '_parent_index', 'Wash', 'Shelter', 'Food', 'Livelihood', 'Health',
# 'Nutrition', 'Education', 'Protection', 'Overall', 'Ward_mean_Wash',
# 'Ward_mean_Shelter', 'Ward_mean_Livelihood', 'Ward_mean_Recovery',
# 'Ward_mean_Shelter', 'Ward_mean_Food', 'Ward_mean_Livelihood',
# 'Ward_mean_Health', 'Ward_mean_Nutrition', 'Ward_mean_Education',
# 'Ward_mean_Protection', 'Ward_mean_Overall', 'Village_mean_Wash',
# 'Village_mean_Shelter', 'Village_mean_Livelihood',
# 'Village_mean_Recovery', 'Village_mean_Health',
# 'Village_mean_Nutrition', 'Village_mean_Education',
# 'Village_mean_Protection', 'Village_mean_Overall']
# 'Village_mean_Shelter', 'Village_mean_Food', 'Village_mean_Livelihood',
# 'Village_mean_Health', 'Village_mean_Nutrition',
# 'Village_mean_Education', 'Village_mean_Protection',
# 'Village_mean_Overall', 'poverty', 'literacy', 'population']
# feature_list = ['PopulationGroup', 'Status', 'RespondentSex', 'HeadOfHouseholdSex', 'HeadofHouseholdAge',
# 'HeadOfHouseholdMaritalStatus', 'TribeOfHousehold', 'NoOfHouseholdMembers', 'HH']
......@@ -146,14 +168,15 @@ def main():
'HeadOfHouseholdMaritalStatus', 'TribeOfHousehold', 'NoOfHouseholdMembers', 'poverty', 'population',
'literacy']
target_prediction = ['Wash', 'Shelter', 'Livelihood', 'Recovery', 'Health', 'Nutrition',
'Education', 'Protection']
target_prediction = ['Wash', 'Shelter', 'Food', 'Livelihood', 'Health', 'Nutrition', 'Education', 'Protection']
categorial_list = ['PopulationGroup', 'Status', 'RespondentSex', 'HeadOfHouseholdSex',
'HeadOfHouseholdMaritalStatus', 'TribeOfHousehold']
need_threshold = 4
runMultipleExeriments(feature_list, target_prediction, categorial_list, need_threshold)
# runMultipleExeriments(feature_list, target_prediction, categorial_list, need_threshold)
runPredictionKFold(feature_list, target_prediction, categorial_list, need_threshold)
if __name__ == "__main__":
......
import os
import time
import numpy as np
import sklearn.metrics
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
def createStats(input_dataframe, model, out_dir='results/final_evaluation', pin_sectors=None,
                low_threshold=2, high_threshold=6):
    """Write a multi-page PDF with evaluation plots for every PiN sector.

    For each sector the frame must provide three columns: ``<sector>`` (the
    severity value), ``<sector>_in_need`` (ground-truth label) and
    ``<sector>_pred`` (predicted label).

    Args:
        input_dataframe: Frame holding the columns described above.
        model: Model name, used as the prefix of the output file name.
        out_dir: Directory the report is written to; created if missing.
        pin_sectors: Sector names to evaluate. Defaults to the list below.
        low_threshold: Passed to ``plotSeverityStatistics`` as lower cut-off.
        high_threshold: Passed to ``plotSeverityStatistics`` as upper cut-off.
    """
    if pin_sectors is None:
        # NOTE(review): other parts of the project name the sectors
        # 'Food'/'Livelihood' instead of 'Livelihood'/'Recovery'; pass
        # pin_sectors explicitly when the frame uses the newer names.
        pin_sectors = ['Wash', 'Shelter', 'Livelihood', 'Recovery', 'Health', 'Nutrition', 'Education',
                       'Protection']

    # PdfPages cannot create missing parent directories itself.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    pdf_file_name = os.path.join(out_dir, model + '_' + time.strftime("%Y%m%d-%H%M%S"))

    # The context manager closes the PDF even when one of the plots raises.
    with PdfPages(pdf_file_name) as pdf:
        for sector in pin_sectors:
            print('Calculating score for ' + sector)
            key_in_need = sector + '_in_need'
            key_pred = sector + '_pred'

            data_gt = input_dataframe[key_in_need]
            data_pred = input_dataframe[key_pred]

            plotConfusionF1(data_gt, data_pred, sector, pdf)
            plotSeverityStatistics(input_dataframe[[sector, key_in_need, key_pred]], low_threshold,
                                   high_threshold, sector, pdf)
def plotConfusionF1(data_gt, data_pred, sector, pdf):
    """Append a score table and a confusion-matrix page for one sector to ``pdf``.

    Args:
        data_gt: Ground-truth labels.
        data_pred: Predicted labels, aligned with ``data_gt``.
        sector: Text shown in the 'Sector' row of the table.
        pdf: Open ``PdfPages`` object the pages are appended to.
    """
    # The table row is labelled 'F1-Score', so the F1 metric (not plain
    # accuracy) has to be reported. Computed before the figure is created so
    # that a failing metric does not leave an open figure behind.
    f1_score = sklearn.metrics.f1_score(y_true=data_gt, y_pred=data_pred)

    fig, ax = plt.subplots(figsize=(6.5, 1))
    table_data = [['Sector', sector],
                  ['F1-Score', f1_score]]
    ax.table(cellText=table_data, cellLoc='left', loc='center')
    ax.axis('off')
    # Address the figure explicitly instead of relying on pyplot's current figure.
    pdf.savefig(fig)
    plt.close(fig)

    confusion_matrix = sklearn.metrics.confusion_matrix(y_true=data_gt, y_pred=data_pred)
    plotConfusionMatrix(confusion_matrix, ['Not In Need', 'In Need'], pdf)
def plotSeverityStatistics(sector_data_frame, low_threshold, high_threshold, sector, pdf):
    """Plot score and confusion matrix for the rows with an extreme first-column value.

    Column 0 of ``sector_data_frame`` is compared against the thresholds;
    columns 1 and 2 are forwarded to ``plotConfusionF1`` as ground truth and
    prediction respectively.
    """
    severity = sector_data_frame.iloc[:, 0]
    # NOTE(review): the title below announces closed intervals while this
    # filter is strict (< and >) - confirm which one is intended.
    is_extreme = (severity < low_threshold) | (severity > high_threshold)
    data_frame_adjusted = sector_data_frame[is_extreme]

    sector = sector + ''.join([' Household with a PiN [0, ', str(low_threshold),
                               '] and [', str(high_threshold), ', 8]'])
    ground_truth = data_frame_adjusted.iloc[:, 1]
    prediction = data_frame_adjusted.iloc[:, 2]
    plotConfusionF1(ground_truth, prediction, sector, pdf)
def plotConfusionMatrix(confusion_matrix, classes, pdf, cmap=plt.cm.Blues):
    """
    Plots a row-normalised confusion matrix and appends it as a page to ``pdf``.
    Taken from "https://scikit-learn.org/stable/auto_examples/model_selection/
    plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py"

    Args:
        confusion_matrix: 2-D array of counts (rows: true label, columns: predicted label).
        classes: Tick labels used for both axes.
        pdf: Open ``PdfPages`` object the page is appended to.
        cmap: Matplotlib colormap used for the heat map.

    Returns:
        The matplotlib axes the matrix was drawn on.
    """
    counts = np.asarray(confusion_matrix, dtype=float)
    row_totals = counts.sum(axis=1, keepdims=True)
    # A true class without any sample has a zero row sum; leave that row at 0
    # instead of dividing by zero and printing NaN cells.
    normalised = np.divide(counts, row_totals, out=np.zeros_like(counts), where=row_totals != 0)

    fig, ax = plt.subplots()
    im = ax.imshow(normalised, interpolation='nearest', cmap=cmap, vmin=0, vmax=1)
    ax.figure.colorbar(im, ax=ax)

    ax.set(xticks=np.arange(normalised.shape[1]),
           yticks=np.arange(normalised.shape[0]),
           xticklabels=classes, yticklabels=classes,
           ylabel='True Label',
           xlabel='Predicted Label')

    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Cells above this share are drawn dark, so their text switches to white.
    thresh = 0.85
    for (i, j), share in np.ndenumerate(normalised):
        ax.text(j, i, format(share, '.2f'),
                ha="center", va="center",
                color="white" if share > thresh else "black")

    fig.tight_layout()
    # Address the figure explicitly instead of relying on pyplot's current figure.
    pdf.savefig(fig)
    plt.close(fig)

    return ax
\ No newline at end of file
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment