4. Master dataframe
Creating the master dataframe by combining water point, demographic and conflict data
We merge water point data with conflict and demographic data at the sub-county level. We end up with more than 100k water points, each with information on whether it is functioning or not, as well as all the regional information described above.
%run /Users/thomasadler/Desktop/futuristic-platipus/capstone/notebooks/ta_01_packages_functions.py
water_df=pd.read_csv(data_filepath + 'ta_1_uganda_water_df_clean.csv', index_col=0)
#check
water_df.info()
conflict_df=pd.read_csv(data_filepath +'ta_3_conflict_df_clean.csv', index_col=0)
#check
conflict_df.info()
demographic_df=pd.read_csv(data_filepath +'ta_2_subcounty_demographic_clean.csv', index_col=0)
#check
demographic_df.info()
#merge water points with sub-county demographics on the cleaned sub-county name
master_df=pd.merge(water_df, demographic_df, how='left',\
                   left_on=water_df['clean_adm4'],\
                   right_on=demographic_df['clean_adm4'], suffixes=['x', 'y'])
#check
master_df.info()
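Note that passing Series (rather than column names) to left_on/right_on makes pandas materialise the join key as an extra key_0 column in the result, which lands at the front of the frame; this is why we delete the first column before each subsequent merge below. A minimal toy illustration, with hypothetical frames:
#passing Series to left_on/right_on adds the join key back as a 'key_0' column
left = pd.DataFrame({'k': ['a', 'b'], 'v': [1, 2]})
right = pd.DataFrame({'k': ['a', 'b'], 'w': [3, 4]})
merged = pd.merge(left, right, left_on=left['k'], right_on=right['k'])
print(merged.columns.tolist())  #includes 'key_0' alongside 'k_x', 'k_y', 'v', 'w'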
We then want to know how violent and unstable a region is. We sum the total number of fatalities and events/conflicts each region has had since 1997 and merge these totals with our water point and demographic data.
conflict_fatal_grouped=conflict_df[['clean_adm4', 'fatalities']].groupby('clean_adm4').sum()
conflict_fatal_grouped.columns=['total_fatalities_adm4']
conflict_fatal_grouped.reset_index(inplace=True)
#check
conflict_fatal_grouped
conflict_events_grouped=conflict_df[['clean_adm4', 'data_id']].groupby(['clean_adm4']).count()
conflict_events_grouped.columns=['total_events_adm4']
conflict_events_grouped.reset_index(inplace=True)
#check
conflict_events_grouped
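As a design note, the two groupbys above could be computed in a single pass with pandas named aggregation; an equivalent sketch (conflict_grouped is a hypothetical name):
#equivalent single-pass aggregation of fatalities and event counts
conflict_grouped = conflict_df.groupby('clean_adm4').agg(
    total_fatalities_adm4=('fatalities', 'sum'),
    total_events_adm4=('data_id', 'count')).reset_index()
We keep the two separate frames here because each is merged into the master dataframe on its own below.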
#drop the key_0 column left over from the previous merge
del master_df[master_df.columns[0]]
#merge water dataset with fatalities dataset
master_df=pd.merge(master_df, conflict_fatal_grouped, how='left',\
                   left_on=master_df['clean_adm4x'],\
                   right_on=conflict_fatal_grouped['clean_adm4'], suffixes=('z', 'w'))
#check
master_df.head()
#drop the key_0 column left over from the previous merge
del master_df[master_df.columns[0]]
#merge water dataset with events dataset
master_df=pd.merge(master_df, conflict_events_grouped, how='left',\
                   left_on=master_df['clean_adm4x'],\
                   right_on=conflict_events_grouped['clean_adm4'], suffixes=('a', 'b'))
#check
master_df.head()
master_df.columns
We keep the columns that may help us predict whether a water point is functioning or not.
master_df_clean = master_df[['wpdx_id',
'lat_deg', 'lon_deg', 'status_id',
'clean_adm1', 'clean_adm2', 'clean_adm3', 'clean_adm4x', 'distance_to_primary',
'distance_to_secondary', 'distance_to_tertiary', 'distance_to_city',
'distance_to_town', 'usage_cap', 'staleness_score', 'cluster_size', 'water_source_clean','install_year',
'management_clean', 'served_population', 'local_population', 'crucialness', 'pressure',
'perc_hh_head_male', 'perc_pop612_primary', 'perc_pop1318_secondary',
'perc_pop18p_illiterate', 'perc_pop017_certificate',
'perc_pop017_both_parents', 'perc_pop2p_disability', 'perc_pop1017_married',
'perc_pop1217_birth', 'perc_pop1464_working', 'perc_pop10p_mobile_phone',
'perc_hh_temp_dwelling', 'perc_hh_mosquito_net', 'perc_hh_piped_water',
'perc_hh_borehole', 'perc_hh_toilet', 'perc_hh_own_house',
'perc_hh_own_tv', 'perc_hh_bank_acc', 'perc_hh_subs_farm',
'perc_hh_less2meals', 'perc_hh_electricity', 'total_fatalities_adm4',
'total_events_adm4'
]]
#check
master_df_clean.info()
print(f'There are {master_df_clean.isna().sum().sum()} null values')
master_df_clean['clean_adm1'].value_counts()
master_df_clean[['distance_to_primary', 'distance_to_secondary', 'distance_to_tertiary', 'distance_to_city']].describe().T
The distances are evidently in metres, as we know that the area of Uganda is 241,000 km². We therefore transform these columns into kilometres to make our analysis and interpretation easier.
distances=['distance_to_primary', 'distance_to_secondary', 'distance_to_tertiary', 'distance_to_city', 'distance_to_town']
#transform into kilometres
master_df_clean[distances]=master_df_clean[distances]/1000
#check
master_df_clean[['distance_to_primary', 'distance_to_secondary', 'distance_to_tertiary', 'distance_to_city']].describe().T
master_df_clean_numerical = master_df_clean.select_dtypes(exclude='object')
# check
master_df_clean_numerical.head()
null_cols = master_df_clean_numerical.isna().sum()
cols_to_impute = null_cols[null_cols > 0].index
cols_to_impute = cols_to_impute.drop(['total_events_adm4','total_fatalities_adm4'])
# check
cols_to_impute
We have missing values because not all regions in our water point dataset match the official administrative regions; water points whose region is not recognisable end up with missing values. Here, we first fill the null values for the demographic data with the median of the lowest-level region we can attribute them to. We start with adm4.
for col in cols_to_impute:
    plt.figure()
    #histplot with kde replaces the deprecated sns.distplot
    sns.histplot(master_df_clean_numerical[col], kde=True, stat='density')
    plt.axvline(master_df_clean_numerical[col].mean(), c='r', label='mean')
    plt.axvline(master_df_clean_numerical[col].median(), c='gold', label='median')
    plt.legend()
    plt.title(f'Density plot of {col}')
    plt.show()
These graphs show that the median is a better choice than the mean, as it is more representative of our dataset: the mean is more sensitive to outliers, so more observations lie close to the median value than to the average value.
for col in cols_to_impute:
    missing_to_regional_median(master_df_clean, col, 'clean_adm4x')
#checking remaining null values
master_df_clean.isna().sum().sum()
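missing_to_regional_median is defined in the %run helper script and is not shown here; a minimal sketch of what we assume it does (fill each null with the column's median computed within the given region, in place):
def missing_to_regional_median(df, col, region_col):
    #assumed behaviour: replace nulls in col with the median of col
    #within each region_col group
    df[col] = df[col].fillna(df.groupby(region_col)[col].transform('median'))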
Since we still have missing values, we fill them with the median of the next-lowest administrative level, adm3. We continue until we do not have any more missing values.
for col in cols_to_impute:
    missing_to_regional_median(master_df_clean, col, 'clean_adm3')
#checking remaining null values
master_df_clean.isna().sum().sum()
for col in cols_to_impute:
    missing_to_regional_median(master_df_clean, col, 'clean_adm2')
#checking remaining null values
master_df_clean.isna().sum().sum()
for col in cols_to_impute:
    missing_to_regional_median(master_df_clean, col, 'clean_adm1')
#checking remaining null values
master_df_clean.isna().sum().sum()
We end up imputing the country's median for the few remaining missing values.
for col in cols_to_impute:
    master_df_clean[col] = master_df_clean[col].fillna(
        master_df_clean[col].median())
# check remaining null values
master_df_clean.isna().sum().sum()
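Taken together, the adm4 → adm3 → adm2 → adm1 → country cascade above could be written as one loop; a compact equivalent, assuming the helper's in-place behaviour sketched earlier:
#equivalent imputation cascade in one loop (sketch)
for level in ['clean_adm4x', 'clean_adm3', 'clean_adm2', 'clean_adm1']:
    for col in cols_to_impute:
        missing_to_regional_median(master_df_clean, col, level)
#any remaining nulls get the country-wide median
for col in cols_to_impute:
    master_df_clean[col] = master_df_clean[col].fillna(master_df_clean[col].median())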
Missing values for fatalities/events essentially mean there were no recorded conflicts or fatalities in that region since 1997. We assume that the conflict dataset we accessed is comprehensive and includes all conflicts since 1997.
master_df_clean.isna().sum()
for col in ['total_fatalities_adm4', 'total_events_adm4']:
    master_df_clean[col] = master_df_clean[col].fillna(0)
# check remaining null values
master_df_clean.isna().sum().sum()
date_converter(master_df_clean, 'install_year')
#converting to year
master_df_clean['install_year']=master_df_clean['install_year'].dt.year
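date_converter also comes from the %run helper script; presumably something like the following sketch, which parses the column to datetime and coerces unparseable entries to NaT:
def date_converter(df, col):
    #assumed behaviour: parse col to datetime in place, coercing bad values to NaT
    df[col] = pd.to_datetime(df[col], errors='coerce')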
#visualising distribution of installation year
plt.figure()
sns.histplot(master_df_clean['install_year'])
plt.axvline(master_df_clean['install_year'].mean(), c='r', label='mean')
plt.axvline(master_df_clean['install_year'].median(), c='gold', label='median')
plt.legend()
plt.title('Distribution of installation year')
plt.show()
We choose to create a binary column for the installation year. All water points constructed in or after 2006 are encoded with a 1, and all before are encoded 0. We choose 2006 because it was Uganda's first multi-party election in 25 years, which we consider an important enough turning point to differentiate between water points. As the distribution above shows, we have a healthy balance of constructions before and after 2006. In addition, water points with a missing installation year are encoded with 0: we assume that if the installation year is unknown, the water point is probably old, dating from when bookkeeping and official records were not as good. We would expect water points constructed after 2006 to be of better quality, due to a more democratic government and better technology.
Ideally we would look at how our binarisation impacts the correlation with our outcome variable. However, because of missing values, install_year and status_id are of different lengths, making that analysis impossible.
master_df_clean['install_year'] = np.where(
    master_df_clean['install_year'] >= 2006, 1, 0)
#unknown years are already 0: NaN >= 2006 evaluates to False, so np.where encodes
#missing years as 0, making this fillna a redundant safeguard
master_df_clean['install_year'] = master_df_clean['install_year'].fillna(0)
#check
master_df_clean['install_year'].value_counts()
We consider entities from the government and public institutions to be a form of "public" management. We also include community management, as we assume communities are in some way related to local governments or governance structures and have similar goals and incentives to the more formal government institutions. Religious, health and medical facilities follow the same reasoning. It will be interesting to see whether public or private management structures are more likely to keep a water point functioning: although their goals are different, their incentives might still make them want to keep a water point working.
public_management=['Other Institutional Management', 'Direct Government Operation', 'Community Management',
'School Management','Religious Institution','Health Care Facility' ]
#encode management as binary: public (1) or not public (0)
master_df_clean['management_clean'] = np.where(
    master_df_clean['management_clean'].isin(public_management), 1, 0)
#check
master_df_clean['management_clean'].value_counts()
master_df_clean['water_source_clean'].value_counts()
We create a binary column identifying whether the water point technology is complex or not, following the analysis by Ravi & Rogger (2021).
complex_tech=['Borehole', 'Piped Water', 'Sand or Sub-surface Dam', 'Packaged Water', 'Delivered Water']
#encode technology as binary: complex (1) or not complex (0)
master_df_clean['water_source_clean'] = np.where(
    master_df_clean['water_source_clean'].isin(complex_tech), 1, 0)
#check
master_df_clean['water_source_clean'].value_counts()
master_df_clean.isna().sum().sum()
master_df_clean.columns
master_df_clean.rename(columns={'status_id': 'is_functioning', 'install_year': 'is_installed_after_2006',
                                'water_source_clean': 'is_complex_tech', 'management_clean': 'is_public_management',
                                'clean_adm4x': 'clean_adm4'}, inplace=True)
We want to calculate what proportion of the local population is being served by the water point.
master_df_clean['perc_local_served']=(master_df_clean['served_population']/master_df_clean['local_population'])*100
#check
master_df_clean[['served_population', 'local_population', 'perc_local_served']].head()
master_df_clean['perc_local_served'].isna().sum()
These nulls arise from 0/0 divisions: the water point is essentially not serving any of the local population. We can safely fill those null values with 0.
master_df_clean['perc_local_served']=master_df_clean['perc_local_served'].fillna(0)
#check
master_df_clean['perc_local_served'].isna().sum()
master_df_clean.info()
print(f"There are {master_df_clean.duplicated().sum()} duplicated rows.")
master_df_clean_final=master_df_clean.drop_duplicates()
print(f"There are now {master_df_clean_final.duplicated().sum()} duplicated rows.")
Our dataset is now ready for EDA, feature selection and modelling. We have no missing values, no duplicate rows, and every column is interpretable and can be modelled.
master_df_clean_final.to_csv(data_filepath + 'ta_4_master_df.csv')
Image(dictionary_filepath+"4A-Master-Dictionary.png")
Image(dictionary_filepath+"4B-Master-Dictionary.png")