In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
In [3]:
## Build the correlation matrix and export it to Excel
file = pd.read_excel(r'C:\Users\Ella\Desktop\data challenge\food insecuity.xlsx')
# Drop the non-numeric 'Year' column and cast everything else to float64
df = file.drop(columns=['Year']).astype('float64')
corrmatrix = df.corr()
corrmatrix.to_excel('corr.xlsx')
In [5]:
corr = df.corr()
# Keep only features whose strongest off-diagonal |correlation| is <= 0.9.
# The diagonal (self-correlation == 1) is blanked out so it never triggers the cut.
# NOTE(review): an all-NaN column (e.g. a constant feature) survives this screen,
# because NaN > 0.9 evaluates to False — confirm that is intended.
diag = np.eye(len(corr), dtype=bool)
m = ~(corr.where(~diag).abs() > 0.9).any()
raw = corr.loc[m, m]
In [6]:
## Display the screened correlation matrix (last expression renders as a table)
raw
Out[6]:
Proportion of population under global poverty line Volatility of agricultural production Corruption Urban absorption capacity Protein quality Ability to store food safely Natural Disaster Disbursement
Proportion of population under global poverty line 1.000000 0.482861 NaN 0.572441 -0.427502 -0.813861 0.831172
Volatility of agricultural production 0.482861 1.000000 NaN -0.053258 0.048442 -0.534365 0.504151
Corruption NaN NaN NaN NaN NaN NaN NaN
Urban absorption capacity 0.572441 -0.053258 NaN 1.000000 -0.573563 -0.335449 0.616168
Protein quality -0.427502 0.048442 NaN -0.573563 1.000000 0.258852 -0.249978
Ability to store food safely -0.813861 -0.534365 NaN -0.335449 0.258852 1.000000 -0.483931
Natural Disaster Disbursement 0.831172 0.504151 NaN 0.616168 -0.249978 -0.483931 1.000000
In [7]:
## Linear Model
## Attempt 1: the six predictors kept after the collinearity screen

new = df[['Proportion of population under global poverty line', 
         'Volatility of agricultural production', 
         'Urban absorption capacity', 'Protein quality',  
         ' Ability to store food safely',   # NOTE(review): leading space presumably matches the Excel header — confirm
         'Natural Disaster Disbursement']]

target = df['Prevalence of undernourishment population']

# Fit sklearn on the SAME predictor set as the OLS below (the original fit on a
# mutated full df after `del df[...]`, which both broke re-runs and used a
# different feature set than the summary table).
model = LinearRegression().fit(new, target)

X = np.array(new)
y = np.array(target)
# has_constant expects 'skip' / 'add' / 'raise'; 'add' forces an intercept
# column even if one already exists (True is not a documented value).
X_1 = sm.add_constant(X, has_constant='add')

results = sm.OLS(y, X_1).fit()
# n=8 observations vs 7 parameters — inference is extremely weak (Df Residuals=1)
results.summary()
C:\Users\Ella\anaconda3\lib\site-packages\scipy\stats\stats.py:1604: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=8
  "anyway, n=%i" % int(n))
Out[7]:
OLS Regression Results
Dep. Variable: y R-squared: 0.994
Model: OLS Adj. R-squared: 0.955
Method: Least Squares F-statistic: 25.76
Date: Fri, 23 Oct 2020 Prob (F-statistic): 0.150
Time: 12:43:58 Log-Likelihood: -2.5404
No. Observations: 8 AIC: 19.08
Df Residuals: 1 BIC: 19.64
Df Model: 6
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 187.9201 35.714 5.262 0.120 -265.866 641.706
x1 -2.1813 0.605 -3.606 0.172 -9.868 5.506
x2 -69.5811 62.277 -1.117 0.465 -860.886 721.724
x3 -0.4515 0.712 -0.634 0.640 -9.502 8.599
x4 -1.7872 0.391 -4.566 0.137 -6.761 3.186
x5 -0.6940 0.147 -4.730 0.133 -2.558 1.170
x6 2.4717 0.529 4.672 0.134 -4.251 9.194
Omnibus: 0.767 Durbin-Watson: 2.282
Prob(Omnibus): 0.681 Jarque-Bera (JB): 0.035
Skew: -0.142 Prob(JB): 0.983
Kurtosis: 2.842 Cond. No. 1.31e+04


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.31e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [8]:
## Attempt 2: replace 'Urban absorption capacity' with agriculture expenditure
new2 = df[['Proportion of population under global poverty line', 
         'Volatility of agricultural production','Protein quality',  
         ' Ability to store food safely', 
         'Natural Disaster Disbursement',
         'Agriculture expenditure  in GDP']]   # NOTE(review): double space presumably matches the Excel header — confirm

X = np.array(new2)
y = np.array(target)
# has_constant expects 'skip' / 'add' / 'raise', not a bool; 'add' is the
# intended "always prepend an intercept" behavior.
X_1 = sm.add_constant(X, has_constant='add')

results = sm.OLS(y, X_1).fit()
results.summary()
Out[8]:
OLS Regression Results
Dep. Variable: y R-squared: 0.996
Model: OLS Adj. R-squared: 0.975
Method: Least Squares F-statistic: 47.27
Date: Fri, 23 Oct 2020 Prob (F-statistic): 0.111
Time: 12:44:35 Log-Likelihood: -0.12434
No. Observations: 8 AIC: 14.25
Df Residuals: 1 BIC: 14.80
Df Model: 6
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const -181.9698 285.230 -0.638 0.638 -3806.156 3442.216
x1 0.6203 2.159 0.287 0.822 -26.816 28.056
x2 -140.0642 84.192 -1.664 0.345 -1209.829 929.701
x3 2.6889 3.469 0.775 0.580 -41.392 46.770
x4 -0.0153 0.519 -0.030 0.981 -6.604 6.574
x5 -1.1174 2.708 -0.413 0.751 -35.520 33.285
x6 1.9194 1.534 1.251 0.429 -17.578 21.417
Omnibus: 1.103 Durbin-Watson: 2.686
Prob(Omnibus): 0.576 Jarque-Bera (JB): 0.001
Skew: 0.025 Prob(JB): 0.999
Kurtosis: 3.025 Cond. No. 9.12e+04


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 9.12e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [9]:
## Fitted coefficients for each predictor: intercept first, then x1..x6
## in the column order of the design matrix above.
results.params
Out[9]:
array([-1.81969817e+02,  6.20252060e-01, -1.40064213e+02,  2.68887021e+00,
       -1.53245029e-02, -1.11743466e+00,  1.91939763e+00])