In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
In [3]:
## Build the correlation matrix and export it to Excel
file = pd.read_excel(r'C:\Users\Ella\Desktop\data challenge\food insecuity.xlsx')
# Drop the non-numeric 'Year' column and cast everything else to float64
df = file.drop(columns=['Year']).astype('float64')
corrmatrix = df.corr()
corrmatrix.to_excel('corr.xlsx')
In [5]:
corr = df.corr()
# Keep only features whose strongest off-diagonal |correlation| is <= 0.9.
# The diagonal (self-correlation == 1) is blanked out so it never triggers the cut.
# NOTE(review): an all-NaN column (e.g. a constant feature) survives this screen,
# because NaN > 0.9 evaluates to False — confirm that is intended.
diag = np.eye(len(corr), dtype=bool)
m = ~(corr.where(~diag).abs() > 0.9).any()
raw = corr.loc[m, m]
In [6]:
## Display the screened correlation matrix (last expression renders as a table)
raw
Out[6]:
Proportion of population under global poverty line Volatility of agricultural production Corruption Urban absorption capacity Protein quality Ability to store food safely Natural Disaster Disbursement
Proportion of population under global poverty line 1.000000 0.482861 NaN 0.572441 -0.427502 -0.813861 0.831172
Volatility of agricultural production 0.482861 1.000000 NaN -0.053258 0.048442 -0.534365 0.504151
Corruption NaN NaN NaN NaN NaN NaN NaN
Urban absorption capacity 0.572441 -0.053258 NaN 1.000000 -0.573563 -0.335449 0.616168
Protein quality -0.427502 0.048442 NaN -0.573563 1.000000 0.258852 -0.249978
Ability to store food safely -0.813861 -0.534365 NaN -0.335449 0.258852 1.000000 -0.483931
Natural Disaster Disbursement 0.831172 0.504151 NaN 0.616168 -0.249978 -0.483931 1.000000
In [7]:
## Linear Model
## Attempt 1: the six predictors kept after the collinearity screen

new = df[['Proportion of population under global poverty line', 
         'Volatility of agricultural production', 
         'Urban absorption capacity', 'Protein quality',  
         ' Ability to store food safely',   # NOTE(review): leading space presumably matches the Excel header — confirm
         'Natural Disaster Disbursement']]

target = df['Prevalence of undernourishment population']

# Fit sklearn on the SAME predictor set as the OLS below (the original fit on a
# mutated full df after `del df[...]`, which both broke re-runs and used a
# different feature set than the summary table).
model = LinearRegression().fit(new, target)

X = np.array(new)
y = np.array(target)
# has_constant expects 'skip' / 'add' / 'raise'; 'add' forces an intercept
# column even if one already exists (True is not a documented value).
X_1 = sm.add_constant(X, has_constant='add')

results = sm.OLS(y, X_1).fit()
# n=8 observations vs 7 parameters — inference is extremely weak (Df Residuals=1)
results.summary()
C:\Users\Ella\anaconda3\lib\site-packages\scipy\stats\stats.py:1604: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=8
  "anyway, n=%i" % int(n))
Out[7]:
OLS Regression Results
Dep. Variable: y R-squared: 0.994
Model: OLS Adj. R-squared: 0.955
Method: Least Squares F-statistic: 25.76
Date: Fri, 23 Oct 2020 Prob (F-statistic): 0.150
Time: 12:43:58 Log-Likelihood: -2.5404
No. Observations: 8 AIC: 19.08
Df Residuals: 1 BIC: 19.64
Df Model: 6
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 187.9201 35.714 5.262 0.120 -265.866 641.706
x1 -2.1813 0.605 -3.606 0.172 -9.868 5.506
x2 -69.5811 62.277 -1.117 0.465 -860.886 721.724
x3 -0.4515 0.712 -0.634 0.640 -9.502 8.599
x4 -1.7872 0.391 -4.566 0.137 -6.761 3.186
x5 -0.6940 0.147 -4.730 0.133 -2.558 1.170
x6 2.4717 0.529 4.672 0.134 -4.251 9.194
Omnibus: 0.767 Durbin-Watson: 2.282
Prob(Omnibus): 0.681 Jarque-Bera (JB): 0.035
Skew: -0.142 Prob(JB): 0.983
Kurtosis: 2.842 Cond. No. 1.31e+04


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.31e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [8]:
## Attempt 2: replace 'Urban absorption capacity' with agriculture expenditure
new2 = df[['Proportion of population under global poverty line', 
         'Volatility of agricultural production','Protein quality',  
         ' Ability to store food safely', 
         'Natural Disaster Disbursement',
         'Agriculture expenditure  in GDP']]   # NOTE(review): double space presumably matches the Excel header — confirm

X = np.array(new2)
y = np.array(target)
# has_constant expects 'skip' / 'add' / 'raise', not a bool; 'add' is the
# intended "always prepend an intercept" behavior.
X_1 = sm.add_constant(X, has_constant='add')

results = sm.OLS(y, X_1).fit()
results.summary()
Out[8]:
OLS Regression Results
Dep. Variable: y R-squared: 0.996
Model: OLS Adj. R-squared: 0.975
Method: Least Squares F-statistic: 47.27
Date: Fri, 23 Oct 2020 Prob (F-statistic): 0.111
Time: 12:44:35 Log-Likelihood: -0.12434
No. Observations: 8 AIC: 14.25
Df Residuals: 1 BIC: 14.80
Df Model: 6
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const -181.9698 285.230 -0.638 0.638 -3806.156 3442.216
x1 0.6203 2.159 0.287 0.822 -26.816 28.056
x2 -140.0642 84.192 -1.664 0.345 -1209.829 929.701
x3 2.6889 3.469 0.775 0.580 -41.392 46.770
x4 -0.0153 0.519 -0.030 0.981 -6.604 6.574
x5 -1.1174 2.708 -0.413 0.751 -35.520 33.285
x6 1.9194 1.534 1.251 0.429 -17.578 21.417
Omnibus: 1.103 Durbin-Watson: 2.686
Prob(Omnibus): 0.576 Jarque-Bera (JB): 0.001
Skew: 0.025 Prob(JB): 0.999
Kurtosis: 3.025 Cond. No. 9.12e+04


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 9.12e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [9]:
## Fitted coefficients for each predictor: intercept first, then x1..x6
## in the column order of the design matrix above.
results.params
Out[9]:
array([-1.81969817e+02,  6.20252060e-01, -1.40064213e+02,  2.68887021e+00,
       -1.53245029e-02, -1.11743466e+00,  1.91939763e+00])