import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
## Create the correlation matrix
# Load the raw spreadsheet. NOTE(review): the path is absolute and
# machine-specific; the script only runs where this file exists.
file = pd.read_excel(r'C:\Users\Ella\Desktop\data challenge\food insecuity.xlsx')

# Work on a separate frame: drop the 'Year' column and force every remaining
# column to float64 so that all of them take part in the correlation.
# `file` itself is left unmodified.
df = file.drop(columns='Year').astype('float64')

# Pairwise correlation between all columns, computed once and saved to disk.
corrmatrix = df.corr()
corrmatrix.to_excel('corr.xlsx')

# Same matrix under the name used by the filtering step below (the original
# recomputed df.corr() here; neither name is mutated afterwards).
corr = corrmatrix

# Blank out the diagonal (each variable's correlation with itself) so that
# only correlations between *different* variables are tested.
off_diagonal = corr.mask(np.eye(len(corr), dtype=bool))

# m is True for variables with no off-diagonal |correlation| above 0.9.
m = ~(off_diagonal.abs() > 0.9).any()

# Correlation sub-matrix restricted to those variables.
raw = corr.loc[m, m]
# A bare `raw` expression shows nothing when run as a script; print it.
print(raw)
## Linear Model
## Attempt 1:
# Predictor subset for the first OLS fit. The column labels must match the
# spreadsheet headers exactly (note the leading space in
# ' Ability to store food safely').
new = df[['Proportion of population under global poverty line',
          'Volatility of agricultural production',
          'Urban absorption capacity', 'Protein quality',
          ' Ability to store food safely',
          'Natural Disaster Disbursement']]

# Response variable. It is removed from df so that df holds predictors only;
# `new` was selected above as a separate frame and does not contain it.
target_column = 'Prevalence of undernourishment population'
target = df[target_column]
del df[target_column]

# scikit-learn fit on *all* remaining columns of df.
# NOTE(review): `model` is not used again in the visible part of the script.
model = LinearRegression().fit(df, target)

# statsmodels OLS on the selected subset only.
X = np.array(new)
y = np.array(target)
# has_constant takes 'skip', 'raise' or 'add'; 'add' always prepends the
# intercept column, which is what the previous `True` value did in effect.
X_1 = sm.add_constant(X, has_constant='add')
results = sm.OLS(y, X_1).fit()

# The arrays carry no column names, so pass them explicitly; otherwise the
# summary labels the regressors x1..x6. 'const' comes first because
# add_constant prepends the intercept column by default.
print(results.summary(xname=['const', *new.columns], yname=target.name))
## Attempt 2:
# Second predictor subset: compared with Attempt 1 it leaves out
# 'Urban absorption capacity' and adds 'Agriculture expenditure in GDP'.
# The column labels must match the spreadsheet headers exactly (note the
# leading space in ' Ability to store food safely').
new2 = df[['Proportion of population under global poverty line',
           'Volatility of agricultural production', 'Protein quality',
           ' Ability to store food safely',
           'Natural Disaster Disbursement',
           'Agriculture expenditure in GDP']]

X = np.array(new2)
y = np.array(target)
# has_constant takes 'skip', 'raise' or 'add'; 'add' always prepends the
# intercept column, which is what the previous `True` value did in effect.
X_1 = sm.add_constant(X, has_constant='add')
results = sm.OLS(y, X_1).fit()

# Labels for the fitted parameters: the intercept comes first because
# add_constant prepends it by default, followed by the predictors in order.
param_names = ['const', *new2.columns]

# The arrays carry no column names, so pass them explicitly; otherwise the
# summary labels the regressors x1..x6.
print(results.summary(xname=param_names, yname=target.name))

## Print the coefficient of each predictor, labelled by name:
print(pd.Series(results.params, index=param_names))