from pathlib import Path
#
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
#
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import OLSInfluence
#
from pygam import LinearGAM, s, l
from pygam.datasets import wage
#
import seaborn as sns
import matplotlib.pyplot as plt
#
from dmba import stepwise_selection
from dmba import AIC_score
The DATA directory contains the .csv files used in the examples.
DATA = './'
If you do not keep your data in the same directory as the code, adapt the path names accordingly.
LUNG_CSV = DATA + 'LungDisease.csv'
HOUSE_CSV = DATA + 'house_sales.csv'
lung = pd.read_csv(LUNG_CSV)
print(lung.head())
Scatterplot of Exposure versus PEFR:
lung.plot.scatter(x='Exposure', y='PEFR')
plt.tight_layout()
plt.show()
We can use the LinearRegression model from _Scikit-Learn_.
predictors = ['Exposure']
outcome = 'PEFR'
#
model = LinearRegression()
model.fit(lung[predictors], lung[outcome])
#
print(f'Intercept: {model.intercept_:.3f}')
print(f'Coefficient Exposure: {model.coef_[0]:.3f}')
Plot of the regression line, with the intercept and slope annotated:
fig, ax = plt.subplots(figsize=(4, 4))
ax.set_xlim(0, 23)
ax.set_ylim(295, 450)
ax.set_xlabel('Exposure')
ax.set_ylabel('PEFR')
ax.plot((0, 23), model.predict(pd.DataFrame({'Exposure': [0, 23]})))
ax.text(0.4, model.intercept_, r'$b_0$', size='larger')
#
x = pd.DataFrame({'Exposure': [7.5, 17.5]})
y = model.predict(x)
ax.plot((7.5, 7.5, 17.5), (y[0], y[1], y[1]), '--')
ax.text(5, np.mean(y), r'$\Delta Y$', size='larger')
ax.text(12, y[1] - 10, r'$\Delta X$', size='larger')
ax.text(12, 390, r'$b_1 = \frac{\Delta Y}{\Delta X}$', size='larger')
#
plt.tight_layout()
plt.show()
The predict method of a fitted _Scikit-Learn_ model can be used to predict new data points.
fitted = model.predict(lung[predictors])
residuals = lung[outcome] - fitted
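The same method works for genuinely new data points; a minimal illustration with arbitrary (hypothetical) exposure values:
new_exposures = pd.DataFrame({'Exposure': [5, 15]})  # hypothetical new observations
print(model.predict(new_exposures))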
#
ax = lung.plot.scatter(x='Exposure', y='PEFR', figsize=(4, 4))
ax.plot(lung.Exposure, fitted)
for x, yactual, yfitted in zip(lung.Exposure, lung.PEFR, fitted):
    ax.plot((x, x), (yactual, yfitted), '--', color='C1')
#
plt.tight_layout()
plt.show()
Loading the house sales dataset:
house = pd.read_csv(HOUSE_CSV, sep='\t')
print(house.head())
#
subset_house = [
    'AdjSalePrice',
    'SqFtTotLiving',
    'SqFtLot',
    'Bathrooms',
    'Bedrooms',
    'BldgGrade']
print(house[subset_house].head())
#
outcome_house = 'AdjSalePrice'
#
#
predictors = [
    'SqFtTotLiving',
    'SqFtLot',
    'Bathrooms',
    'Bedrooms',
    'BldgGrade']
#
house_lm = LinearRegression()
house_lm.fit(house[predictors], house[outcome_house])
#
print(f'Intercept: {house_lm.intercept_:.3f}')
print('Coefficients:')
for name, coef in zip(predictors, house_lm.coef_):
    print(f' {name}: {coef}')
_Scikit-Learn_ provides a number of metrics for determining the quality of a model; here we use r2_score and derive RMSE from mean_squared_error.
fitted = house_lm.predict(house[predictors])
RMSE = np.sqrt(mean_squared_error(house[outcome_house], fitted))
r2 = r2_score(house[outcome_house], fitted)
print(f'RMSE: {RMSE:.0f}')
print(f'r2: {r2:.4f}')
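Depending on your _Scikit-Learn_ version, the square root step can be folded into the metric call (squared=False is available from version 0.22; the newest releases provide root_mean_squared_error instead):
print(f'RMSE: {mean_squared_error(house[outcome_house], fitted, squared=False):.0f}')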
While _Scikit-Learn_ provides a variety of different metrics, _statsmodels_ provides a more in-depth analysis of the linear regression model.
This package has two different ways of specifying the model: one that is similar to _Scikit-Learn_, and one that allows you to specify _R_-style formulas.
Here we use the first approach. Since _statsmodels_ doesn't add an intercept automatically, we need to add a constant column with value 1 to the predictors.
We can use the _pandas_ assign method for this.
model = sm.OLS(house[outcome_house], house[predictors].assign(const=1))
results = model.fit()
print(results.summary())
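For comparison, the formula-based interface adds the intercept automatically; the equivalent model would be:
model_formula = smf.ols(
    formula='AdjSalePrice ~ SqFtTotLiving + SqFtLot + Bathrooms + '
            'Bedrooms + BldgGrade',
    data=house)
print(model_formula.fit().summary())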
predictors = [
    'SqFtTotLiving', 'SqFtLot',
    'Bathrooms', 'Bedrooms',
    'BldgGrade', 'PropertyType',
    'NbrLivingUnits', 'SqFtFinBasement',
    'YrBuilt', 'YrRenovated',
    'NewConstruction']
X = pd.get_dummies(house[predictors], drop_first=True)
X['NewConstruction'] = [1 if nc else 0 for nc in X['NewConstruction']]
house_full = sm.OLS(house[outcome_house], X.assign(const=1))
results = house_full.fit()
print(results.summary())
We can use the stepwise_selection method from the _dmba_ package. It requires functions to train a model on a subset of variables and to score the resulting model.
y = house[outcome_house]
#
def train_model(variables):
    # Fit a linear regression on the given subset of variables
    if len(variables) == 0:
        return None
    model = LinearRegression()
    model.fit(X[variables], y)
    return model
#
def score_model(model, variables):
    # Score a model by AIC; the empty model predicts the mean
    if len(variables) == 0:
        return AIC_score(y, [y.mean()] * len(y), model, df=1)
    return AIC_score(y, model.predict(X[variables]), model)
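For reference, a rough sketch of the Gaussian-error AIC that the scoring is based on (dmba's exact degrees-of-freedom bookkeeping may differ):
def aic_gaussian(y_true, y_pred, n_params):
    # AIC = n * log(RSS / n) + 2 * p, up to an additive constant
    resid = np.asarray(y_true) - np.asarray(y_pred)
    rss = np.sum(resid ** 2)
    n = len(resid)
    return n * np.log(rss / n) + 2 * n_params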
#
best_model, best_variables = stepwise_selection(
    X.columns, train_model,
    score_model, verbose=True)
#
print()
print(f'Intercept: {best_model.intercept_:.3f}')
print('Coefficients:')
z = zip(best_variables, best_model.coef_)
for name, coef in z:
    print(f' {name}: {coef}')
We can calculate the year from the date column using a list comprehension or the data frame's apply method.
house['Year'] = [int(date.split('-')[0]) for date in house.DocumentDate]
house['Year'] = house.DocumentDate.apply(lambda d: int(d.split('-')[0]))
house['Weight'] = house.Year - 2005
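A third option for the year, assuming DocumentDate parses cleanly as a date, is the _pandas_ datetime accessor:
house['Year'] = pd.to_datetime(house.DocumentDate).dt.year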
#
predictors = [
'SqFtTotLiving', 'SqFtLot', 'Bathrooms',
'Bedrooms', 'BldgGrade']
#
house_wt = LinearRegression()
house_wt.fit(house[predictors], house[outcome_house], sample_weight=house.Weight)
#
pd.concat([
    pd.DataFrame({
        'predictor': predictors,
        'house_lm': house_lm.coef_,
        'house_wt': house_wt.coef_,
    }),
    pd.DataFrame({
        'predictor': ['intercept'],
        'house_lm': house_lm.intercept_,
        'house_wt': house_wt.intercept_,
    })
])
#
residuals = pd.DataFrame({
    'abs_residual_lm': np.abs(house_lm.predict(house[predictors]) - house[outcome_house]),
    'abs_residual_wt': np.abs(house_wt.predict(house[predictors]) - house[outcome_house]),
    'Year': house['Year'],
})
print(residuals.head())
# axes = residuals.boxplot(['abs_residual_lm', 'abs_residual_wt'], by='Year', figsize=(10, 4))
# axes[0].set_ylim(0, 300000)
#
pd.DataFrame(([year, np.mean(group['abs_residual_lm']), np.mean(group['abs_residual_wt'])]
              for year, group in residuals.groupby('Year')),
             columns=['Year', 'mean abs_residual_lm', 'mean abs_residual_wt'])
# for year, group in residuals.groupby('Year'):
# print(year, np.mean(group['abs_residual_lm']), np.mean(group['abs_residual_wt']))
#
print(house.PropertyType.head())
print(pd.get_dummies(house['PropertyType']).head(6))
# drop_first omits the reference level, avoiding perfect collinearity with an intercept
print(pd.get_dummies(house['PropertyType'], drop_first=True).head(6))
predictors = [
    'SqFtTotLiving', 'SqFtLot',
    'Bathrooms', 'Bedrooms',
    'BldgGrade', 'PropertyType']
#
X = pd.get_dummies(house[predictors], drop_first=True)
#
house_lm_factor = LinearRegression()
house_lm_factor.fit(X, house[outcome_house])
#
print(f'Intercept: {house_lm_factor.intercept_:.3f}')
print('Coefficients:')
for name, coef in zip(X.columns, house_lm_factor.coef_):
    print(f' {name}: {coef}')
print(pd.DataFrame(house['ZipCode'].value_counts()).transpose())
house = pd.read_csv(HOUSE_CSV, sep='\t')
predictors = [
    'SqFtTotLiving',
    'SqFtLot',
    'Bathrooms',
    'Bedrooms',
    'BldgGrade']
#
house_lm = LinearRegression()
house_lm.fit(house[predictors], house[outcome_house])
#
zip_groups = pd.DataFrame([
    *pd.DataFrame({
        'ZipCode': house['ZipCode'],
        'residual': house[outcome_house] - house_lm.predict(house[predictors]),
    })
    .groupby(['ZipCode'])
    .apply(lambda x: {
        'ZipCode': x.iloc[0, 0],
        'count': len(x),
        'median_residual': x.residual.median()
    })
]).sort_values('median_residual')
zip_groups['cum_count'] = np.cumsum(zip_groups['count'])
# qcut on the cumulative count yields five groups with roughly equal numbers of sales
zip_groups['ZipGroup'] = pd.qcut(
    zip_groups['cum_count'], 5,
    labels=False, retbins=False)
zip_groups.head()
print(zip_groups.ZipGroup.value_counts().sort_index())
#
to_join = zip_groups[['ZipCode', 'ZipGroup']].set_index('ZipCode')
house = house.join(to_join, on='ZipCode')
house['ZipGroup'] = house['ZipGroup'].astype('category')
print(house['ZipGroup'])
The results of the stepwise regression are:
print(f'Intercept: {best_model.intercept_:.3f}')
print('Coefficients:')
for name, coef in zip(best_variables, best_model.coef_):
    print(f' {name}: {coef}')
#
predictors = [
    'Bedrooms',
    'BldgGrade',
    'PropertyType',
    'YrBuilt']
#
X = pd.get_dummies(house[predictors], drop_first=True)
#
reduced_lm = LinearRegression()
reduced_lm.fit(X, house[outcome_house])
#
print(f'Intercept: {reduced_lm.intercept_:.3f}')
print('Coefficients:')
for name, coef in zip(X.columns, reduced_lm.coef_):
    print(f' {name}: {coef}')
predictors = [
    'SqFtTotLiving',
    'SqFtLot',
    'Bathrooms',
    'Bedrooms',
    'BldgGrade',
    'PropertyType',
    'ZipGroup']
#
X = pd.get_dummies(house[predictors], drop_first=True)
#
confounding_lm = LinearRegression()
confounding_lm.fit(X, house[outcome_house])
#
print(f'Intercept: {confounding_lm.intercept_:.3f}')
print('Coefficients:')
for name, coef in zip(X.columns, confounding_lm.coef_):
    print(f' {name}: {coef}')
# SqFtTotLiving * ZipGroup expands to SqFtTotLiving + ZipGroup + SqFtTotLiving:ZipGroup
model = smf.ols(formula='AdjSalePrice ~ SqFtTotLiving * ZipGroup + SqFtLot + ' +
                'Bathrooms + Bedrooms + BldgGrade + PropertyType', data=house)
results = model.fit()
print(results.summary())
The _statsmodels_ package has the most developed support for outlier analysis.
house_98105 = house.loc[house['ZipCode'] == 98105, :]
#
predictors = [
    'SqFtTotLiving',
    'SqFtLot',
    'Bathrooms',
    'Bedrooms',
    'BldgGrade']
#
house_outlier = sm.OLS(house_98105[outcome_house], house_98105[predictors].assign(const=1))
result_98105 = house_outlier.fit()
print(result_98105.summary())
The OLSInfluence class is initialized with the results of the OLS regression and gives access to a number of useful properties.
Here we use the internally studentized residuals.
influence = OLSInfluence(result_98105)
sresiduals = influence.resid_studentized_internal
outlier = house_98105.loc[sresiduals.idxmin(), :]
#
print(sresiduals.idxmin(), sresiduals.min())
print(result_98105.resid.loc[sresiduals.idxmin()])
print('AdjSalePrice', outlier[outcome_house])
print(outlier[predictors])
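For intuition: the internally studentized residual divides each residual by its estimated standard deviation, using the leverage (hat value) of each observation. A quick sketch checking this against the _statsmodels_ value:
resid = result_98105.resid
h = influence.hat_matrix_diag
check = resid / np.sqrt(result_98105.mse_resid * (1 - h))
print(np.allclose(check, sresiduals))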
from scipy.stats import linregress
#
np.random.seed(5)
x = np.random.normal(size=25)
y = -x / 5 + np.random.normal(size=25)
x[0] = 8
y[0] = 8
#
def abline(slope, intercept, ax):
    # Calculate coordinates of a line based on slope and intercept
    x_vals = np.array(ax.get_xlim())
    return (x_vals, intercept + slope * x_vals)
#
fig, ax = plt.subplots(figsize=(4, 4))
ax.scatter(x, y)
slope, intercept, _, _, _ = linregress(x, y)
ax.plot(*abline(slope, intercept, ax))
slope, intercept, _, _, _ = linregress(x[1:], y[1:])
ax.plot(*abline(slope, intercept, ax), '--')
ax.set_xlim(-2.5, 8.5)
ax.set_ylim(-2.5, 8.5)
#
plt.tight_layout()
plt.show()
The _statsmodels_ package provides several plots for analyzing the influence of individual data points.
influence = OLSInfluence(result_98105)
fig, ax = plt.subplots(figsize=(5, 5))
ax.axhline(-2.5, linestyle='--', color='C1')
ax.axhline(2.5, linestyle='--', color='C1')
ax.scatter(influence.hat_matrix_diag, influence.resid_studentized_internal,
           s=1000 * np.sqrt(influence.cooks_distance[0]),
           alpha=0.5)
#
ax.set_xlabel('hat values')
ax.set_ylabel('studentized residuals')
#
plt.tight_layout()
plt.show()
mask = [dist < .08 for dist in influence.cooks_distance[0]]
house_infl = house_98105.loc[mask]
ols_infl = sm.OLS(house_infl[outcome_house], house_infl[predictors].assign(const=1))
result_infl = ols_infl.fit()
#df = pd.DataFrame({
# 'Original': result_98105.params,
# 'Influential removed': result_infl.params,
#})
#print(df)
Seaborn's regplot lets us add a lowess smoothing line to the scatterplot.
fig, ax = plt.subplots(figsize=(5, 5))
sns.regplot(
    x=result_98105.fittedvalues,
    y=np.abs(result_98105.resid),
    scatter_kws={'alpha': 0.25},
    line_kws={'color': 'C1'},
    lowess=True, ax=ax)
ax.set_xlabel('predicted')
ax.set_ylabel('abs(residual)')
#
plt.tight_layout()
plt.show()
fig, ax = plt.subplots(figsize=(4, 4))
pd.Series(influence.resid_studentized_internal).hist(ax=ax)
ax.set_xlabel('std. residual')
ax.set_ylabel('Frequency')
#
plt.tight_layout()
plt.show()
The plot_ccpr method:
fig, ax = plt.subplots(figsize=(5, 5))
fig = sm.graphics.plot_ccpr(result_98105, 'SqFtTotLiving', ax=ax)
#
plt.tight_layout()
plt.show()
The plot_ccpr_grid method:
fig = sm.graphics.plot_ccpr_grid(
    result_98105,
    fig=plt.figure(figsize=(8, 12)))
plt.show()
model_poly = smf.ols(formula='AdjSalePrice ~ SqFtTotLiving + np.power(SqFtTotLiving, 2) + ' +
                     'SqFtLot + Bathrooms + Bedrooms + BldgGrade', data=house_98105)
result_poly = model_poly.fit()
print(result_poly.summary())
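The quadratic term can equivalently be written with the formula identity function I():
model_poly_alt = smf.ols(
    formula='AdjSalePrice ~ SqFtTotLiving + I(SqFtTotLiving**2) + '
            'SqFtLot + Bathrooms + Bedrooms + BldgGrade',
    data=house_98105)
print(model_poly_alt.fit().params)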
The _statsmodels_ implementation of partial residual plots works only for linear terms.
Here is an implementation of a partial residual plot that, while inefficient, works for polynomial regression.
def partialResidualPlot(model, df, outcome, feature, ax):
    y_pred = model.predict(df)
    copy_df = df.copy()
    # Zero out every column except the feature of interest
    for c in copy_df.columns:
        if c == feature:
            continue
        copy_df[c] = 0.0
    feature_prediction = model.predict(copy_df)
    results = pd.DataFrame({
        'feature': df[feature],
        'residual': df[outcome] - y_pred,
        'ypartial': feature_prediction - model.params[0],
    })
    results = results.sort_values(by=['feature'])
    smoothed = sm.nonparametric.lowess(results.ypartial, results.feature, frac=1/3)
    #
    ax.scatter(results.feature, results.ypartial + results.residual)
    ax.plot(smoothed[:, 0], smoothed[:, 1], color='gray')
    ax.plot(results.feature, results.ypartial, color='black')
    ax.set_xlabel(feature)
    ax.set_ylabel(f'Residual + {feature} contribution')
    return ax
#
fig, ax = plt.subplots(figsize=(5, 5))
partialResidualPlot(result_poly, house_98105, 'AdjSalePrice', 'SqFtTotLiving', ax)
#
plt.tight_layout()
plt.show()
The fitted coefficient of the quadratic term, result_poly.params[2]:
print(result_poly.params[2])
formula = ('AdjSalePrice ~ bs(SqFtTotLiving, df=6, degree=3) + ' +
           'SqFtLot + Bathrooms + Bedrooms + BldgGrade')
model_spline = smf.ols(formula=formula, data=house_98105)
result_spline = model_spline.fit()
print(result_spline.summary())
Partial residual plot for the spline model:
fig, ax = plt.subplots(figsize=(5, 5))
partialResidualPlot(
    result_spline, house_98105,
    'AdjSalePrice', 'SqFtTotLiving', ax)
plt.tight_layout()
plt.show()
predictors = [
    'SqFtTotLiving', 'SqFtLot', 'Bathrooms',
    'Bedrooms', 'BldgGrade']
#
X = house_98105[predictors].values
y = house_98105[outcome_house]
#
## model
#
gam = LinearGAM(s(0, n_splines=12) + l(1) + l(2) + l(3) + l(4))
gam.gridsearch(X, y)  # search over a grid of smoothing penalties (lam), keeping the best fit
#
gam.summary()
Partial dependence plots for each term of the GAM:
fig, axes = plt.subplots(figsize=(8, 8), ncols=2, nrows=3)
#
titles = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms', 'BldgGrade']
for i, title in enumerate(titles):
    ax = axes[i // 2, i % 2]
    XX = gam.generate_X_grid(term=i)
    ax.plot(XX[:, i], gam.partial_dependence(term=i, X=XX))
    ax.plot(XX[:, i], gam.partial_dependence(term=i, X=XX, width=.95)[1], c='r', ls='--')
    ax.set_title(title)
#
axes[2][1].set_visible(False)
#
plt.tight_layout()
plt.show()
from sklearn.linear_model import Lasso, LassoLars, LassoCV, LassoLarsCV
from sklearn.preprocessing import StandardScaler
#
predictors = [
    'SqFtTotLiving', 'SqFtLot', 'Bathrooms',
    'Bedrooms', 'BldgGrade', 'PropertyType',
    'NbrLivingUnits', 'SqFtFinBasement',
    'YrBuilt', 'YrRenovated',
    'NewConstruction']
#
X = pd.get_dummies(house[predictors], drop_first=True)
X['NewConstruction'] = [1 if nc else 0 for nc in X['NewConstruction']]
#
columns = X.columns
# X = StandardScaler().fit_transform(X * 1.0)
y = house[outcome_house]
#
house_lm = LinearRegression()
print(house_lm.fit(X, y))
#
house_lasso = Lasso(alpha=10)
print(house_lasso.fit(X, y))
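Because the lasso penalty is scale sensitive, predictors are often standardized first (the commented StandardScaler line above hints at this). A minimal sketch using a pipeline:
from sklearn.pipeline import make_pipeline
#
scaled_lasso = make_pipeline(StandardScaler(), Lasso(alpha=10))
scaled_lasso.fit(X, y)
print(scaled_lasso[-1].coef_)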
# Method = LassoLars
# MethodCV = LassoLarsCV
Method = Lasso
MethodCV = LassoCV
#
alpha_values = []
results = []
for alpha in [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000]:
    model = Method(alpha=alpha)
    model.fit(X, y)
    alpha_values.append(alpha)
    results.append(model.coef_)
modelCV = MethodCV(cv=5)
modelCV.fit(X, y)
ax = pd.DataFrame(results, index=alpha_values, columns=columns).plot(logx=True, legend=False)
ax.axvline(modelCV.alpha_)
plt.show()
print(pd.DataFrame({
    'name': columns,
    'coef': modelCV.coef_,
}))
# Intercept: 6177658.144
# Coefficients:
# SqFtTotLiving: 199.27474217544048
# BldgGrade: 137181.13724627026
# YrBuilt: -3564.934870415041
# Bedrooms: -51974.76845567939
# Bathrooms: 42403.059999677665
# PropertyType_Townhouse: 84378.9333363999
# SqFtFinBasement: 7.032178917565108
# PropertyType_Single Family: 22854.87954019308