from pathlib import Path
import pandas as pd
import numpy as np
#
from sklearn.naive_bayes import MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression #, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.metrics import roc_curve, accuracy_score, roc_auc_score
#
import statsmodels.api as sm
#
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from pygam import LinearGAM, s, f, l
#
from dmba import classificationSummary
#
import seaborn as sns
import matplotlib.pyplot as plt
The DATA directory contains the .csv files used in the examples.
DATA = './'
If you don't keep your data in the same directory as the code, adapt the path names accordingly.
LOAN3000_CSV = DATA + 'loan3000.csv'
LOAN_DATA_CSV = DATA + 'loan_data.csv.gz'
FULL_TRAIN_SET_CSV = DATA + 'full_train_set.csv.gz'
We load the loan data set.
loan_data = pd.read_csv(LOAN_DATA_CSV)
print(loan_data.head())
We convert the data to categorical.
loan_data.outcome = loan_data.outcome.astype('category')
loan_data.outcome = loan_data.outcome.cat.reorder_categories(['paid off', 'default'])
loan_data.purpose_ = loan_data.purpose_.astype('category')
loan_data.home_ = loan_data.home_.astype('category')
loan_data.emp_len_ = loan_data.emp_len_.astype('category')
We fit a naive Bayes model and compute the probabilities.
predictors = ['purpose_', 'home_', 'emp_len_']
outcome = 'outcome'
X = pd.get_dummies(loan_data[predictors], prefix='', prefix_sep='')
y = loan_data[outcome]
#
# naive_model = MultinomialNB(alpha=0.01, fit_prior=True)  # alternative settings
naive_model = MultinomialNB(alpha=1e-10, fit_prior=False)
naive_model.fit(X, y)
#
new_loan = X.loc[146:146, :]
print('predicted class: ', naive_model.predict(new_loan)[0])
#
probabilities = pd.DataFrame(
    naive_model.predict_proba(new_loan),
    columns=naive_model.classes_)
print('predicted probabilities')
print(probabilities)
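Not in the book: a minimal hand cross-check of the posterior for record 146, assuming uniform class priors (matching fit_prior=False) and negligible smoothing; the per-variable conditional frequencies are multiplied and renormalized, so the result should be close to the model's output.
query = loan_data.loc[146, predictors]
posterior = pd.Series(1.0, index=loan_data.outcome.cat.categories)
for column in predictors:
    # P(category | class), estimated from the raw counts
    cond = (loan_data.groupby('outcome', observed=False)[column]
            .value_counts(normalize=True))
    posterior = posterior * cond.xs(query[column], level=column)
print(posterior / posterior.sum())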
A simple example:
loan3000 = pd.read_csv(LOAN3000_CSV)
print(loan3000.head())
We convert the outcome column to categorical.
loan3000.outcome = loan3000.outcome.astype('category')
print(loan3000.head())
We compute the LDA for the loan data.
predictors = ['borrower_score', 'payment_inc_ratio']
outcome = 'outcome'
#
X = loan3000[predictors]
y = loan3000[outcome]
#
loan_lda = LinearDiscriminantAnalysis()
loan_lda.fit(X, y)
print(pd.DataFrame(loan_lda.scalings_, index=X.columns))
#
pred = pd.DataFrame(
    loan_lda.predict_proba(loan3000[predictors]),
    columns=loan_lda.classes_)
print(pred.head())
Figure 5-1: use the scalings and the center of the means to determine the decision boundary.
center = np.mean(loan_lda.means_, axis=0)
slope = - loan_lda.scalings_[0] / loan_lda.scalings_[1]
intercept = center[1] - center[0] * slope
#
# borrower_score for payment_inc_ratio of 0 and 20
#
x_0 = (0 - intercept) / slope
x_20 = (20 - intercept) / slope
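Not in the book: a quick sanity check; points on the derived line should get a posterior probability close to 0.5 from the fitted LDA (exactly 0.5 only when the class priors are equal). The two borrower_score values below are arbitrary.
m, b = float(slope[0]), float(intercept[0])
check = pd.DataFrame({
    'borrower_score': [0.4, 0.6],
    'payment_inc_ratio': [b + m * 0.4, b + m * 0.6]})
print(loan_lda.predict_proba(check))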
#
lda_df = pd.concat([loan3000, pred['default']], axis=1)
print(lda_df.head())
Figure:
fig, ax = plt.subplots(figsize=(4, 4))
g = sns.scatterplot(
    x='borrower_score', y='payment_inc_ratio',
    hue='default', data=lda_df,
    palette=sns.diverging_palette(240, 10, n=9, as_cmap=True),
    ax=ax, legend=False)
#
ax.set_ylim(0, 20)
ax.set_xlim(0.15, 0.8)
ax.plot((x_0, x_20), (0, 20), linewidth=3)  # decision boundary
ax.plot(*loan_lda.means_.transpose())  # line connecting the class means
#
plt.tight_layout()
plt.show()
We plot the logit function, logit(p) = log(p / (1 - p)), and the odds, p / (1 - p), for probabilities between 0.01 and 0.99.
p = np.arange(0.01, 1, 0.01)
df = pd.DataFrame({
    'p': p,
    'logit': np.log(p / (1 - p)),
    'odds': p / (1 - p),
})
#
fig, ax = plt.subplots(figsize=(3, 3))
ax.axhline(0, color='grey', linestyle='--')
ax.axvline(0.5, color='grey', linestyle='--')
ax.plot(df['p'], df['logit'])
ax.set_xlabel('Probability')
ax.set_ylabel('logit(p)')
#
plt.tight_layout()
plt.show()
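Not in the book: the inverse of the logit is the logistic (sigmoid) function, 1 / (1 + exp(-logit)); a quick check that it recovers the original probabilities.
print(np.allclose(1 / (1 + np.exp(-df['logit'])), df['p']))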
The Scikit-Learn package has the specialized class LogisticRegression for logistic regression.
Statsmodels has a more general method based on the generalized linear model (GLM).
predictors = [
    'payment_inc_ratio',
    'purpose_',
    'home_',
    'emp_len_',
    'borrower_score']
#
outcome = 'outcome'
#
X = pd.get_dummies(
    loan_data[predictors],
    prefix='',
    prefix_sep='',
    drop_first=True)
y = loan_data[outcome] # .cat.categories
#
logit_reg = LogisticRegression(
    penalty='l2',
    C=1e42,  # a very large C effectively disables the L2 penalty
    solver='liblinear')
#
logit_reg.fit(X, y)
#
print('intercept ', logit_reg.intercept_[0])
print('classes', logit_reg.classes_)
print(pd.DataFrame({'coeff': logit_reg.coef_[0]}, index=X.columns))
Note that the intercept and the coefficients are inverted compared with the R model.
print(loan_data['purpose_'].cat.categories)
print(loan_data['home_'].cat.categories)
print(loan_data['emp_len_'].cat.categories)
Not in the book:
If you have a feature or an outcome variable that is ordinal, use Scikit-Learn's OrdinalEncoder to replace the categories (here, 'paid off' and 'default') with numbers.
In the code below, we replace 'paid off' with 0 and 'default' with 1.
This reverses the order of the predicted classes and, as a consequence, inverts the coefficients.
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder(categories=[['paid off', 'default']])
y_enc = enc.fit_transform(loan_data[[outcome]]).ravel()
#
logit_reg_enc = LogisticRegression(
    penalty='l2',
    C=1e42,
    solver='liblinear')
logit_reg_enc.fit(X, y_enc)
#
print('intercept ', logit_reg_enc.intercept_[0])
print('classes', logit_reg_enc.classes_)
print(pd.DataFrame({'coeff': logit_reg_enc.coef_[0]}, index=X.columns))
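Not in the book: a quick check that the two fits agree up to sign; the tolerance is loose because liblinear stops at a finite optimization tolerance, so the two solutions match only approximately.
print(np.allclose(logit_reg.coef_, -logit_reg_enc.coef_, atol=1e-3))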
pred = pd.DataFrame(
    logit_reg.predict_log_proba(X),
    columns=logit_reg.classes_)
print('predict_log_proba: ', pred.describe())
#
pred = pd.DataFrame(
    logit_reg.predict_proba(X),
    columns=logit_reg.classes_)
print('predict_proba: ', pred.describe())
fig, ax = plt.subplots(figsize=(3, 3))
ax.plot(df['logit'], df['odds'])
ax.set_xlabel('log(odds)')
ax.set_ylabel('odds')
ax.set_xlim(0, 5.1)
ax.set_ylim(-5, 105)
#
plt.tight_layout()
plt.show()
For comparison, here is the GLM model using the statsmodels module.
This method requires that the outcome be mapped to numbers.
We use GLM (generalized linear model) with the binomial family to fit a logistic regression.
y_numbers = [1 if yi == 'default' else 0 for yi in y]
logit_reg_sm = sm.GLM(
    y_numbers, X.assign(const=1),
    family=sm.families.Binomial())
logit_result = logit_reg_sm.fit()
print("logit_reg_sm: ", logit_result.summary())
import statsmodels.formula.api as smf
formula = ('outcome ~ bs(payment_inc_ratio, df=8) + purpose_ + ' +
           'home_ + emp_len_ + bs(borrower_score, df=3)')
model = smf.glm(formula=formula, data=loan_data, family=sm.families.Binomial())
results = model.fit()
print(results.summary())
Figure:
from statsmodels.genmod.generalized_linear_model import GLMResults
def partialResidualPlot(model, df, outcome, feature, fig, ax):
    y_actual = [0 if s == 'default' else 1 for s in df[outcome]]
    y_pred = model.predict(df)
    org_params = model.params.copy()
    zero_params = model.params.copy()
    # set model parameters of other features to 0
    for i, name in enumerate(zero_params.index):
        if feature in name:
            continue
        zero_params.iloc[i] = 0.0
    model.initialize(model.model, zero_params)
    feature_prediction = model.predict(df)
    ypartial = -np.log(1 / feature_prediction - 1)
    ypartial = ypartial - np.mean(ypartial)
    model.initialize(model.model, org_params)
    results = pd.DataFrame({
        'feature': df[feature],
        'residual': -2 * (y_actual - y_pred),
        'ypartial': ypartial / 2,
    })
    results = results.sort_values(by=['feature'])
    #
    ax.scatter(results.feature, results.residual, marker='.', s=72. / fig.dpi)
    ax.plot(results.feature, results.ypartial, color='black')
    ax.set_xlabel(feature)
    ax.set_ylabel(f'Residual + {feature} contribution')
    return ax
#
fig, ax = plt.subplots(figsize=(5, 5))
partialResidualPlot(results, loan_data, 'outcome', 'payment_inc_ratio', fig, ax)
ax.set_xlim(0, 25)
ax.set_ylim(-2.5, 2.5)
#
plt.tight_layout()
plt.show()
pred = logit_reg.predict(X)
pred_y = pred == 'default'
true_y = y == 'default'
true_pos = true_y & pred_y
true_neg = ~true_y & ~pred_y
false_pos = ~true_y & pred_y
false_neg = true_y & ~pred_y
#
conf_mat = pd.DataFrame(
    [[np.sum(true_pos), np.sum(false_neg)],
     [np.sum(false_pos), np.sum(true_neg)]],
    index=['Y = default', 'Y = paid off'],
    columns=['Yhat = default', 'Yhat = paid off'])
print(conf_mat)
#
print(confusion_matrix(y, logit_reg.predict(X)))
The _dmba_ package contains the function `classificationSummary`, which prints the confusion matrix and accuracy for a classification model.
classificationSummary(
    y,
    logit_reg.predict(X),
    class_names=logit_reg.classes_)
The _Scikit-Learn_ function `precision_recall_fscore_support` returns precision, recall, fbeta_score, and support.
conf_mat = confusion_matrix(y, logit_reg.predict(X))
#
print('Precision', conf_mat[0, 0] / sum(conf_mat[:, 0]))
print('Recall', conf_mat[0, 0] / sum(conf_mat[0, :]))
print('Specificity', conf_mat[1, 1] / sum(conf_mat[1, :]))
#
prfs = precision_recall_fscore_support(
    y,
    logit_reg.predict(X),
    labels=['default', 'paid off'])
print(prfs)
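Not in the book: the raw tuple is easier to read when arranged as a small table, one row per class in the order of the labels above.
metrics = pd.DataFrame(
    dict(zip(['precision', 'recall', 'fscore', 'support'], prfs)),
    index=['default', 'paid off'])
print(metrics)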
The `roc_curve` function in _Scikit-Learn_ computes all the information required to plot a ROC curve.
fpr, tpr, thresholds = roc_curve(
    y,
    logit_reg.predict_proba(X)[:, 0],
    pos_label='default')
roc_df = pd.DataFrame({'recall': tpr, 'specificity': 1 - fpr})
ax = roc_df.plot(x='specificity', y='recall', figsize=(4, 4), legend=False)
ax.set_ylim(0, 1)
ax.set_xlim(1, 0)
ax.plot((1, 0), (0, 1))
ax.set_xlabel('specificity')
ax.set_ylabel('recall')
plt.tight_layout()
plt.show()
Accuracy can easily be calculated with the _Scikit-Learn_ function `accuracy_score`. The AUC below is computed in two ways: by numerically integrating the ROC curve and with `roc_auc_score`.
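Not in the book: `accuracy_score` applied to the fitted model, as mentioned above.
print('accuracy', accuracy_score(y, logit_reg.predict(X)))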
print(np.sum(roc_df.recall[:-1] * np.diff(1 - roc_df.specificity)))
print(roc_auc_score([1 if yi == 'default' else 0 for yi in y], logit_reg.predict_proba(X)[:, 0]))
fpr, tpr, thresholds = roc_curve(y, logit_reg.predict_proba(X)[:, 0],
                                 pos_label='default')
roc_df = pd.DataFrame({'recall': tpr, 'specificity': 1 - fpr})
ax = roc_df.plot(x='specificity', y='recall', figsize=(4, 4), legend=False)
ax.set_ylim(0, 1)
ax.set_xlim(1, 0)
# ax.plot((1, 0), (0, 1))
ax.set_xlabel('specificity')
ax.set_ylabel('recall')
ax.fill_between(roc_df.specificity, 0, roc_df.recall, alpha=0.3)
plt.tight_layout()
plt.show()
Both computations yield essentially the same AUC, approximately 0.6917; the numerical and model-based results agree.
full_train_set = pd.read_csv(FULL_TRAIN_SET_CSV)
print(full_train_set.shape)
#
print(
    'percentage of loans in default: ',
    100 * np.mean(full_train_set.outcome == 'default'))
#
predictors = [
    'payment_inc_ratio', 'purpose_', 'home_', 'emp_len_',
    'dti', 'revol_bal', 'revol_util']
outcome = 'outcome'
X = pd.get_dummies(
    full_train_set[predictors], prefix='',
    prefix_sep='', drop_first=True)
y = full_train_set[outcome]
full_model = LogisticRegression(
    penalty='l2', C=1e42, solver='liblinear')
full_model.fit(X, y)
print(
    'percentage of loans predicted to default: ',
    100 * np.mean(full_model.predict(X) == 'default'))
print(
    np.mean(full_train_set.outcome == 'default') /
    np.mean(full_model.predict(X) == 'default'))
default_wt = 1 / np.mean(full_train_set.outcome == 'default')
wt = [default_wt if outcome == 'default' else 1 for outcome in full_train_set.outcome]
#
full_model = LogisticRegression(penalty="l2", C=1e42, solver='liblinear')
full_model.fit(X, y, wt)
#
print(
    'percentage of loans predicted to default (weighting): ',
    100 * np.mean(full_model.predict(X) == 'default'))
The _imbalanced-learn_ package provides an implementation of SMOTE and similar algorithms.
X_resampled, y_resampled = SMOTE().fit_resample(X, y)
print('percentage of loans in default (SMOTE resampled): ',
      100 * np.mean(y_resampled == 'default'))
full_model = LogisticRegression(penalty="l2", C=1e42, solver='liblinear')
full_model.fit(X_resampled, y_resampled)
print('percentage of loans predicted to default (SMOTE): ',
      100 * np.mean(full_model.predict(X) == 'default'))
X_resampled, y_resampled = ADASYN().fit_resample(X, y)
print('percentage of loans in default (ADASYN resampled): ',
      100 * np.mean(y_resampled == 'default'))
full_model = LogisticRegression(penalty="l2", C=1e42, solver='liblinear')
full_model.fit(X_resampled, y_resampled)
print('percentage of loans predicted to default (ADASYN): ',
      100 * np.mean(full_model.predict(X) == 'default'))
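Not in the book: BorderlineSMOTE, imported above but not used so far, follows the same fit_resample interface and can be swapped in the same way.
X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X, y)
print('percentage of loans in default (BorderlineSMOTE resampled): ',
      100 * np.mean(y_resampled == 'default'))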